libx264
Changes of Revision 10
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Aug 5 13:04:18 UTC 2015 - idonmez@suse.com
+
+- update to 20150804 snapshot
+
+-------------------------------------------------------------------
 Sun Mar 1 09:33:42 UTC 2015 - i@margueirte.su
 
 - update version 20141218
libx264.spec
Changed
@@ -16,8 +16,8 @@
 #
-%define soname 142
-%define svn 20141218
+%define soname 148
+%define svn 20150804
 Name: libx264
 Version: 0.%{soname}svn%{svn}
 Release: 0
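For reference: with these defines the Version tag expands to 0.148svn20150804, so the RPM version now carries both the new library soversion (148) and the date of the snapshot (20150804) being packaged.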
x264-snapshot-20141218-2245.tar.bz2/extras/gas-preprocessor.pl
Deleted
@@ -1,253 +0,0 @@ -#!/usr/bin/env perl -# by David Conrad -# This code is licensed under GPLv2 or later; go to gnu.org to read it -# (not that it much matters for an asm preprocessor) -# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" -use strict; - -# Apple's gas is ancient and doesn't support modern preprocessing features like -# .rept and has ugly macro syntax, among other things. Thus, this script -# implements the subset of the gas preprocessor used by x264 and ffmpeg -# that isn't supported by Apple's gas. - -my @gcc_cmd = @ARGV; -my @preprocess_c_cmd; - -if (grep /\.c$/, @gcc_cmd) { - # C file (inline asm?) - compile - @preprocess_c_cmd = (@gcc_cmd, "-S"); -} elsif (grep /\.S$/, @gcc_cmd) { - # asm file, just do C preprocessor - @preprocess_c_cmd = (@gcc_cmd, "-E"); -} else { - die "Unrecognized input filetype"; -} -@gcc_cmd = map { /\.[cS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; -@preprocess_c_cmd = map { /\.o$/ ? "-" : $_ } @preprocess_c_cmd; - -open(ASMFILE, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; - -my $current_macro = ''; -my %macro_lines; -my %macro_args; -my %macro_args_default; - -my @pass1_lines; - -# pass 1: parse .macro -# note that the handling of arguments is probably overly permissive vs. gas -# but it should be the same for valid cases -while (<ASMFILE>) { - # comment out unsupported directives - s/\.type/@.type/x; - s/\.func/@.func/x; - s/\.endfunc/@.endfunc/x; - s/\.ltorg/@.ltorg/x; - s/\.size/@.size/x; - s/\.fpu/@.fpu/x; - - # the syntax for these is a little different - s/\.global/.globl/x; - # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const - s/(.*)\.rodata/.const_data/x; - s/\.int/.long/x; - s/\.float/.single/x; - - # catch unknown section names that aren't mach-o style (with a comma) - if (/.section ([^,]*)$/) { - die ".section $1 unsupported; figure out the mach-o section name and add it"; - } - - # macros creating macros is not handled (is that valid?) - if (/\.macro\s+([\d\w\.]+)\s*(.*)/) { - $current_macro = $1; - - # commas in the argument list are optional, so only use whitespace as the separator - my $arglist = $2; - $arglist =~ s/,/ /g; - - my @args = split(/\s+/, $arglist); - foreach my $i (0 .. $#args) { - my @argpair = split(/=/, $args[$i]); - $macro_args{$current_macro}[$i] = $argpair[0]; - $argpair[0] =~ s/:vararg$//; - $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; - } - # ensure %macro_lines has the macro name added as a key - $macro_lines{$current_macro} = []; - } elsif (/\.endm/) { - if (!$current_macro) { - die "ERROR: .endm without .macro"; - } - $current_macro = ''; - } elsif ($current_macro) { - push(@{$macro_lines{$current_macro}}, $_); - } else { - expand_macros($_); - } -} - -sub expand_macros { - my $line = @_[0]; - if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { - push(@pass1_lines, $1); - my $macro = $2; - - # commas are optional here too, but are syntactically important because - # parameters can be blank - my @arglist = split(/,/, $3); - my @args; - foreach (@arglist) { - my @whitespace_split = split(/\s+/, $_); - if (!@whitespace_split) { - push(@args, ''); - } else { - foreach (@whitespace_split) { - if (length($_)) { - push(@args, $_); - } - } - } - } - - my %replacements; - if ($macro_args_default{$macro}){ - %replacements = %{$macro_args_default{$macro}}; - } - - # construct hashtable of text to replace - foreach my $i (0 .. 
$#args) { - my $argname = $macro_args{$macro}[$i]; - - if ($args[$i] =~ m/=/) { - # arg=val references the argument name - # XXX: I'm not sure what the expected behaviour if a lot of - # these are mixed with unnamed args - my @named_arg = split(/=/, $args[$i]); - $replacements{$named_arg[0]} = $named_arg[1]; - } elsif ($i > $#{$macro_args{$macro}}) { - # more args given than the macro has named args - # XXX: is vararg allowed on arguments before the last? - $argname = $macro_args{$macro}[-1]; - if ($argname =~ s/:vararg$//) { - $replacements{$argname} .= ", $args[$i]"; - } else { - die "Too many arguments to macro $macro"; - } - } else { - $argname =~ s/:vararg$//; - $replacements{$argname} = $args[$i]; - } - } - - # apply replacements as regex - foreach (@{$macro_lines{$macro}}) { - my $macro_line = $_; - # do replacements by longest first, this avoids wrong replacement - # when argument names are subsets of each other - foreach (reverse sort {length $a <=> length $b} keys %replacements) { - $macro_line =~ s/\\$_/$replacements{$_}/g; - } - $macro_line =~ s/\\\(\)//g; # remove \() - expand_macros($macro_line); - } - } else { - push(@pass1_lines, $line); - } -} - -close(ASMFILE) or exit 1; -open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; - -my @sections; -my $num_repts; -my $rept_lines; - -my %literal_labels; # for ldr <reg>, =<expr> -my $literal_num = 0; - -# pass 2: parse .rept and .if variants -# NOTE: since we don't implement a proper parser, using .rept with a -# variable assigned from .set is not supported -foreach my $line (@pass1_lines) { - # textual comparison .if - # this assumes nothing else on the same line - if ($line =~ /\.ifnb\s+(.*)/) { - if ($1) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } elsif ($line =~ /\.ifb\s+(.*)/) { - if ($1) { - $line = ".if 0\n"; - } else { - $line = ".if 1\n"; - } - } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) { - if ($1 eq $2) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } - - # handle .previous (only with regard to .section not .subsection) - if ($line =~ /\.(section|text|const_data)/) { - push(@sections, $line); - } elsif ($line =~ /\.previous/) { - if (!$sections[-2]) { - die ".previous without a previous section"; - } - $line = $sections[-2]; - push(@sections, $line); - } - - # handle ldr <reg>, =<expr> - if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) { - my $label = $literal_labels{$3}; - if (!$label) { - $label = ".Literal_$literal_num"; - $literal_num++; - $literal_labels{$3} = $label; - } - $line = "$1 ldr$2, $label\n"; - } elsif ($line =~ /\.ltorg/) { - foreach my $literal (keys %literal_labels) { - $line .= "$literal_labels{$literal}:\n .word $literal\n"; - } - %literal_labels = (); - } - - # @l -> lo16() @ha -> ha16() - $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g; - $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g; - - if ($line =~ /\.rept\s+(.*)/) { - $num_repts = $1; - $rept_lines = "\n"; - - # handle the possibility of repeating another directive on the same line - # .endr on the same line is not valid, I don't know if a non-directive is - if ($num_repts =~ s/(\.\w+.*)//) { - $rept_lines .= "$1\n"; - } - $num_repts = eval($num_repts); - } elsif ($line =~ /\.endr/) { - for (1 .. 
$num_repts) { - print ASMFILE $rept_lines; - } - $rept_lines = ''; - } elsif ($rept_lines) { - $rept_lines .= $line; - } else { - print ASMFILE $line; - } -} - -print ASMFILE ".text\n"; -foreach my $literal (keys %literal_labels) { - print ASMFILE "$literal_labels{$literal}:\n .word $literal\n"; -} - -close(ASMFILE) or exit 1;
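In short, the dropped gas-preprocessor.pl emulated the parts of the GNU gas preprocessor that Apple's assembler lacked at the time: it expanded .macro/.endm and .rept/.endr bodies itself, handled the .ifb/.ifnb/.ifc variants, commented out unsupported directives such as .func, .size and .ltorg, rewrote .global to .globl and .rodata sections to Mach-O .const_data, and turned ldr <reg>, =<expr> loads into .Literal_N constant-pool entries.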
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts
Deleted
-(directory)
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Deleted
@@ -1,85 +0,0 @@ -#ifndef __DATA_TYPE_CONVERSIONS_H__ -#define __DATA_TYPE_CONVERSIONS_H__ - -#include <stdint.h> -#include <wchar.h> - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus - -typedef int64_t __int64; -typedef int32_t __int32; -#ifdef __cplusplus -typedef bool BOOL; -#else -typedef uint32_t BOOL; -#endif // __cplusplus -typedef void* HMODULE; -typedef void* LPVOID; -typedef void* PVOID; -typedef PVOID HANDLE; -typedef HANDLE HWND; -typedef HANDLE HINSTANCE; -typedef void* HDC; -typedef void* HBITMAP; -typedef void* HICON; -typedef void* HFONT; -typedef void* HGDIOBJ; -typedef void* HBRUSH; -typedef void* HMMIO; -typedef void* HACMSTREAM; -typedef void* HACMDRIVER; -typedef void* HIC; -typedef void* HACMOBJ; -typedef HACMSTREAM* LPHACMSTREAM; -typedef void* HACMDRIVERID; -typedef void* LPHACMDRIVER; -typedef unsigned char BYTE; -typedef BYTE* LPBYTE; -typedef char TCHAR; -typedef TCHAR* LPTSTR; -typedef const TCHAR* LPCTSTR; -typedef char* LPSTR; -typedef LPSTR LPOLESTR; -typedef const char* LPCSTR; -typedef LPCSTR LPCOLESTR; -typedef wchar_t WCHAR; -typedef unsigned short WORD; -typedef unsigned int UINT; -typedef UINT MMRESULT; -typedef uint32_t DWORD; -typedef DWORD COLORREF; -typedef DWORD FOURCC; -typedef DWORD HRESULT; -typedef DWORD* LPDWORD; -typedef DWORD* DWORD_PTR; -typedef int32_t LONG; -typedef int32_t* LONG_PTR; -typedef LONG_PTR LRESULT; -typedef uint32_t ULONG; -typedef uint32_t* ULONG_PTR; -//typedef __int64_t intptr_t; -typedef uint64_t _fsize_t; - - -// -// Structures -// - -typedef struct _GUID { - DWORD Data1; - WORD Data2; - WORD Data3; - BYTE Data4[8]; -} GUID; - -typedef GUID REFIID; -typedef GUID CLSID; -typedef CLSID* LPCLSID; -typedef GUID IID; - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus -#endif // __DATA_TYPE_CONVERSIONS_H__
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Deleted
@@ -1,77 +0,0 @@ -#ifndef __WINDOWS2LINUX_H__ -#define __WINDOWS2LINUX_H__ - -/* - * LINUX SPECIFIC DEFINITIONS -*/ -// -// Data types conversions -// -#include <stdlib.h> -#include <string.h> -#include "basicDataTypeConversions.h" - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus -// -// purposefully define the following MSFT definitions -// to mean nothing (as they do not mean anything on Linux) -// -#define __stdcall -#define __cdecl -#define noreturn -#define __declspec(x) -#define STDAPI extern "C" HRESULT -#define STDMETHODIMP HRESULT __stdcall -#define STDMETHODIMP_(x) x __stdcall - -#define STDMETHOD(x) virtual HRESULT x -#define STDMETHOD_(a, x) virtual a x - -#ifndef TRUE -#define TRUE true -#endif - -#ifndef FALSE -#define FALSE false -#endif - -#define S_OK (0x00000000) -#define S_FALSE (0x00000001) -#define E_NOINTERFACE (0X80004002) -#define E_POINTER (0x80004003) -#define E_FAIL (0x80004005) -#define E_OUTOFMEMORY (0x8007000E) - -#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) -#define FAILED(hr) ((hr) & 0x80000000) -#define SUCCEEDED(hr) (!FAILED(hr)) - - -// -// Functions -// -#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) -#define MAKEWORD(a,b) ((a << 8) | (b)) - -#define lstrlen strlen -#define lstrcpy strcpy -#define lstrcmpi strcasecmp -#define _stricmp strcasecmp -#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) -#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) -// Windows uses (new, old) ordering but GCC has (old, new) -#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) - -#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) -#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) -#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) - -#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus - -#endif // __WINDOWS2LINUX_H__
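Two details of the removed shim are worth noting: MulDiv performs the Win32-style rounded division in 64-bit arithmetic, e.g. with hypothetical values MulDiv(3, 10, 4) = (3*10 + 4/2)/4 = 8, that is 7.5 rounded to the nearest integer; and InterlockedCompareExchange swaps its comparand/exchange arguments because GCC's __sync_val_compare_and_swap takes (old, new) while the Windows API takes (new, old), as the comment in the header points out.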
x264-snapshot-20141218-2245.tar.bz2/AUTHORS -> x264-snapshot-20150804-2245.tar.bz2/AUTHORS
Changed
@@ -1,8 +1,8 @@
 # Contributors to x264
-# 
+#
 # The format of this file was inspired by the Linux kernel CREDITS file.
 # Authors are listed alphabetically.
-# 
+#
 # The fields are: name (N), email (E), web-address (W), CVS account login (C),
 # PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
x264-snapshot-20141218-2245.tar.bz2/Makefile -> x264-snapshot-20150804-2245.tar.bz2/Makefile
Changed
@@ -87,12 +87,12 @@ endif X86SRC = $(X86SRC0:%=common/x86/%) -ifeq ($(ARCH),X86) +ifeq ($(SYS_ARCH),X86) ARCH_X86 = yes ASMSRC = $(X86SRC) common/x86/pixel-32.asm endif -ifeq ($(ARCH),X86_64) +ifeq ($(SYS_ARCH),X86_64) ARCH_X86 = yes ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm endif @@ -106,7 +106,7 @@ endif # AltiVec optims -ifeq ($(ARCH),PPC) +ifeq ($(SYS_ARCH),PPC) ifneq ($(AS),) SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \ common/ppc/quant.c common/ppc/deblock.c \ @@ -115,7 +115,7 @@ endif # NEON optims -ifeq ($(ARCH),ARM) +ifeq ($(SYS_ARCH),ARM) ifneq ($(AS),) ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \ common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \ @@ -126,20 +126,32 @@ endif # AArch64 NEON optims -ifeq ($(ARCH),AARCH64) +ifeq ($(SYS_ARCH),AARCH64) ifneq ($(AS),) -ASMSRC += common/aarch64/dct-a.S \ +ASMSRC += common/aarch64/bitstream-a.S \ + common/aarch64/cabac-a.S \ + common/aarch64/dct-a.S \ common/aarch64/deblock-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S -SRCS += common/aarch64/mc-c.c \ +SRCS += common/aarch64/asm-offsets.c \ + common/aarch64/mc-c.c \ common/aarch64/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif +# MSA optims +ifeq ($(SYS_ARCH),MIPS) +ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),) +SRCS += common/mips/mc-c.c common/mips/dct-c.c \ + common/mips/deblock-c.c common/mips/pixel-c.c \ + common/mips/predict-c.c common/mips/quant-c.c +endif +endif + ifneq ($(HAVE_GETOPT_LONG),1) SRCCLI += extras/getopt.c endif @@ -264,7 +276,7 @@ rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean - rm -f config.mak x264_config.h config.h config.log x264.pc x264.def + rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir)
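Besides the mechanical ARCH to SYS_ARCH rename applied to every per-architecture conditional, the new MIPS block only adds the MSA-optimized C sources when the findstring test sees "HAVE_MSA 1" in $(CONFIG) (presumably the contents of the configure-generated config.h), so a plain MIPS build without MSA support keeps using the generic C paths.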
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.c
Added
@@ -0,0 +1,42 @@ +/***************************************************************************** + * asm-offsets.c: check asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "asm-offsets.h" + +#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \ +{ \ + int m_##m[2 * (offsetof(s, m) == o) - 1]; \ +} + +X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW); +X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE); +X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE); +X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING); +X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START); +X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P); +X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END); +X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED); +X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
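The X264_CHECK_OFFSET macro is a compile-time assertion: the array dimension evaluates to +1 when offsetof(s, m) matches the expected constant and to -1 otherwise, and a negative array size makes the compiler reject the build. A minimal standalone sketch of the same trick, using a hypothetical struct rather than x264_cabac_t:

#include <stddef.h>

/* Hypothetical two-field coder state; the offsets below assume a 4-byte int. */
typedef struct { int low; int range; } coder_t;

/* Array size is 1 if the offset matches, -1 (a compile error) if it does not. */
#define CHECK_OFFSET(s, m, o) \
    struct check_##s##_##m { int pad[2 * (offsetof(s, m) == (o)) - 1]; }

CHECK_OFFSET(coder_t, low,   0);
CHECK_OFFSET(coder_t, range, 4);  /* writing 8 here would fail to compile */

int main(void) { return 0; }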
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.h
Added
@@ -0,0 +1,39 @@ +/***************************************************************************** + * asm-offsets.h: asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_ASM_OFFSETS_H +#define X264_AARCH64_ASM_OFFSETS_H + +#define CABAC_I_LOW 0x00 +#define CABAC_I_RANGE 0x04 +#define CABAC_I_QUEUE 0x08 +#define CABAC_I_BYTES_OUTSTANDING 0x0c +#define CABAC_P_START 0x10 +#define CABAC_P 0x18 +#define CABAC_P_END 0x20 +#define CABAC_F8_BITS_ENCODED 0x30 +#define CABAC_STATE 0x34 + +#endif
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm.S
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * asm.S: AArch64 utility macros
  *****************************************************************************
- * Copyright (C) 2008-2014 x264 project
+ * Copyright (C) 2008-2015 x264 project
  *
  * Authors: Mans Rullgard <mans@mansr.com>
  *          David Conrad <lessen42@gmail.com>
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/bitstream-a.S
Added
@@ -0,0 +1,82 @@ +/***************************************************************************** + * bitstream-a.S: aarch64 bitstream functions + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +function x264_nal_escape_neon, export=1 + movi v0.16b, #0xff + movi v4.16b, #4 + mov w3, #3 + subs x6, x1, x2 + cbz x6, 99f +0: + cmn x6, #15 + b.lt 16f + mov x1, x2 + b 100f +16: + ld1 {v1.16b}, [x1], #16 + ext v2.16b, v0.16b, v1.16b, #14 + ext v3.16b, v0.16b, v1.16b, #15 + cmhi v7.16b, v4.16b, v1.16b + cmeq v5.16b, v2.16b, #0 + cmeq v6.16b, v3.16b, #0 + and v5.16b, v5.16b, v7.16b + and v5.16b, v5.16b, v6.16b + shrn v7.8b, v5.8h, #4 + mov x7, v7.d[0] + cbz x7, 16f + mov x6, #-16 +100: + umov w5, v0.b[14] + umov w4, v0.b[15] + orr w5, w4, w5, lsl #8 +101: + ldrb w4, [x1, x6] + orr w9, w4, w5, lsl #16 + cmp w9, #3 + b.hi 102f + strb w3, [x0], #1 + orr w5, w3, w5, lsl #8 +102: + adds x6, x6, #1 + strb w4, [x0], #1 + orr w5, w4, w5, lsl #8 + b.lt 101b + subs x6, x1, x2 + lsr w9, w5, #8 + mov v0.b[14], w9 + mov v0.b[15], w5 + b.lt 0b + + ret +16: + subs x6, x1, x2 + st1 {v1.16b}, [x0], #16 + mov v0.16b, v1.16b + b.lt 0b +99: + ret +endfunc
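x264_nal_escape_neon vectorizes H.264 start-code emulation prevention: whenever two consecutive zero bytes in the payload would be followed by a byte with value 3 or less, an extra 0x03 byte is inserted. A scalar C sketch of that rule (written here for clarity, not copied from x264's reference implementation):

#include <stdint.h>

/* Copy [src, end) to dst, inserting 0x03 after any pair of zero bytes that
 * would otherwise be followed by a byte <= 3. Returns the new end of dst. */
uint8_t *nal_escape(uint8_t *dst, const uint8_t *src, const uint8_t *end)
{
    if (src < end) *dst++ = *src++;  /* the first two bytes never need an escape */
    if (src < end) *dst++ = *src++;
    while (src < end)
    {
        if (src[0] <= 0x03 && !dst[-1] && !dst[-2])
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}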
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/cabac-a.S
Added
@@ -0,0 +1,122 @@ +/***************************************************************************** + * cabac-a.S: aarch64 cabac + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" +#include "asm-offsets.h" + +// w11 holds x264_cabac_t.i_low +// w12 holds x264_cabac_t.i_range + +function x264_cabac_encode_decision_asm, export=1 + movrel x8, X(x264_cabac_range_lps) + movrel x9, X(x264_cabac_transition) + add w10, w1, #CABAC_STATE + ldrb w3, [x0, x10] // i_state + ldr w12, [x0, #CABAC_I_RANGE] + and x4, x3, #~1 + asr w5, w12, #6 + add x8, x8, x4, lsl #1 + sub w5, w5, #4 + eor w6, w2, w3 // b ^ i_state + ldrb w4, [x8, x5] // i_range_lps + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, w4 + tbz w6, #0, 1f // (b ^ i_state) & 1 + add w11, w11, w12 + mov w12, w4 +1: + orr w4, w2, w3, lsl #1 + ldrb w9, [x9, x4] + strb w9, [x0, x10] // i_state + +cabac_encode_renorm: + clz w5, w12 + ldr w2, [x0, #CABAC_I_QUEUE] + sub w5, w5, #23 + lsl w12, w12, w5 + lsl w11, w11, w5 +2: + adds w2, w2, w5 + str w12, [x0, #CABAC_I_RANGE] + b.lt 0f +cabac_putbyte: + mov w13, #0x400 + add w12, w2, #10 + lsl w13, w13, w2 + asr w4, w11, w12 // out + sub w2, w2, #8 + sub w13, w13, #1 + subs w5, w4, #0xff + and w11, w11, w13 + ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + str w2, [x0, #CABAC_I_QUEUE] + b.ne 1f + + add w6, w6, #1 + str w11, [x0, #CABAC_I_LOW] + str w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + ret + +1: + ldr x7, [x0, #CABAC_P] + asr w5, w4, #8 // carry + ldrb w8, [x7, #-1] + add w8, w8, w5 + sub w5, w5, #1 + strb w8, [x7, #-1] + cbz w6, 3f +2: + subs w6, w6, #1 + strb w5, [x7], #1 + b.gt 2b +3: + strb w4, [x7], #1 + str wzr, [x0, #CABAC_I_BYTES_OUTSTANDING] + str x7, [x0, #CABAC_P] +0: + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function x264_cabac_encode_bypass_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + ldr w2, [x0, #CABAC_I_QUEUE] + and w1, w1, w12 + add w11, w1, w11, lsl #1 + adds w2, w2, #1 + b.ge cabac_putbyte + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function x264_cabac_encode_terminal_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, #2 + b cabac_encode_renorm +endfunc
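These CABAC routines access x264_cabac_t purely through the byte offsets defined in asm-offsets.h (CABAC_I_LOW, CABAC_I_RANGE, CABAC_P and so on), which is why the asm-offsets.c checks above exist: if the C struct layout ever changes, the build breaks loudly instead of the assembly silently reading the wrong fields.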
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** - * dct-a.S: AArch6464 transform and zigzag + * dct-a.S: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,25 @@ .byte 26,27, 28,29, 22,23, 30,31 endconst +const scan4x4_field, align=4 +.byte 0,1, 2,3, 8,9, 4,5 +.byte 6,7, 10,11, 12,13, 14,15 +endconst + +const sub4x4_frame, align=4 +.byte 0, 1, 4, 8 +.byte 5, 2, 3, 6 +.byte 9, 12, 13, 10 +.byte 7, 11, 14, 15 +endconst + +const sub4x4_field, align=4 +.byte 0, 4, 1, 8 +.byte 12, 5, 9, 13 +.byte 2, 6, 10, 14 +.byte 3, 7, 11, 15 +endconst + // sum = a + (b>>shift) sub = (a>>shift) - b .macro SUMSUB_SHR shift sum sub a b t0 t1 sshr \t0, \b, #\shift @@ -602,56 +622,99 @@ ret endfunc +.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 + ld1 {\t0\().8b}, [x1], x3 + ld1 {\t1\().8b}, [x2], x4 + ld1 {\t2\().8b}, [x1], x3 + ld1 {\t3\().8b}, [x2], x4 + usubl \t0\().8h, \t0\().8b, \t1\().8b + ld1 {\t4\().8b}, [x1], x3 + ld1 {\t5\().8b}, [x2], x4 + usubl \t1\().8h, \t2\().8b, \t3\().8b + ld1 {\t6\().8b}, [x1], x3 + ld1 {\t7\().8b}, [x2], x4 + add \dst\().8h, \t0\().8h, \t1\().8h + usubl \t2\().8h, \t4\().8b, \t5\().8b + usubl \t3\().8h, \t6\().8b, \t7\().8b + add \dst\().8h, \dst\().8h, \t2\().8h + add \dst\().8h, \dst\().8h, \t3\().8h +.endm + function x264_sub8x8_dct_dc_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE - ld1 {v16.8b}, [x1], x3 - ld1 {v17.8b}, [x2], x4 - usubl v16.8h, v16.8b, v17.8b - ld1 {v18.8b}, [x1], x3 - ld1 {v19.8b}, [x2], x4 - usubl v17.8h, v18.8b, v19.8b - ld1 {v20.8b}, [x1], x3 - ld1 {v21.8b}, [x2], x4 - usubl v18.8h, v20.8b, v21.8b - ld1 {v22.8b}, [x1], x3 - add v0.8h, v16.8h, v17.8h - ld1 {v23.8b}, [x2], x4 - usubl v19.8h, v22.8b, v23.8b - ld1 {v24.8b}, [x1], x3 - add v0.8h, v0.8h, v18.8h - ld1 {v25.8b}, [x2], x4 - usubl v20.8h, v24.8b, v25.8b - ld1 {v26.8b}, [x1], x3 - add v0.8h, v0.8h, v19.8h - ld1 {v27.8b}, [x2], x4 - usubl v21.8h, v26.8b, v27.8b - ld1 {v28.8b}, [x1], x3 - ld1 {v29.8b}, [x2], x4 - usubl v22.8h, v28.8b, v29.8b - ld1 {v30.8b}, [x1], x3 - add v1.8h, v20.8h, v21.8h - ld1 {v31.8b}, [x2], x4 - usubl v23.8h, v30.8b, v31.8b - add v1.8h, v1.8h, v22.8h - add v1.8h, v1.8h, v23.8h + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v0.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v0.8h - transpose v2.2d, v3.2d, v0.2d, v1.2d + st1 {v0.4h}, [x0] + ret +endfunc + +function x264_sub8x16_dct_dc_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v4.8h, v0.8h, v2.8h + addp v5.8h, v1.8h, v3.8h + + transpose v2.4s, v3.4s, 
v4.4s, v5.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + transpose v2.4s, v3.4s, v0.4s, v1.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + trn1 v2.2d, v0.2d, v1.2d + trn2 v3.2d, v1.2d, v0.2d addp v0.8h, v2.8h, v3.8h - addp v0.8h, v0.8h, v0.8h - st1 {v0.4h}, [x0] + st1 {v0.8h}, [x0] + ret +endfunc + +function x264_zigzag_interleave_8x8_cavlc_neon, export=1 + mov x3, #7 + movi v31.4s, #1 + ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 + ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 + umax v16.8h, v0.8h, v4.8h + umax v17.8h, v1.8h, v5.8h + umax v18.8h, v2.8h, v6.8h + umax v19.8h, v3.8h, v7.8h + st1 {v0.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v17.8h + umaxp v18.8h, v18.8h, v19.8h + st1 {v1.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v18.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + cmhi v16.4s, v16.4s, v31.4s + st1 {v3.8h}, [x0], #16 + and v16.16b, v16.16b, v31.16b + st1 {v7.8h}, [x0], #16 + st1 {v16.b}[0], [x2], #1 + st1 {v16.b}[4], [x2], x3 + st1 {v16.b}[8], [x2], #1 + st1 {v16.b}[12], [x2] ret endfunc @@ -664,3 +727,282 @@ st1 {v2.16b,v3.16b}, [x0] ret endfunc + +.macro zigzag_sub_4x4 f ac +function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1 + mov x9, #FENC_STRIDE + mov x4, #FDEC_STRIDE + movrel x5, sub4x4_\f + mov x6, x2 + ld1 {v0.s}[0], [x1], x9 + ld1 {v0.s}[1], [x1], x9 + ld1 {v0.s}[2], [x1], x9 + ld1 {v0.s}[3], [x1], x9 + ld1 {v16.16b}, [x5] + ld1 {v1.s}[0], [x2], x4 + ld1 {v1.s}[1], [x2], x4 + ld1 {v1.s}[2], [x2], x4 + ld1 {v1.s}[3], [x2], x4 + tbl v2.16b, {v0.16b}, v16.16b + tbl v3.16b, {v1.16b}, v16.16b + st1 {v0.s}[0], [x6], x4 + usubl v4.8h, v2.8b, v3.8b +.ifc \ac, ac + dup h7, v4.h[0] + ins v4.h[0], wzr + fmov w5, s7 + strh w5, [x3] +.endif + usubl2 v5.8h, v2.16b, v3.16b + st1 {v0.s}[1], [x6], x4 + umax v6.8h, v4.8h, v5.8h + umaxv h6, v6.8h + st1 {v0.s}[2], [x6], x4 + fmov w7, s6 + st1 {v0.s}[3], [x6], x4 + cmp w7, #0 + st1 {v4.8h,v5.8h}, [x0] + cset w0, ne + ret +endfunc +.endm + +zigzag_sub_4x4 field +zigzag_sub_4x4 field, ac +zigzag_sub_4x4 frame +zigzag_sub_4x4 frame, ac + +function x264_zigzag_scan_4x4_field_neon, export=1 + movrel x2, scan4x4_field + ld1 {v0.8h,v1.8h}, [x1] + ld1 {v16.16b}, [x2] + tbl v0.16b, {v0.16b}, v16.16b + st1 {v0.8h,v1.8h}, [x0] + ret +endfunc + +function x264_zigzag_scan_8x8_frame_neon, export=1 + movrel x2, scan8x8_frame + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b,v23.16b}, [x2], #32 + tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b + tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b + tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b + tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b + tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b + mov v25.h[6], v4.h[0] + mov v25.h[7], v5.h[0] + mov v26.h[0], v4.h[1] + mov v27.h[4], v7.h[0] + mov v28.h[7], v4.h[4] + mov v29.h[7], v3.h[6] + mov v30.h[0], v2.h[7] + mov v30.h[1], v3.h[7] + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], #32 + st1 {v30.8h,v31.8h}, [x0] + ret +endfunc + +#define Z(z) 2*(z), 2*(z)+1 +#define T(x,y) Z(x*8+y) +const scan8x8_frame, align=5 + .byte T(0,0), 
T(1,0), T(0,1), T(0,2) + .byte T(1,1), T(2,0), T(3,0), T(2,1) + .byte T(1,2), T(0,3), T(0,4), T(1,3) + .byte T(2,2), T(3,1), T(4,0), T(5,0) + .byte T(4,1), T(3,2), T(2,3), T(1,4) + .byte T(0,5), T(0,6), T(1,5), T(2,4) +#undef T +#define T(x,y) Z((x-3)*8+y) + .byte T(3,3), T(4,2), T(5,1), T(6,0) + .byte T(7,0), T(6,1), T(5,2), T(4,3) +#undef T +#define T(x,y) Z((x-0)*8+y) + .byte T(3,4), T(2,5), T(1,6), T(0,7) + .byte T(1,7), T(2,6), T(3,5), T(4,4) +#undef T +#define T(x,y) Z((x-4)*8+y) + .byte T(5,3), T(6,2), T(7,1), T(7,2) + .byte T(6,3), T(5,4), T(4,5), T(3,6) + .byte T(2,7), T(3,7), T(4,6), T(5,5) + .byte T(6,4), T(7,3), T(7,4), T(6,5) + .byte T(5,6), T(4,7), T(5,7), T(6,6) + .byte T(7,5), T(7,6), T(6,7), T(7,7) +endconst + +function x264_zigzag_scan_8x8_field_neon, export=1 + movrel x2, scan8x8_field + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b}, [x2] + ext v31.16b, v7.16b, v7.16b, #4 + tbl v24.16b, {v0.16b,v1.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b + tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b + tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b + tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b + ext v31.16b, v6.16b, v31.16b, #12 + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], #32 + st1 {v30.8h,v31.8h}, [x0] + ret +endfunc + +.macro zigzag_sub8x8 f +function x264_zigzag_sub_8x8_\f\()_neon, export=1 + movrel x4, sub8x8_\f + mov x5, #FENC_STRIDE + mov x6, #FDEC_STRIDE + mov x7, x2 + ld1 {v0.d}[0], [x1], x5 + ld1 {v0.d}[1], [x1], x5 + ld1 {v1.d}[0], [x1], x5 + ld1 {v1.d}[1], [x1], x5 + ld1 {v2.d}[0], [x1], x5 + ld1 {v2.d}[1], [x1], x5 + ld1 {v3.d}[0], [x1], x5 + ld1 {v3.d}[1], [x1] + ld1 {v4.d}[0], [x2], x6 + ld1 {v4.d}[1], [x2], x6 + ld1 {v5.d}[0], [x2], x6 + ld1 {v5.d}[1], [x2], x6 + ld1 {v6.d}[0], [x2], x6 + ld1 {v6.d}[1], [x2], x6 + ld1 {v7.d}[0], [x2], x6 + ld1 {v7.d}[1], [x2] + ld1 {v16.16b,v17.16b}, [x4], #32 + ld1 {v18.16b,v19.16b}, [x4], #32 + tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b + tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b + tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b + tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b + tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b + usubl v4.8h, v24.8b, v28.8b + usubl2 v5.8h, v24.16b, v28.16b + usubl v6.8h, v25.8b, v29.8b + usubl2 v7.8h, v25.16b, v29.16b + usubl v16.8h, v26.8b, v30.8b + usubl2 v17.8h, v26.16b, v30.16b + usubl v18.8h, v27.8b, v31.8b + usubl2 v19.8h, v27.16b, v31.16b + umax v20.8h, v4.8h, v5.8h + umax v21.8h, v6.8h, v7.8h + umax v22.8h, v16.8h, v17.8h + umax v23.8h, v18.8h, v19.8h + umax v20.8h, v20.8h, v21.8h + umax v21.8h, v22.8h, v23.8h + umax v20.8h, v20.8h, v21.8h + umaxv h22, v20.8h + st1 {v0.d}[0], [x7], x6 + st1 {v0.d}[1], [x7], x6 + st1 {v1.d}[0], [x7], x6 + st1 {v1.d}[1], [x7], x6 + st1 {v2.d}[0], [x7], x6 + st1 {v2.d}[1], [x7], x6 + st1 {v3.d}[0], [x7], x6 + st1 {v3.d}[1], [x7] + st1 {v4.8h,v5.8h}, [x0], #32 + st1 {v6.8h,v7.8h}, [x0], #32 + st1 {v16.8h,v17.8h}, [x0], #32 + st1 {v18.8h,v19.8h}, [x0] + fmov w9, s22 + cmp w9, #0 + cset w0, ne + ret 
+endfunc +.endm + +zigzag_sub8x8 field +zigzag_sub8x8 frame + +#undef T +#define T(x,y) Z(x*8+y) +const scan8x8_field, align=5 + .byte T(0,0), T(0,1), T(0,2), T(1,0) + .byte T(1,1), T(0,3), T(0,4), T(1,2) + .byte T(2,0), T(1,3), T(0,5), T(0,6) + .byte T(0,7), T(1,4), T(2,1), T(3,0) +#undef T +#define T(x,y) Z((x-1)*8+y) + .byte T(2,2), T(1,5), T(1,6), T(1,7) + .byte T(2,3), T(3,1), T(4,0), T(3,2) +#undef T +#define T(x,y) Z((x-2)*8+y) + .byte T(2,4), T(2,5), T(2,6), T(2,7) + .byte T(3,3), T(4,1), T(5,0), T(4,2) +#undef T +#define T(x,y) Z((x-3)*8+y) + .byte T(3,4), T(3,5), T(3,6), T(3,7) + .byte T(4,3), T(5,1), T(6,0), T(5,2) +#undef T +#define T(x,y) Z((x-4)*8+y) + .byte T(4,4), T(4,5), T(4,6), T(4,7) + .byte T(5,3), T(6,1), T(6,2), T(5,4) +#undef T +#define T(x,y) Z((x-5)*8+y) + .byte T(5,5), T(5,6), T(5,7), T(6,3) + .byte T(7,0), T(7,1), T(6,4), T(6,5) +endconst + + +#undef T +#define T(y,x) x*8+y +const sub8x8_frame, align=5 + .byte T(0,0), T(1,0), T(0,1), T(0,2) + .byte T(1,1), T(2,0), T(3,0), T(2,1) + .byte T(1,2), T(0,3), T(0,4), T(1,3) + .byte T(2,2), T(3,1), T(4,0), T(5,0) + .byte T(4,1), T(3,2), T(2,3), T(1,4) + .byte T(0,5), T(0,6), T(1,5), T(2,4) + .byte T(3,3), T(4,2), T(5,1), T(6,0) + .byte T(7,0), T(6,1), T(5,2), T(4,3) + .byte T(3,4), T(2,5), T(1,6), T(0,7) + .byte T(1,7), T(2,6), T(3,5), T(4,4) + .byte T(5,3), T(6,2), T(7,1), T(7,2) + .byte T(6,3), T(5,4), T(4,5), T(3,6) + .byte T(2,7), T(3,7), T(4,6), T(5,5) + .byte T(6,4), T(7,3), T(7,4), T(6,5) + .byte T(5,6), T(4,7), T(5,7), T(6,6) + .byte T(7,5), T(7,6), T(6,7), T(7,7) +endconst + +const sub8x8_field, align=5 + .byte T(0,0), T(0,1), T(0,2), T(1,0) + .byte T(1,1), T(0,3), T(0,4), T(1,2) + .byte T(2,0), T(1,3), T(0,5), T(0,6) + .byte T(0,7), T(1,4), T(2,1), T(3,0) + .byte T(2,2), T(1,5), T(1,6), T(1,7) + .byte T(2,3), T(3,1), T(4,0), T(3,2) + .byte T(2,4), T(2,5), T(2,6), T(2,7) + .byte T(3,3), T(4,1), T(5,0), T(4,2) + .byte T(3,4), T(3,5), T(3,6), T(3,7) + .byte T(4,3), T(5,1), T(6,0), T(5,2) + .byte T(4,4), T(4,5), T(4,6), T(4,7) + .byte T(5,3), T(6,1), T(6,2), T(5,4) + .byte T(5,5), T(5,6), T(5,7), T(6,3) + .byte T(7,0), T(7,1), T(6,4), T(6,5) + .byte T(6,6), T(6,7), T(7,2), T(7,3) + .byte T(7,4), T(7,5), T(7,6), T(7,7) +endconst
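The zigzag kernels are table-driven shuffles: each scan constant lists, for every output coefficient, the two byte indices of the matching 16-bit input coefficient (the Z(z) macro emits 2*z, 2*z+1), so a few tbl instructions permute a whole block at once. The scalar equivalent is just a permutation loop; a minimal sketch using the standard 4x4 frame scan order (the same order as the sub4x4_frame table above):

#include <stdint.h>

typedef int16_t dctcoef;

/* 4x4 frame (progressive) zigzag order. */
static const uint8_t scan4x4_frame[16] =
    { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

void zigzag_scan_4x4_frame(dctcoef level[16], const dctcoef dct[16])
{
    for (int i = 0; i < 16; i++)
        level[i] = dct[scan4x4_frame[i]];
}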
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** - * dct.h: AArch64 transform and zigzag + * dct.h: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,7 @@ void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); @@ -48,5 +50,18 @@ void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] ); + +int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); +int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); + +int x264_zigzag_sub_8x8_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); + +void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: aarch64 deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * Janne Grunau <janne-x264@jannau.net> @@ -180,6 +180,202 @@ ret endfunc +.macro h264_loop_filter_start_intra + orr w4, w2, w3 + cmp w4, #0 + b.ne 1f + ret +1: + dup v30.16b, w2 // alpha + dup v31.16b, w3 // beta +.endm + +.macro h264_loop_filter_luma_intra + uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) + uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) + uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) + cmhi v19.16b, v30.16b, v16.16b // < alpha + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + movi v29.16b, #2 + ushr v30.16b, v30.16b, #2 // alpha >> 2 + add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 + cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + + and v19.16b, v19.16b, v17.16b + and v19.16b, v19.16b, v18.16b + shrn v20.8b, v19.8h, #4 + mov x4, v20.d[0] + cbz x4, 9f + + ushll v20.8h, v6.8b, #1 + ushll v22.8h, v1.8b, #1 + ushll2 v21.8h, v6.16b, #1 + ushll2 v23.8h, v1.16b, #1 + uaddw v20.8h, v20.8h, v7.8b + uaddw v22.8h, v22.8h, v0.8b + uaddw2 v21.8h, v21.8h, v7.16b + uaddw2 v23.8h, v23.8h, v0.16b + uaddw v20.8h, v20.8h, v1.8b + uaddw v22.8h, v22.8h, v6.8b + uaddw2 v21.8h, v21.8h, v1.16b + uaddw2 v23.8h, v23.8h, v6.16b + + rshrn v24.8b, v20.8h, #2 // p0'_1 + rshrn v25.8b, v22.8h, #2 // q0'_1 + rshrn2 v24.16b, v21.8h, #2 // p0'_1 + rshrn2 v25.16b, v23.8h, #2 // q0'_1 + + uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) + uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + and v17.16b, v16.16b, v17.16b // if_2 && if_3 + and v18.16b, v16.16b, v18.16b // if_2 && if_4 + + not v30.16b, v17.16b + not v31.16b, v18.16b + + and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) + and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) + + and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 + and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 + + //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 + uaddl v26.8h, v5.8b, v7.8b + uaddl2 v27.8h, v5.16b, v7.16b + uaddw v26.8h, v26.8h, v0.8b + uaddw2 v27.8h, v27.8h, v0.16b + add v20.8h, v20.8h, v26.8h + add v21.8h, v21.8h, v27.8h + uaddw v20.8h, v20.8h, v0.8b + uaddw2 v21.8h, v21.8h, v0.16b + rshrn v20.8b, v20.8h, #3 // p0'_2 + rshrn2 v20.16b, v21.8h, #3 // p0'_2 + uaddw v26.8h, v26.8h, v6.8b + uaddw2 v27.8h, v27.8h, v6.16b + rshrn v21.8b, v26.8h, #2 // p1'_2 + rshrn2 v21.16b, v27.8h, #2 // p1'_2 + uaddl v28.8h, v4.8b, v5.8b + uaddl2 v29.8h, v4.16b, v5.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v19.8b, v28.8h, #3 // p2'_2 + rshrn2 v19.16b, v29.8h, #3 // p2'_2 + + //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 + uaddl v26.8h, v2.8b, v0.8b + uaddl2 v27.8h, v2.16b, v0.16b + uaddw v26.8h, v26.8h, v7.8b + uaddw2 v27.8h, v27.8h, v7.16b + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + uaddw v22.8h, v22.8h, v7.8b + uaddw2 v23.8h, v23.8h, v7.16b + rshrn v22.8b, v22.8h, #3 // q0'_2 + rshrn2 v22.16b, v23.8h, #3 // q0'_2 + uaddw v26.8h, v26.8h, v1.8b + uaddw2 v27.8h, v27.8h, v1.16b + rshrn v23.8b, v26.8h, #2 // q1'_2 + rshrn2 v23.16b, v27.8h, #2 // q1'_2 + uaddl v28.8h, v2.8b, v3.8b + uaddl2 v29.8h, v2.16b, v3.16b + shl v28.8h, 
v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v26.8b, v28.8h, #3 // q2'_2 + rshrn2 v26.16b, v29.8h, #3 // q2'_2 + + bit v7.16b, v24.16b, v30.16b // p0'_1 + bit v0.16b, v25.16b, v31.16b // q0'_1 + bit v7.16b, v20.16b, v17.16b // p0'_2 + bit v6.16b, v21.16b, v17.16b // p1'_2 + bit v5.16b, v19.16b, v17.16b // p2'_2 + bit v0.16b, v22.16b, v18.16b // q0'_2 + bit v1.16b, v23.16b, v18.16b // q1'_2 + bit v2.16b, v26.16b, v18.16b // q2'_2 +.endm + +function x264_deblock_v_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + ld1 {v0.16b}, [x0], x1 // q0 + ld1 {v1.16b}, [x0], x1 // q1 + ld1 {v2.16b}, [x0], x1 // q2 + ld1 {v3.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #3 + ld1 {v4.16b}, [x0], x1 // p3 + ld1 {v5.16b}, [x0], x1 // p2 + ld1 {v6.16b}, [x0], x1 // p1 + ld1 {v7.16b}, [x0] // p0 + + h264_loop_filter_luma_intra + + sub x0, x0, x1, lsl #1 + st1 {v5.16b}, [x0], x1 // p2 + st1 {v6.16b}, [x0], x1 // p1 + st1 {v7.16b}, [x0], x1 // p0 + st1 {v0.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x0] // q2 +9: + ret +endfunc + +function x264_deblock_h_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, #4 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + h264_loop_filter_luma_intra + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + sub x0, x0, x1, lsl #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 +9: + ret +endfunc + .macro h264_loop_filter_chroma dup v22.16b, w2 // alpha uxtl v24.8h, v24.8b @@ -247,6 +443,7 @@ h264_loop_filter_start sub x0, x0, #4 +deblock_h_chroma: ld1 {v18.d}[0], [x0], x1 ld1 {v16.d}[0], [x0], x1 ld1 {v0.d}[0], [x0], x1 @@ -275,6 +472,229 @@ ret endfunc +function x264_deblock_h_chroma_422_neon, export=1 + add x5, x0, x1 + add x1, x1, x1 + mov x7, x30 + bl X(x264_deblock_h_chroma_neon) + ldr w6, [x4] + mov x30, x7 + sub x0, x5, #4 + mov v24.s[0], w6 + b deblock_h_chroma +endfunc + +.macro h264_loop_filter_chroma8 + dup v22.8b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) + uxtl v4.8h, v17.8b + uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0) + usubw v4.8h, v4.8h, v16.8b + sli v24.8h, v24.8h, #8 + shl v4.8h, v4.8h, #2 + uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0) + uaddw v4.8h, v4.8h, v18.8b + cmhi v26.8b, v22.8b, v26.8b // < alpha + usubw v4.8h, v4.8h, v19.8b + dup v22.8b, w3 // beta + rshrn v4.8b, v4.8h, #3 + cmhi v28.8b, v22.8b, v28.8b // < beta + cmhi v30.8b, v22.8b, v30.8b // < beta + smin v4.8b, v4.8b, v24.8b + neg v25.8b, v24.8b + and v26.8b, v26.8b, v28.8b + smax v4.8b, v4.8b, v25.8b + and v26.8b, v26.8b, v30.8b + uxtl v22.8h, v17.8b + and v4.8b, v4.8b, v26.8b + uxtl v28.8h, v16.8b + saddw v28.8h, v28.8h, v4.8b + ssubw v22.8h, v22.8h, v4.8b + sqxtun v16.8b, v28.8h + 
sqxtun v17.8b, v22.8h +.endm + +function x264_deblock_h_chroma_mbaff_neon, export=1 + h264_loop_filter_start + + sub x4, x0, #4 + sub x0, x0, #2 + + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4] + + transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31 + + h264_loop_filter_chroma8 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0] + + ret +endfunc + +.macro h264_loop_filter_chroma_intra, width=16 + uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0) + uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0) + uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0) + cmhi v26.16b, v30.16b, v26.16b // < alpha + cmhi v27.16b, v31.16b, v27.16b // < beta + cmhi v28.16b, v31.16b, v28.16b // < beta + and v26.16b, v26.16b, v27.16b + and v26.16b, v26.16b, v28.16b + + ushll v4.8h, v18.8b, #1 + ushll v6.8h, v19.8b, #1 +.ifc \width, 16 + ushll2 v5.8h, v18.16b, #1 + ushll2 v7.8h, v19.16b, #1 + uaddl2 v21.8h, v16.16b, v19.16b + uaddl2 v23.8h, v17.16b, v18.16b +.endif + uaddl v20.8h, v16.8b, v19.8b + uaddl v22.8h, v17.8b, v18.8b + add v20.8h, v20.8h, v4.8h // mlal? + add v22.8h, v22.8h, v6.8h +.ifc \width, 16 + add v21.8h, v21.8h, v5.8h + add v23.8h, v23.8h, v7.8h +.endif + uqrshrn v24.8b, v20.8h, #2 + uqrshrn v25.8b, v22.8h, #2 +.ifc \width, 16 + uqrshrn2 v24.16b, v21.8h, #2 + uqrshrn2 v25.16b, v23.8h, #2 +.endif + bit v16.16b, v24.16b, v26.16b + bit v17.16b, v25.16b, v26.16b +.endm + +function x264_deblock_v_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, x1, lsl #1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x0], x1 + ld1 {v19.16b}, [x0] + + h264_loop_filter_chroma_intra + + sub x0, x0, x1, lsl #1 + st1 {v16.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_intra_mbaff_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 + + transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra, width=8 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_422_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 
{v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ret +endfunc //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], // int8_t ref[2][X264_SCAN8_LUMA_SIZE],
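All of the new deblocking paths share the per-pixel gate that the cmhi comparisons implement: an edge position is filtered only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, and the strong intra luma path additionally requires |p0-q0| < (alpha>>2)+2 (plus |p2-p0| and |q2-q0| < beta for the three-tap variants). A scalar sketch of the basic decision for one pixel position:

#include <stdlib.h>

/* Decide whether one position on a block edge gets filtered, and whether the
 * strong intra luma filter may be considered for it. */
int deblock_filter_decision(int p1, int p0, int q0, int q1,
                            int alpha, int beta, int *strong)
{
    int filter = abs(p0 - q0) < alpha &&
                 abs(p1 - p0) < beta  &&
                 abs(q1 - q0) < beta;
    if (strong)
        *strong = filter && abs(p0 - q0) < ((alpha >> 2) + 2);
    return filter;
}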
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -1253,6 +1253,34 @@ ret endfunc +function x264_plane_copy_neon, export=1 + add x8, x4, #15 + and x4, x8, #~15 + sub x1, x1, x4 + sub x3, x3, x4 +1: + mov w8, w4 +16: + tst w8, #16 + b.eq 32f + subs w8, w8, #16 + ldr q0, [x2], #16 + str q0, [x0], #16 + b.eq 0f +32: + subs w8, w8, #32 + ldp q0, q1, [x2], #32 + stp q0, q1, [x0], #32 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + + ret +endfunc + function x264_plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 @@ -1363,3 +1391,279 @@ ret endfunc + +.macro integral4h p1, p2 + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v4.8h, v2.8b, v3.8b + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, v5.8h +.endm + +function integral_init4h_neon, export=1 + sub x3, x0, x2 + ld1 {v6.8b,v7.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v5.8h}, [x3], #16 + integral4h v6, v7 + ld1 {v6.8b}, [x1], #8 + ld1 {v5.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral4h v7, v6 + ld1 {v7.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +.macro integral8h p1, p2, s + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + ext v4.8b, \p1\().8b, \p2\().8b, #4 + ext v5.8b, \p1\().8b, \p2\().8b, #5 + ext v6.8b, \p1\().8b, \p2\().8b, #6 + ext v7.8b, \p1\().8b, \p2\().8b, #7 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v2.8h, v2.8b, v3.8b + uaddl v4.8h, v4.8b, v5.8b + uaddl v6.8h, v6.8b, v7.8b + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, \s\().8h +.endm + +function integral_init8h_neon, export=1 + sub x3, x0, x2 + ld1 {v16.8b,v17.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v18.8h}, [x3], #16 + integral8h v16, v17, v18 + ld1 {v16.8b}, [x1], #8 + ld1 {v18.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral8h v17, v16, v18 + ld1 {v17.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +function integral_init4v_neon, export=1 + mov x3, x0 + add x4, x0, x2, lsl #3 + add x8, x0, x2, lsl #4 + sub x2, x2, #8 + ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 + ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 +1: + subs x2, x2, #16 + ld1 {v24.8h,v25.8h}, [x4], #32 + ext v0.16b, v20.16b, v21.16b, #8 + ext v1.16b, v21.16b, v22.16b, #8 + ext v2.16b, v16.16b, v17.16b, #8 + ext v3.16b, v17.16b, v18.16b, #8 + sub v24.8h, v24.8h, v20.8h + sub v25.8h, v25.8h, v21.8h + add v0.8h, v0.8h, v20.8h + add v1.8h, v1.8h, v21.8h + add v2.8h, v2.8h, v16.8h + add v3.8h, v3.8h, v17.8h + st1 {v24.8h}, [x1], #16 + st1 {v25.8h}, [x1], #16 + mov v20.16b, v22.16b + mov v16.16b, v18.16b + sub v0.8h, v2.8h, v0.8h + sub v1.8h, v3.8h, v1.8h + ld1 {v21.8h,v22.8h}, [x3], #32 + ld1 {v17.8h,v18.8h}, [x8], #32 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function integral_init8v_neon, export=1 + add x2, x0, x1, lsl #4 + sub x1, x1, #8 + ands x3, x1, #16 - 1 + b.eq 1f + subs x1, x1, #8 + ld1 {v0.8h}, [x0] + ld1 {v2.8h}, [x2], #16 + sub v4.8h, v2.8h, v0.8h + st1 {v4.8h}, [x0], #16 + b.le 2f +1: + subs x1, x1, #16 + ld1 {v0.8h,v1.8h}, [x0] + 
ld1 {v2.8h,v3.8h}, [x2], #32 + sub v4.8h, v2.8h, v0.8h + sub v5.8h, v3.8h, v1.8h + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function x264_mbtree_propagate_cost_neon, export=1 + ld1r {v5.4s}, [x5] +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 + ld1 {v2.8h}, [x2], #16 + ld1 {v3.8h}, [x3], #16 + ld1 {v4.8h}, [x4], #16 + bic v3.8h, #0xc0, lsl #8 + umin v3.8h, v2.8h, v3.8h + umull v20.4s, v2.4h, v4.4h // propagate_intra + umull2 v21.4s, v2.8h, v4.8h // propagate_intra + usubl v22.4s, v2.4h, v3.4h // propagate_num + usubl2 v23.4s, v2.8h, v3.8h // propagate_num + uxtl v26.4s, v2.4h // propagate_denom + uxtl2 v27.4s, v2.8h // propagate_denom + uxtl v24.4s, v1.4h + uxtl2 v25.4s, v1.8h + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v26.4s, v26.4s + ucvtf v27.4s, v27.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + frecpe v28.4s, v26.4s + frecpe v29.4s, v27.4s + ucvtf v24.4s, v24.4s + ucvtf v25.4s, v25.4s + frecps v30.4s, v28.4s, v26.4s + frecps v31.4s, v29.4s, v27.4s + fmla v24.4s, v20.4s, v5.4s // propagate_amount + fmla v25.4s, v21.4s, v5.4s // propagate_amount + fmul v28.4s, v28.4s, v30.4s + fmul v29.4s, v29.4s, v31.4s + fmul v16.4s, v24.4s, v22.4s + fmul v17.4s, v25.4s, v23.4s + fmul v18.4s, v16.4s, v28.4s + fmul v19.4s, v17.4s, v29.4s + fcvtns v20.4s, v18.4s + fcvtns v21.4s, v19.4s + sqxtn v0.4h, v20.4s + sqxtn2 v0.8h, v21.4s + st1 {v0.8h}, [x0], #16 + b.ge 8b + ret +endfunc + +const pw_0to15, align=5 + .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +endconst + +function x264_mbtree_propagate_list_internal_neon, export=1 + movrel x11, pw_0to15 + dup v31.8h, w4 // bipred_weight + movi v30.8h, #0xc0, lsl #8 + ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y + movi v28.4s, #4//, lsl #16 + movi v27.8h, #31 + movi v26.8h, #32 + dup v24.8h, w5 // mb_y + zip1 v29.8h, v29.8h, v24.8h +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 // propagate_amount + ld1 {v2.8h}, [x2], #16 // lowres_cost + and v2.16b, v2.16b, v30.16b + cmeq v25.8h, v2.8h, v30.8h + umull v16.4s, v1.4h, v31.4h + umull2 v17.4s, v1.8h, v31.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 ) + // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 + ld1 {v4.8h,v5.8h}, [x0], #32 + sshr v6.8h, v4.8h, #5 + sshr v7.8h, v5.8h, #5 + add v6.8h, v6.8h, v29.8h + add v29.8h, v29.8h, v28.8h + add v7.8h, v7.8h, v29.8h + add v29.8h, v29.8h, v28.8h + st1 {v6.8h,v7.8h}, [x3], #32 + and v4.16b, v4.16b, v27.16b + and v5.16b, v5.16b, v27.16b + uzp1 v6.8h, v4.8h, v5.8h // x & 31 + uzp2 v7.8h, v4.8h, v5.8h // y & 31 + sub v4.8h, v26.8h, v6.8h // 32 - (x & 31) + sub v5.8h, v26.8h, v7.8h // 32 - (y & 31) + mul v19.8h, v6.8h, v7.8h // idx3weight = y*x; + mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x); + mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x; + mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ; + umull v6.4s, v19.4h, v25.4h + umull2 v7.4s, v19.8h, v25.8h + umull v4.4s, v18.4h, v25.4h + umull2 v5.4s, v18.8h, v25.8h + umull v2.4s, v17.4h, v25.4h + umull2 v3.4s, v17.8h, v25.8h + umull v0.4s, v16.4h, v25.4h + umull2 v1.4s, v16.8h, v25.8h + rshrn v19.4h, v6.4s, #10 + rshrn2 v19.8h, v7.4s, #10 + rshrn v18.4h, v4.4s, #10 + rshrn2 v18.8h, v5.4s, #10 + rshrn v17.4h, v2.4s, #10 + rshrn2 v17.8h, v3.4s, #10 + rshrn v16.4h, v0.4s, #10 + rshrn2 v16.8h, v1.4s, #10 + zip1 v0.8h, v16.8h, v17.8h + zip2 v1.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip2 v3.8h, v18.8h, v19.8h + st1 {v0.8h,v1.8h}, [x3], #32 + st1 
{v2.8h,v3.8h}, [x3], #32 + b.ge 8b + ret +endfunc + +function x264_memcpy_aligned_neon, export=1 + tst x2, #16 + b.eq 32f + sub x2, x2, #16 + ldr q0, [x1], #16 + str q0, [x0], #16 +32: + tst x2, #32 + b.eq 640f + sub x2, x2, #32 + ldp q0, q1, [x1], #32 + stp q0, q1, [x0], #32 +640: + cbz x2, 1f +64: + subs x2, x2, #64 + ldp q0, q1, [x1, #32] + ldp q2, q3, [x1], #64 + stp q0, q1, [x0, #32] + stp q2, q3, [x0], #64 + b.gt 64b +1: + ret +endfunc + +function x264_memzero_aligned_neon, export=1 + movi v0.16b, #0 + movi v1.16b, #0 +1: + subs x1, x1, #128 + stp q0, q1, [x0, #96] + stp q0, q1, [x0, #64] + stp q0, q1, [x0, #32] + stp q0, q1, [x0], 128 + b.gt 1b + ret +endfunc
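Note: the hunk above adds aarch64 NEON versions of plane_copy, the integral_init helpers, the mbtree propagation kernels, and aligned memcpy/memzero. As a reading aid, here is a minimal scalar sketch of what x264_plane_copy_neon computes; the width round-up and the implied row padding are inferred from the (w+15) & ~15 prologue and are assumptions about the caller, not something stated in the diff.

    #include <stdint.h>
    #include <string.h>

    /* Each of h rows copies w bytes, with w rounded up to a multiple of 16,
     * so the destination and source rows are assumed to be padded enough to
     * tolerate the over-copy. */
    static void plane_copy_ref( uint8_t *dst, intptr_t i_dst,
                                uint8_t *src, intptr_t i_src, int w, int h )
    {
        w = (w + 15) & ~15;                          /* same rounding as the asm prologue */
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            memcpy( dst, src, w );
    }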
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * mc-c.c: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -48,6 +49,8 @@ void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -89,8 +92,14 @@ void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t ); +void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init8v_neon( uint16_t *, intptr_t ); void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); + #if !HIGH_BIT_DEPTH static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) { @@ -132,9 +141,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -142,13 +148,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -168,13 +174,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -199,6 +205,89 @@ int height, int16_t *buf ); #endif // !HIGH_BIT_DEPTH +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) +#define CLIP_ADD2(s,x)\ +do\ +{\ + CLIP_ADD((s)[0], (x)[0]);\ + CLIP_ADD((s)[1], (x)[1]);\ +} while(0) + +void x264_mbtree_propagate_list_internal_neon( int16_t 
(*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int16_t *output, + int bipred_weight, int mb_y, + int len ); + +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs, + int16_t (*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int bipred_weight, int mb_y, + int len, int list ) +{ + int16_t *current = h->scratch_buffer2; + + x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount, + lowres_costs, current, + bipred_weight, mb_y, len ); + + unsigned stride = h->mb.i_mb_stride; + unsigned width = h->mb.i_mb_width; + unsigned height = h->mb.i_mb_height; + + for( unsigned i = 0; i < len; current += 32 ) + { + int end = X264_MIN( i+8, len ); + for( ; i < end; i++, current += 2 ) + { + if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) ) + continue; + + unsigned mbx = current[0]; + unsigned mby = current[1]; + unsigned idx0 = mbx + mby * stride; + unsigned idx2 = idx0 + stride; + + /* Shortcut for the simple/common case of zero MV */ + if( !M32( mvs[i] ) ) + { + CLIP_ADD( ref_costs[idx0], current[16] ); + continue; + } + + if( mbx < width-1 && mby < height-1 ) + { + CLIP_ADD2( ref_costs+idx0, current+16 ); + CLIP_ADD2( ref_costs+idx2, current+32 ); + } + else + { + /* Note: this takes advantage of unsigned representation to + * catch negative mbx/mby. */ + if( mby < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx0+0], current[16] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx0+1], current[17] ); + } + if( mby+1 < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx2+0], current[32] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx2+1], current[33] ); + } + } + } + } +} + +#undef CLIP_ADD +#undef CLIP_ADD2 + void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) { #if !HIGH_BIT_DEPTH @@ -217,6 +306,7 @@ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + pf->plane_copy = x264_plane_copy_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; @@ -245,5 +335,16 @@ pf->get_ref = get_ref_neon; pf->hpel_filter = x264_hpel_filter_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; + + pf->integral_init4h = integral_init4h_neon; + pf->integral_init8h = integral_init8h_neon; + pf->integral_init4v = integral_init4v_neon; + pf->integral_init8v = integral_init8v_neon; + + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon; + + pf->memcpy_aligned = x264_memcpy_aligned_neon; + pf->memzero_aligned = x264_memzero_aligned_neon; #endif // !HIGH_BIT_DEPTH }
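Note: CLIP_ADD/CLIP_ADD2 above implement the saturating accumulation used when the C wrapper scatters the NEON mbtree results back into ref_costs; keeping the bounds checks in C lets the assembly kernel produce only the per-block weights. A plain-function equivalent of the macro, for illustration only:

    #include <stdint.h>

    /* Mirrors CLIP_ADD: accumulate and clamp to the 15-bit cost range. */
    static inline void clip_add( uint16_t *s, int x )
    {
        int v = *s + x;
        *s = v < 0x7fff ? v : 0x7fff;                /* X264_MIN( (s)+(x), (1<<15)-1 ) */
    }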
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc.h
Changed
@@ -1,7 +1,9 @@ /***************************************************************************** * mc.h: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -114,6 +114,7 @@ SAD_FUNC 4, 4 SAD_FUNC 4, 8 +SAD_FUNC 4, 16 SAD_FUNC 8, 4 SAD_FUNC 8, 8 SAD_FUNC 8, 16 @@ -148,7 +149,7 @@ \first v17.8h, v2.8b, v0.8b ld1 {v3.8b}, [x3], x5 ld1 {v1.8b}, [x1], x5 - \first v18.8h, v3.8b, v0.8b + \first v18.8h, v3.8b, v0.8b uabal v16.8h, v1.8b, v5.8b ld1 {v2.8b}, [x2], x5 ld1 {v3.8b}, [x3], x5 @@ -248,6 +249,56 @@ SAD_X_FUNC 4, 16, 16 +function x264_pixel_vsad_neon, export=1 + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 + uabdl v6.8h, v0.8b, v1.8b + uabdl2 v7.8h, v0.16b, v1.16b + b.le 2f +1: + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + uabal v6.8h, v1.8b, v0.8b + uabal2 v7.8h, v1.16b, v0.16b + ld1 {v1.16b}, [x0], x1 + b.lt 2f + uabal v6.8h, v0.8b, v1.8b + uabal2 v7.8h, v0.16b, v1.16b + b.gt 1b +2: + add v5.8h, v6.8h, v7.8h + uaddlv s0, v5.8h + fmov w0, s0 + ret +endfunc + +function x264_pixel_asd8_neon, export=1 + sub w4, w4, #2 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + usubl v16.8h, v0.8b, v1.8b +1: + subs w4, w4, #2 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + add v16.8h, v16.8h, v17.8h + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + add v16.8h, v16.8h, v18.8h + b.gt 1b + usubl v17.8h, v2.8b, v3.8b + add v16.8h, v16.8h, v17.8h + saddlv s0, v16.8h + abs v0.2s, v0.2s + fmov w0, s0 + ret +endfunc + .macro SSD_START_4 ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 @@ -343,12 +394,84 @@ SSD_FUNC 4, 4 SSD_FUNC 4, 8 +SSD_FUNC 4, 16 SSD_FUNC 8, 4 SSD_FUNC 8, 8 SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 + +function x264_pixel_ssd_nv12_core_neon, export=1 + sxtw x8, w4 + add x8, x8, #8 + and x8, x8, #~15 + movi v6.2d, #0 + movi v7.2d, #0 + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +1: + subs w8, w4, #16 + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smull v20.4s, v16.4h, v16.4h + smull v21.4s, v17.4h, v17.4h + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + + b.lt 4f + b.eq 3f +2: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h + + subs w8, w8, #16 + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smlal v20.4s, v16.4h, v16.4h + smlal v21.4s, v17.4h, v17.4h + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + b.lt 4f + + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + b.gt 2b +3: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h +4: + subs w5, w5, #1 + uaddw v6.2d, v6.2d, v20.2s + uaddw v7.2d, v7.2d, v21.2s + add x0, x0, x1 + add x2, x2, x3 + uaddw2 v6.2d, v6.2d, v20.4s + uaddw2 v7.2d, v7.2d, v21.4s + b.gt 1b + + addp v6.2d, v6.2d, v7.2d + st1 {v6.d}[0], [x6] + st1 {v6.d}[1], [x7] + + ret +endfunc + .macro 
pixel_var_8 h function x264_pixel_var_8x\h\()_neon, export=1 ld1 {v16.8b}, [x0], x1 @@ -800,10 +923,65 @@ b x264_satd_8x4v_8x8h_neon endfunc +function x264_pixel_satd_4x16_neon, export=1 + mov x4, x30 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + usubl v19.8h, v6.8b, v7.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v20.8h, v0.8b, v1.8b + usubl v21.8h, v2.8b, v3.8b + usubl v22.8h, v4.8b, v5.8b + usubl v23.8h, v6.8b, v7.8b + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + bl x264_satd_8x4v_8x8h_neon + + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc function x264_pixel_sa8d_8x8_neon, export=1 mov x4, x30 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] @@ -814,20 +992,20 @@ function x264_pixel_sa8d_16x16_neon, export=1 mov x4, x30 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #8 add x2, x2, #8 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h add v0.4s, v30.4s, v31.4s @@ -838,13 +1016,48 @@ ret x4 endfunc -function x264_sa8d_8x8_neon +.macro sa8d_satd_8x8 satd= +function pixel_sa8d_\satd\()8x8_neon load_diff_fly_8x8 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h +.ifc \satd, satd_ + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h + SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h + SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h + + transpose v4.4s, v6.4s, v24.4s, v26.4s + transpose v5.4s, v7.4s, v25.4s, v27.4s + transpose v24.4s, v26.4s, v0.4s, v2.4s + transpose v25.4s, v27.4s, v1.4s, v3.4s + + abs v0.8h, v4.8h + abs v1.8h, v5.8h + abs v2.8h, v6.8h + abs v3.8h, v7.8h + abs v4.8h, v24.8h + abs v5.8h, v25.8h + abs v6.8h, v26.8h + abs v7.8h, v27.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + add v26.8h, v0.8h, v1.8h + add v27.8h, v2.8h, v3.8h +.endif + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h @@ -855,20 +1068,20 @@ transpose v22.8h, v23.8h, v18.8h, 
v19.8h transpose v6.8h, v7.8h, v2.8h, v3.8h - SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h + SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h - SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h + SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h - transpose v20.4s, v22.4s, v28.4s, v0.4s - transpose v21.4s, v23.4s, v29.4s, v1.4s - transpose v16.4s, v18.4s, v24.4s, v26.4s - transpose v17.4s, v19.4s, v25.4s, v27.4s + transpose v20.4s, v22.4s, v2.4s, v0.4s + transpose v21.4s, v23.4s, v3.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v4.4s + transpose v17.4s, v19.4s, v25.4s, v5.4s SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h - SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h - SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h transpose v16.2d, v20.2d, v0.2d, v4.2d transpose v17.2d, v21.2d, v1.2d, v5.2d @@ -894,7 +1107,47 @@ ret endfunc +.endm +sa8d_satd_8x8 +sa8d_satd_8x8 satd_ + +function x264_pixel_sa8d_satd_16x16_neon, export=1 + mov x4, x30 + bl pixel_sa8d_satd_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + uaddlp v28.4s, v26.8h + uaddlp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #8 + add x2, x2, #8 + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + add v0.4s, v30.4s, v31.4s // sa8d + add v1.4s, v28.4s, v29.4s // satd + addv s0, v0.4s + addv s1, v1.4s + urshr v0.4s, v0.4s, #1 + fmov w0, s0 + fmov w1, s1 + add x0, x0, x1, lsl #32 + ret x4 +endfunc .macro HADAMARD_AC w h function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
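Note: the new x264_pixel_sa8d_satd_16x16_neon computes both metrics in one pass and packs them into a single 64-bit return value; the closing "add x0, x0, x1, lsl #32" puts the rounded sa8d sum in the low half and the satd sum in the high half, matching the uint64_t prototype added to pixel.h below. A hypothetical caller (pix1/pix2, the strides and the helper name are placeholders) would unpack it like this:

    #include <stdint.h>

    /* Prototype comes from common/aarch64/pixel.h. */
    static void sa8d_satd_example( uint8_t *pix1, intptr_t i_stride1,
                                   uint8_t *pix2, intptr_t i_stride2,
                                   uint32_t *sa8d, uint32_t *satd )
    {
        uint64_t v = x264_pixel_sa8d_satd_16x16_neon( pix1, i_stride1, pix2, i_stride2 );
        *sa8d = (uint32_t) v;                        /* low 32 bits, already (sum+1)>>1 rounded */
        *satd = (uint32_t)(v >> 32);                 /* high 32 bits */
    }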
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * pixel.h: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,7 @@ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args;\ @@ -47,8 +49,14 @@ DECL_X1( satd, neon ) DECL_X1( ssd, neon ) + +void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); + +int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); + int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); +uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); @@ -66,4 +74,6 @@ int sums[2][4] ); float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); +int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + #endif
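Note: x264_pixel_vsad_neon, declared above, sums absolute differences between each 16-pixel row and the row directly above it. A scalar sketch assuming the same prototype:

    #include <stdint.h>
    #include <stdlib.h>

    static int vsad_ref( uint8_t *src, intptr_t stride, int height )
    {
        int score = 0;
        for( int y = 1; y < height; y++ )            /* every pair of adjacent rows */
            for( int x = 0; x < 16; x++ )
                score += abs( src[y*stride + x] - src[(y-1)*stride + x] );
        return score;
    }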
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -436,14 +436,25 @@ endfunc function x264_predict_8x8c_dc_left_neon, export=1 - sub x2, x0, #1 + ldrb w2, [x0, #0 * FDEC_STRIDE - 1] + ldrb w3, [x0, #1 * FDEC_STRIDE - 1] + ldrb w4, [x0, #2 * FDEC_STRIDE - 1] + ldrb w5, [x0, #3 * FDEC_STRIDE - 1] mov x1, #FDEC_STRIDE - ldcol.8 v0, x2, x1 - uaddlp v0.4h, v0.8b - addp v0.4h, v0.4h, v0.4h + add w2, w2, w3 + add w3, w4, w5 + ldrb w6, [x0, #4 * FDEC_STRIDE - 1] + ldrb w7, [x0, #5 * FDEC_STRIDE - 1] + ldrb w8, [x0, #6 * FDEC_STRIDE - 1] + ldrb w9, [x0, #7 * FDEC_STRIDE - 1] + add w6, w6, w7 + add w7, w8, w9 + add w2, w2, w3 + add w6, w6, w7 + dup v0.8h, w2 + dup v1.8h, w6 rshrn v0.8b, v0.8h, #2 - dup v1.8b, v0.b[1] - dup v0.8b, v0.b[0] + rshrn v1.8b, v1.8h, #2 b pred8x8c_dc_end endfunc @@ -546,6 +557,223 @@ endfunc +.macro loadsum4 wd, t1, t2, t3, x, idx + ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1] + ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1] + ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1] + ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1] + add \wd, \wd, \t1 + add \t1, \t2, \t3 + add \wd, \wd, \t1 +.endm + +function x264_predict_8x16c_h_neon, export=1 + sub x2, x0, #1 + add x3, x0, #FDEC_STRIDE - 1 + mov x7, #2 * FDEC_STRIDE + add x1, x0, #FDEC_STRIDE +.rept 4 + ld1r {v0.8b}, [x2], x7 + ld1r {v1.8b}, [x3], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x3], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x1], x7 + st1 {v2.8b}, [x0], x7 + st1 {v3.8b}, [x1], x7 +.endr + ret +endfunc + +function x264_predict_8x16c_v_neon, export=1 + sub x1, x0, #FDEC_STRIDE + mov x2, #2 * FDEC_STRIDE + ld1 {v0.8b}, [x1], x2 +.rept 8 + st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, [x1], x2 +.endr + ret +endfunc + +function x264_predict_8x16c_p_neon, export=1 + movrel x4, p16weight + ld1 {v17.8h}, [x4] + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #4 + sub x3, x3, #1 + + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 + ldcol.8 v1, x3, x1 + add x3, x3, x1 + ldcol.8 v3, x3, x1 + ext v4.8b, v2.8b, v2.8b, #3 + ext v5.8b, v3.8b, v3.8b, #7 + rev32 v0.8b, v0.8b + rev64 v1.8b, v1.8b + + uaddl v4.8h, v5.8b, v4.8b // a * 1/16 + + usubl v2.8h, v2.8b, v0.8b + mul v2.8h, v2.8h, v17.8h + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s // H + + usubl v3.8h, v3.8b, v1.8b + mul v3.8h, v3.8h, v17.8h + saddlp v3.4s, v3.8h + addp v3.4s, v3.4s, v3.4s + addp v3.4s, v3.4s, v3.4s // V + + ext v17.16b, v17.16b, v17.16b, #14 + + shl v4.4h, v4.4h, #4 // a + shl v6.2s, v2.2s, #4 // 16 * H + shl v7.2s, v3.2s, #2 // 4 * V + add v2.2s, v2.2s, v6.2s // 17 * H + add v3.2s, v3.2s, v7.2s // 5 * V + rshrn v2.4h, v2.4s, #5 // b + rshrn v3.4h, v3.4s, #6 // c + + mov v17.h[0], wzr + + sub v4.4h, v4.4h, v2.4h // a - b + shl v6.4h, v2.4h, #1 // 2 * b + add v4.4h, v4.4h, v3.4h // a - b + c + shl v7.4h, v3.4h, #3 // 8 * c + sub v4.4h, v4.4h, v6.4h // a - 3b + c + sub v4.4h, v4.4h, v7.4h // a - 3b - 7c + + mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v4.h[0] // i00 + dup v2.8h, v3.h[0] // c + add v1.8h, v1.8h, v0.8h // pix + {0..7}*b + mov x3, #16 +1: + subs x3, x3, #2 + sqrshrun v4.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + sqrshrun v5.8b, v1.8h, #5 + st1 {v4.8b}, [x0], x1 + add v1.8h, v1.8h, v2.8h + st1 {v5.8b}, [x0], x1 + 
b.ne 1b + ret +endfunc + +function x264_predict_8x16c_dc_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v6.8b}, [x3] + loadsum4 w2, w3, w4, w5, x0, 0 + uaddlp v6.4h, v6.8b + dup v22.8h, w2 // s2 + loadsum4 w6, w7, w8, w9, x0, 4 + addp v6.4h, v6.4h, v6.4h // s0, s1 + dup v23.8h, w6 // s3 + loadsum4 w2, w3, w4, w5, x0, 8 + dup v20.8h, v6.h[0] // s0 + dup v24.8h, w2 // s4 + loadsum4 w6, w7, w8, w9, x0, 12 + dup v21.8h, v6.h[1] // s1 + dup v25.8h, w6 // s5 + + ext v16.16b, v20.16b, v21.16b, #8 + ext v17.16b, v22.16b, v21.16b, #8 + ext v1.16b, v23.16b, v21.16b, #8 + ext v2.16b, v24.16b, v21.16b, #8 + ext v3.16b, v25.16b, v21.16b, #8 + + add v0.8h, v16.8h, v17.8h + add v1.8h, v1.8h, v23.8h + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v25.8h + + rshrn v0.8b, v0.8h, #3 + rshrn v1.8b, v1.8h, #3 + rshrn v2.8b, v2.8h, #3 + rshrn v3.8b, v3.8h, #3 +.irp idx, 0, 1, 2, 3 +.rept 4 + st1 {v\idx\().8b}, [x0], x1 +.endr +.endr + ret +endfunc + +function x264_predict_8x16c_dc_left_neon, export=1 + mov x1, #FDEC_STRIDE + ldrb w2, [x0, # 0 * FDEC_STRIDE - 1] + ldrb w3, [x0, # 1 * FDEC_STRIDE - 1] + ldrb w4, [x0, # 2 * FDEC_STRIDE - 1] + ldrb w5, [x0, # 3 * FDEC_STRIDE - 1] + add w2, w2, w3 + + ldrb w6, [x0, # 4 * FDEC_STRIDE - 1] + add w4, w4, w5 + ldrb w7, [x0, # 5 * FDEC_STRIDE - 1] + add w2, w2, w4 + ldrb w8, [x0, # 6 * FDEC_STRIDE - 1] + ldrb w9, [x0, # 7 * FDEC_STRIDE - 1] + dup v0.8h, w2 + add w6, w6, w7 + rshrn v0.8b, v0.8h, #2 + add w8, w8, w9 + + ldrb w10, [x0, # 8 * FDEC_STRIDE - 1] + ldrb w11, [x0, # 9 * FDEC_STRIDE - 1] + add w6, w6, w8 + ldrb w12, [x0, #10 * FDEC_STRIDE - 1] + ldrb w13, [x0, #11 * FDEC_STRIDE - 1] + dup v1.8h, w6 + add w10, w10, w11 + rshrn v1.8b, v1.8h, #2 + add w12, w12, w13 + + ldrb w2, [x0, #12 * FDEC_STRIDE - 1] + ldrb w3, [x0, #13 * FDEC_STRIDE - 1] + add w10, w10, w12 + ldrb w4, [x0, #14 * FDEC_STRIDE - 1] + ldrb w5, [x0, #15 * FDEC_STRIDE - 1] + dup v2.8h, w10 + add w2, w2, w3 + rshrn v2.8b, v2.8h, #2 + add w4, w4, w5 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + add w2, w2, w4 + st1 {v0.8b}, [x0], x1 + dup v3.8h, w2 + st1 {v0.8b}, [x0], x1 + rshrn v3.8b, v3.8h, #2 + +.irp idx, 1, 2, 3 +.rept 4 + st1 {v\idx\().8b}, [x0], x1 +.endr +.endr + ret +endfunc + +function x264_predict_8x16c_dc_top_neon, export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.8b}, [x2] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v4.8b, v0.8h, #2 + dup v0.8b, v4.b[0] + dup v1.8b, v4.b[1] + ext v0.8b, v0.8b, v1.8b, #4 +.rept 16 + st1 {v0.8b}, [x0], x1 +.endr + ret +endfunc + + function x264_predict_16x16_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE @@ -603,7 +831,7 @@ .rept 16 st1 {v0.16b}, [x0], x7 .endr - ret + ret endfunc function x264_predict_16x16_p_neon, export=1
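Note: the block above adds the 8x16 chroma intra predictors. For the DC_LEFT variant each group of four left-neighbour samples is averaged with rounding and fills the matching four rows of the 8-wide block. A scalar sketch, assuming x264's FDEC_STRIDE of 32 and its decoded-macroblock buffer layout:

    #include <stdint.h>
    #include <string.h>

    #define FDEC_STRIDE 32                           /* x264's decoded-MB stride (assumption restated) */

    static void predict_8x16c_dc_left_ref( uint8_t *src )
    {
        for( int band = 0; band < 4; band++ )
        {
            int dc = 2;                              /* rounding term of (sum + 2) >> 2 */
            for( int y = 0; y < 4; y++ )
                dc += src[(band*4 + y) * FDEC_STRIDE - 1];
            dc >>= 2;
            for( int y = 0; y < 4; y++ )
                memset( &src[(band*4 + y) * FDEC_STRIDE], dc, 8 );
        }
    }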
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.c: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +36,10 @@ void x264_predict_8x8c_dc_left_neon( uint8_t *src ); void x264_predict_8x8c_p_neon( uint8_t *src ); +void x264_predict_8x16c_dc_left_neon( uint8_t *src ); +void x264_predict_8x16c_dc_top_neon( uint8_t *src ); +void x264_predict_8x16c_p_neon( uint8_t *src ); + void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); @@ -80,6 +85,22 @@ #endif // !HIGH_BIT_DEPTH } + +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_neon; + pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_neon; + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon; + pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_neon; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon; + pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon; +#endif // !HIGH_BIT_DEPTH +} + void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { if (!(cpu&X264_CPU_NEON))
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.h: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,9 @@ void x264_predict_8x8c_dc_neon( uint8_t *src ); void x264_predict_8x8c_h_neon( uint8_t *src ); void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x16c_v_neon( uint8_t *src ); +void x264_predict_8x16c_h_neon( uint8_t *src ); +void x264_predict_8x16c_dc_neon( uint8_t *src ); void x264_predict_16x16_v_neon( uint8_t *src ); void x264_predict_16x16_h_neon( uint8_t *src ); void x264_predict_16x16_dc_neon( uint8_t *src ); @@ -47,6 +51,7 @@ void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ); void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ); #endif /* X264_AARCH64_PREDICT_H */
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -300,6 +301,118 @@ ret endfunc +.macro decimate_score_1x size +function x264_decimate_score\size\()_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0] + movrel x5, X(x264_decimate_table4) + movi v3.16b, #0x01 + sqxtn v0.8b, v0.8h + sqxtn2 v0.16b, v1.8h + abs v2.16b, v0.16b + cmeq v1.16b, v0.16b, #0 + cmhi v2.16b, v2.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v2.8b, v2.8h, #4 + fmov x2, d2 + fmov x1, d1 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f +.ifc \size, 15 + lsr x1, x1, #1 +.endif + rbit x1, x1 +1: + clz x3, x1 + lsr x6, x3, #2 + lsl x1, x1, x3 + ldrb w7, [x5, x6] + cbz x1, 2f + lsl x1, x1, #4 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc +.endm + +decimate_score_1x 15 +decimate_score_1x 16 + +const mask64, align=6 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 +endconst + +function x264_decimate_score64_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0], #32 + ld1 {v2.8h,v3.8h}, [x0], #32 + ld1 {v4.8h,v5.8h}, [x0], #32 + ld1 {v6.8h,v7.8h}, [x0] + movrel x6, mask64 + movi v31.16b, #0x01 + sqxtn v16.8b, v1.8h + sqxtn2 v16.16b, v0.8h + sqxtn v17.8b, v3.8h + sqxtn2 v17.16b, v2.8h + sqxtn v18.8b, v5.8h + sqxtn2 v18.16b, v4.8h + sqxtn v19.8b, v7.8h + sqxtn2 v19.16b, v6.8h + abs v4.16b, v16.16b + abs v5.16b, v17.16b + abs v6.16b, v18.16b + abs v7.16b, v19.16b + ld1 {v30.16b}, [x6] + cmeq v0.16b, v16.16b, #0 + cmeq v1.16b, v17.16b, #0 + cmeq v2.16b, v18.16b, #0 + cmeq v3.16b, v19.16b, #0 + umax v4.16b, v4.16b, v5.16b + umax v6.16b, v6.16b, v7.16b + and v0.16b, v0.16b, v30.16b + and v1.16b, v1.16b, v30.16b + and v2.16b, v2.16b, v30.16b + and v3.16b, v3.16b, v30.16b + umax v4.16b, v4.16b, v6.16b + addp v0.16b, v1.16b, v0.16b + addp v2.16b, v3.16b, v2.16b + cmhi v4.16b, v4.16b, v31.16b + addp v0.16b, v2.16b, v0.16b + shrn v4.8b, v4.8h, #4 + addp v0.16b, v0.16b, v0.16b + fmov x2, d4 + fmov x1, d0 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f + movrel x5, X(x264_decimate_table8) +1: + clz x3, x1 + lsl x1, x1, x3 + ldrb w7, [x5, x3] + cbz x1, 2f + lsl x1, x1, #1 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc + // int coeff_last( int16_t *l ) function x264_coeff_last4_aarch64, export=1 ldr x2, [x0] @@ -384,3 +497,105 @@ sub w0, w3, w2 ret endfunc + +.macro coeff_level_run_start size + add x6, x1, #23 // runlevel->mask + mov w7, #0 + mov w8, #0 + mov w9, #1 + and x6, x6, #~15 + mov w4, #\size - 1 +.endm + +.macro coeff_level_run shift + clz x3, x2 + subs w4, w4, w3, lsr #\shift + str w4, [x1], #4 +1: + ldrh w5, [x0, x4, lsl #1] + strh w5, [x6], #2 + add w7, w7, #1 + lsl w10, w9, w4 + orr w8, w8, w10 + b.le 2f + add w3, w3, #1 << \shift + sub w4, w4, #1 + and x3, x3, #~((1 << \shift) - 1) + lsl x2, x2, x3 + clz x3, x2 + subs w4, w4, w3, lsr #\shift + b.ge 1b +2: + str w8, [x1] + mov w0, w7 +.endm + +function x264_coeff_level_run4_aarch64, export=1 + ldr x2, [x0] + + coeff_level_run_start 4 + + coeff_level_run 4 + + 
ret +endfunc + +.macro X264_COEFF_LEVEL_RUN size +function x264_coeff_level_run\size\()_neon, export=1 +.if \size == 15 + sub x0, x0, #2 +.endif +.if \size < 15 + .equ shiftw, 3 + ld1 {v0.8h}, [x0] + uqxtn v0.8b, v0.8h + cmtst v0.8b, v0.8b, v0.8b +.else + .equ shiftw, 2 + ld1 {v0.8h,v1.8h}, [x0] + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 +.endif + fmov x2, d0 +.if \size == 15 + add x0, x0, #2 +.endif + + coeff_level_run_start \size + + coeff_level_run shiftw + + ret +endfunc +.endm + +X264_COEFF_LEVEL_RUN 8 +X264_COEFF_LEVEL_RUN 15 +X264_COEFF_LEVEL_RUN 16 + +function x264_denoise_dct_neon, export=1 +1: subs w3, w3, #16 + ld1 {v0.8h,v1.8h}, [x0] + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] + abs v16.8h, v0.8h + abs v17.8h, v1.8h + ld1 {v2.8h,v3.8h}, [x2], #32 + cmlt v18.8h, v0.8h, #0 + cmlt v19.8h, v1.8h, #0 + uaddw v4.4s, v4.4s, v16.4h + uaddw2 v5.4s, v5.4s, v16.8h + uqsub v20.8h, v16.8h, v2.8h + uqsub v21.8h, v17.8h, v3.8h + uaddw v6.4s, v6.4s, v17.4h + uaddw2 v7.4s, v7.4s, v17.8h + neg v22.8h, v20.8h + neg v23.8h, v21.8h + bsl v18.16b, v22.16b, v20.16b + bsl v19.16b, v23.16b, v21.16b + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64 + st1 {v18.8h,v19.8h}, [x0], #32 + b.gt 1b + ret +endfunc
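Note: the decimate_score kernels above classify each coefficient as zero, +/-1 or larger and then walk the zero-run lengths through the decimate table. A simplified scalar sketch of the semantics (run_table stands for x264_decimate_table4 or x264_decimate_table8):

    #include <stdint.h>

    static int decimate_score_ref( const int16_t *dct, int n, const uint8_t *run_table )
    {
        int idx = n - 1, score = 0;
        while( idx >= 0 && dct[idx] == 0 )
            idx--;                                   /* skip trailing zeros */
        while( idx >= 0 )
        {
            if( dct[idx] < -1 || dct[idx] > 1 )
                return 9;                            /* any |coef| > 1 forces the maximum score */
            idx--;
            int run = 0;
            while( idx >= 0 && dct[idx] == 0 )
            {
                idx--;
                run++;
            }
            score += run_table[run];                 /* cost of the zero-run before this coefficient */
        }
        return score;
    }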
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,10 +39,21 @@ void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +int x264_decimate_score15_neon( int16_t * ); +int x264_decimate_score16_neon( int16_t * ); +int x264_decimate_score64_neon( int16_t * ); + int x264_coeff_last4_aarch64( int16_t * ); int x264_coeff_last8_aarch64( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * ); +int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * ); + +void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); + #endif
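Note: x264_denoise_dct_neon, declared above, accumulates each coefficient's magnitude into sum[], subtracts offset[] with saturation at zero, and restores the original sign (the uqsub/bsl sequence in the assembly). A scalar sketch, assuming the 8-bit-depth typedefs where dctcoef is int16_t and udctcoef is uint16_t:

    #include <stdint.h>
    #include <stdlib.h>

    static void denoise_dct_ref( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
    {
        for( int i = 0; i < size; i++ )
        {
            int v = abs( dct[i] );
            sum[i] += v;                             /* running magnitude statistics */
            v -= offset[i];
            if( v < 0 )
                v = 0;                               /* saturating subtract, like uqsub */
            dct[i] = dct[i] < 0 ? -v : v;            /* reapply the original sign */
        }
    }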
x264-snapshot-20141218-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/asm.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: arm utility macros ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * David Conrad <lessen42@gmail.com>
x264-snapshot-20141218-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu-a.S: arm cpu detection ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * dct-a.S: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: arm deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -1566,6 +1566,30 @@ pop {r4-r7, pc} endfunc +function x264_plane_copy_swap_neon + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + add lr, r4, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr, lsl #1 +1: + vld1.8 {q0, q1}, [r2]! + subs lr, lr, #16 + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vst1.8 {q0, q1}, [r0]! + bgt 1b + + subs r5, r5, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r4 + bgt 1b + + pop {r4-r5, pc} +endfunc + function x264_store_interleave_chroma_neon push {lr} ldr lr, [sp, #4]
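Note: x264_plane_copy_swap_neon above copies an interleaved chroma plane while exchanging the two bytes of every pair (the vrev16.8 step), which is what converting between U,V and V,U ordering needs. A scalar sketch; here w counts byte pairs and is rounded up to a multiple of 16 by the NEON code, so padded rows are assumed as with the other plane_copy variants:

    #include <stdint.h>

    static void plane_copy_swap_ref( uint8_t *dst, intptr_t i_dst,
                                     uint8_t *src, intptr_t i_src, int w, int h )
    {
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < 2*w; x += 2 )
            {
                dst[x]   = src[x+1];                 /* swap the two bytes of each pair */
                dst[x+1] = src[x];
            }
    }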
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * @@ -57,6 +57,8 @@ void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -136,9 +138,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -146,13 +145,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -172,13 +171,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -243,6 +242,7 @@ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + pf->plane_copy_swap = x264_plane_copy_swap_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com>
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.c: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com> @@ -54,6 +54,8 @@ void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); + /**************************************************************************** * x264_nal_encode: ****************************************************************************/ @@ -142,4 +144,8 @@ } #endif #endif +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + pf->nal_escape = x264_nal_escape_neon; +#endif }
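Note: the NEON nal_escape hooked up above applies the usual H.264 emulation-prevention rule inside the payload. A simplified scalar sketch of that rule (x264's own C fallback is structured differently but produces the same output):

    #include <stdint.h>

    static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
    {
        while( src < end )
        {
            if( src + 2 < end && !src[0] && !src[1] && src[2] <= 3 )
            {
                *dst++ = 0;
                *dst++ = 0;
                *dst++ = 3;                          /* emulation-prevention byte */
                src += 2;
            }
            else
                *dst++ = *src++;
        }
        return dst;                                  /* new end of the escaped payload */
    }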
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
x264-snapshot-20141218-2245.tar.bz2/common/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
x264-snapshot-20141218-2245.tar.bz2/common/cabac.h -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.h: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -72,6 +72,10 @@ #define x264_cabac_encode_decision x264_cabac_encode_decision_asm #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm +#elif defined(ARCH_AARCH64) +#define x264_cabac_encode_decision x264_cabac_encode_decision_asm +#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm +#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm #else #define x264_cabac_encode_decision x264_cabac_encode_decision_c #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
x264-snapshot-20141218-2245.tar.bz2/common/common.c -> x264-snapshot-20150804-2245.tar.bz2/common/common.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.c: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -579,6 +579,7 @@ { char *name_buf = NULL; int b_error = 0; + int errortype = X264_PARAM_BAD_VALUE; int name_was_bool; int value_was_null = !value; int i; @@ -595,6 +596,8 @@ { char *c; name_buf = strdup(name); + if( !name_buf ) + return X264_PARAM_BAD_NAME; while( (c = strchr( name_buf, '_' )) ) *c = '-'; name = name_buf; @@ -617,20 +620,23 @@ !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0; if( b_error ) { - char *buf = strdup(value); - char *tok, UNUSED *saveptr=NULL, *init; - b_error = 0; - p->cpu = 0; - for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + char *buf = strdup( value ); + if( buf ) { - for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); - p->cpu |= x264_cpu_names[i].flags; - if( !x264_cpu_names[i].flags ) - b_error = 1; + char *tok, UNUSED *saveptr=NULL, *init; + b_error = 0; + p->cpu = 0; + for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + { + for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); + p->cpu |= x264_cpu_names[i].flags; + if( !x264_cpu_names[i].flags ) + b_error = 1; + } + free( buf ); + if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) + p->cpu |= X264_CPU_SSE2_IS_FAST; } - free( buf ); - if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) - p->cpu |= X264_CPU_SSE2_IS_FAST; } } OPT("threads") @@ -1049,7 +1055,10 @@ OPT("opencl-device") p->i_opencl_device = atoi( value ); else - return X264_PARAM_BAD_NAME; + { + b_error = 1; + errortype = X264_PARAM_BAD_NAME; + } #undef OPT #undef OPT2 #undef atobool @@ -1060,7 +1069,7 @@ free( name_buf ); b_error |= value_was_null && !name_was_bool; - return b_error ? X264_PARAM_BAD_VALUE : 0; + return b_error ? errortype : 0; } /**************************************************************************** @@ -1133,6 +1142,7 @@ [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_NV21] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, @@ -1265,29 +1275,36 @@ char *x264_slurp_file( const char *filename ) { int b_error = 0; - size_t i_size; + int64_t i_size; char *buf; FILE *fh = x264_fopen( filename, "rb" ); if( !fh ) return NULL; + b_error |= fseek( fh, 0, SEEK_END ) < 0; b_error |= ( i_size = ftell( fh ) ) <= 0; + if( WORD_SIZE == 4 ) + b_error |= i_size > INT32_MAX; b_error |= fseek( fh, 0, SEEK_SET ) < 0; if( b_error ) goto error; + buf = x264_malloc( i_size+2 ); if( !buf ) goto error; + b_error |= fread( buf, 1, i_size, fh ) != i_size; - if( buf[i_size-1] != '\n' ) - buf[i_size++] = '\n'; - buf[i_size] = 0; fclose( fh ); if( b_error ) { x264_free( buf ); return NULL; } + + if( buf[i_size-1] != '\n' ) + buf[i_size++] = '\n'; + buf[i_size] = '\0'; + return buf; error: fclose( fh );
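Note: the error-path rework above routes unknown option names through the shared cleanup (so the translated name buffer is always freed) while preserving the distinction between the two public return codes. A hypothetical caller (the set_option helper and its arguments are placeholders) can report the failure mode accordingly:

    #include <stdio.h>
    #include <x264.h>

    static int set_option( x264_param_t *param, const char *name, const char *value )
    {
        int ret = x264_param_parse( param, name, value );
        if( ret == X264_PARAM_BAD_NAME )
            fprintf( stderr, "unknown option: %s\n", name );
        else if( ret == X264_PARAM_BAD_VALUE )
            fprintf( stderr, "bad value for %s: %s\n", name, value );
        return ret;
    }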
x264-snapshot-20141218-2245.tar.bz2/common/common.h -> x264-snapshot-20150804-2245.tar.bz2/common/common.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.h: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
x264-snapshot-20141218-2245.tar.bz2/common/cpu.c -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.c: cpu detection ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -67,8 +67,8 @@ {"AVX", AVX}, {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, - {"AVX2", AVX|X264_CPU_AVX2}, {"FMA3", AVX|X264_CPU_FMA3}, + {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, #undef AVX #undef SSE2 #undef MMX2 @@ -92,6 +92,8 @@ #elif ARCH_AARCH64 {"ARMv8", X264_CPU_ARMV8}, {"NEON", X264_CPU_NEON}, +#elif ARCH_MIPS + {"MSA", X264_CPU_MSA}, #endif {"", 0}, }; @@ -419,6 +421,17 @@ return X264_CPU_ARMV8 | X264_CPU_NEON; } +#elif ARCH_MIPS + +uint32_t x264_cpu_detect( void ) +{ + uint32_t flags = 0; +#if HAVE_MSA + flags |= X264_CPU_MSA; +#endif + return flags; +} + #else uint32_t x264_cpu_detect( void )
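Note: reordering the cpu-name table above makes the "avx2" name imply FMA3 as well, so string-based CPU selection picks FMA3 up automatically. A hypothetical check of that behaviour (the helper name is a placeholder; it relies on the name-list parsing path shown in the common.c hunk):

    #include <assert.h>
    #include <x264.h>

    static void check_avx2_implies_fma3( void )
    {
        x264_param_t p;
        x264_param_default( &p );
        x264_param_parse( &p, "cpu", "avx2" );
        assert( p.cpu & X264_CPU_FMA3 );             /* now implied by the AVX2 table entry */
    }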
x264-snapshot-20141218-2245.tar.bz2/common/cpu.h -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.h: cpu detection ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -45,7 +45,6 @@ #define x264_emms() #endif #define x264_sfence x264_cpu_sfence -void x264_safe_intel_cpu_indicator_init( void ); /* kludge: * gcc can't give variables any greater alignment than the stack frame has.
x264-snapshot-20141218-2245.tar.bz2/common/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 # include "aarch64/dct.h" #endif +#if ARCH_MIPS +# include "mips/dct.h" +#endif /* the inverse of the scaling factors introduced by 8x8 fdct */ /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ @@ -747,8 +750,32 @@ dctf->add8x8_idct8 = x264_add8x8_idct8_neon; dctf->add16x16_idct8= x264_add16x16_idct8_neon; +#if ARCH_AARCH64 + dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; +#endif + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_msa; + dctf->sub8x8_dct = x264_sub8x8_dct_msa; + dctf->sub16x16_dct = x264_sub16x16_dct_msa; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; + dctf->dct4x4dc = x264_dct4x4dc_msa; + dctf->idct4x4dc = x264_idct4x4dc_msa; + dctf->add4x4_idct = x264_add4x4_idct_msa; + dctf->add8x8_idct = x264_add8x8_idct_msa; + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; + dctf->add16x16_idct = x264_add16x16_idct_msa; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; + dctf->add8x8_idct8 = x264_add8x8_idct8_msa; + dctf->add16x16_idct8 = x264_add16x16_idct8_msa; } #endif + #endif // HIGH_BIT_DEPTH } @@ -1004,7 +1031,20 @@ #endif #if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) - pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; +#if ARCH_AARCH64 + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; + pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; + pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; + pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; + pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; + pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; + pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; +#endif // ARCH_AARCH64 + } #endif // HAVE_ARMV6 || ARCH_AARCH64 #endif // HIGH_BIT_DEPTH @@ -1047,4 +1087,21 @@ } #endif // HIGH_BIT_DEPTH #endif +#if !HIGH_BIT_DEPTH +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; + } +#endif // ARCH_AARCH64 +#endif // !HIGH_BIT_DEPTH +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; + } +#endif +#endif }
View file
x264-snapshot-20141218-2245.tar.bz2/common/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: transform and zigzag ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/deblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: deblocking ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -612,8 +612,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( intra_cur ) { - memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( &bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, @@ -737,6 +739,32 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); +#if ARCH_AARCH64 +void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#endif +#endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA +void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, + int bframe ); +#endif #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -835,18 +863,43 @@ { pf->deblock_luma[1] = x264_deblock_v_luma_altivec; pf->deblock_luma[0] = x264_deblock_h_luma_altivec; - } + } #endif // HAVE_ALTIVEC #if HAVE_ARMV6 || ARCH_AARCH64 - if( cpu&X264_CPU_NEON ) - { + if( cpu&X264_CPU_NEON ) + { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; +#if ARCH_AARCH64 + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon; + 
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; +#endif pf->deblock_strength = x264_deblock_strength_neon; - } + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_msa; + pf->deblock_luma[0] = x264_deblock_h_luma_msa; + pf->deblock_chroma[1] = x264_deblock_v_chroma_msa; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa; + pf->deblock_strength = x264_deblock_strength_msa; + } #endif #endif // !HIGH_BIT_DEPTH
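The memset-to-M32/M64 change in the hunk above relies on bs[i] being a contiguous uint8_t[8][4] array: a 4-byte store of 0x03030303 at row 1 plus an 8-byte store of 0x0303030303030303 at rows 2 and 3 writes exactly the same twelve bytes as memset( &bs[i][1], 3, 3*4 ). In x264, M32/M64 are fixed-width store macros; the sketch below uses memcpy-based stand-ins (store32/store64 are illustrative names) to check the equivalence.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Stand-ins for x264's M32()/M64() store macros (illustrative only). */
static void store32( uint8_t *p, uint32_t v ) { memcpy( p, &v, 4 ); }
static void store64( uint8_t *p, uint64_t v ) { memcpy( p, &v, 8 ); }

int main( void )
{
    uint8_t a[2][8][4] = {{{0}}}, b[2][8][4] = {{{0}}};

    /* old code path */
    memset( &a[0][1], 3, 3*4*sizeof(uint8_t) );
    memset( &a[1][1], 3, 3*4*sizeof(uint8_t) );

    /* new code path: one 4-byte and one 8-byte store per direction */
    store32( b[0][1], 0x03030303u );
    store64( b[0][2], 0x0303030303030303ULL );
    store32( b[1][1], 0x03030303u );
    store64( b[1][2], 0x0303030303030303ULL );

    printf( "equal: %d\n", !memcmp( a, b, sizeof(a) ) );  /* prints 1 */
    return 0;
}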
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.c -> x264-snapshot-20150804-2245.tar.bz2/common/frame.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.c: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -47,6 +47,7 @@ switch( external_csp & X264_CSP_MASK ) { case X264_CSP_NV12: + case X264_CSP_NV21: case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; @@ -77,7 +78,7 @@ #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 ) align = 64; - else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; #endif #if ARCH_PPC @@ -387,7 +388,15 @@ return -1; } - dst->i_type = src->i_type; + if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME ) + { + x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input ); + dst->i_forced_type = X264_TYPE_AUTO; + } + else + dst->i_forced_type = src->i_type; + + dst->i_type = dst->i_forced_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = dst->i_reordered_pts = src->i_pts; dst->param = src->param; @@ -435,6 +444,12 @@ h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } + else if( i_csp == X264_CSP_NV21 ) + { + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); + h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], + stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift ); + } else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
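NV21 uses the same layout as NV12 except that the interleaved chroma plane stores V,U pairs instead of U,V, so the new frame.c path only has to byte-swap each pair while copying; that is what the plane_copy_swap hook added in the mc.c hunk further down does. The call passes i_width>>1 because the width argument counts chroma sample pairs, not bytes. A scalar sketch of the swap, mirroring x264_plane_copy_swap_c with hypothetical buffer names:

#include <stdint.h>
#include <stdio.h>

/* Swap interleaved V,U (NV21) chroma into U,V (NV12) order while copying.
 * 'w' counts sample pairs, matching the i_width>>1 in the call above. */
static void plane_copy_swap( uint8_t *dst, int i_dst,
                             const uint8_t *src, int i_src, int w, int h )
{
    for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
        for( int x = 0; x < 2*w; x += 2 )
        {
            dst[x]   = src[x+1];
            dst[x+1] = src[x];
        }
}

int main( void )
{
    const uint8_t nv21[4] = { 'V', 'U', 'v', 'u' };  /* one row, two pairs */
    uint8_t nv12[4];
    plane_copy_swap( nv12, 4, nv21, 4, 2, 1 );
    printf( "%c%c%c%c\n", nv12[0], nv12[1], nv12[2], nv12[3] );  /* UVuv */
    return 0;
}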
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.h -> x264-snapshot-20150804-2245.tar.bz2/common/frame.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.h: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -39,6 +39,7 @@ int i_poc; int i_delta_poc[2]; int i_type; + int i_forced_type; int i_qpplus1; int64_t i_pts; int64_t i_dts;
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -1158,7 +1158,7 @@ { // Looking at the bottom field so always take the bottom macroblock of the pair. h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; - h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; + h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]]; h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]]; CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] ); @@ -1436,8 +1436,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( IS_INTRA( h->mb.i_type ) ) { - memset( bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; return; } @@ -1450,7 +1452,9 @@ M32( bs[0][0] ) = 0x02020202; M32( bs[0][2] ) = 0x02020202; M32( bs[0][4] ) = 0x02020202; - memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][2] ) = 0x0202020202020202ULL; + M32( bs[1][4] ) = 0x02020202; return; } }
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock common functions ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 #include "aarch64/mc.h" #endif +#if ARCH_MIPS +#include "mips/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -189,8 +192,8 @@ } } -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; +const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2}; static void mc_luma( pixel *dst, intptr_t i_dst_stride, pixel *src[4], intptr_t i_src_stride, @@ -199,11 +202,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -222,11 +225,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, *i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -299,6 +302,17 @@ } } +void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) +{ + for( int y=0; y<h; y++, dst+=i_dst, src+=i_src ) + for( int x=0; x<2*w; x+=2 ) + { + dst[x] = src[x+1]; + dst[x+1] = src[x]; + } +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ) @@ -612,6 +626,7 @@ pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; @@ -647,6 +662,10 @@ #if ARCH_AARCH64 x264_mc_init_aarch64( cpu, pf ); #endif +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + x264_mc_init_mips( cpu, pf ); +#endif if( cpu_independent ) {
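The hpel_ref0/hpel_ref1 tables made global above drive quarter-pel interpolation in mc_luma: qpel_idx = ((mvy&3)<<2) + (mvx&3) selects which pre-filtered half-pel planes to read (plane 0 is commonly described as full-pel and 1/2/3 as the three half-pel planes, an interpretation not stated in this hunk), and whenever qpel_idx & 5 is non-zero the two selected planes are averaged. A small sketch that just enumerates the table lookups:

#include <stdint.h>
#include <stdio.h>

/* Copied from the hunk above. */
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2};

int main( void )
{
    for( int mvy = 0; mvy < 4; mvy++ )
        for( int mvx = 0; mvx < 4; mvx++ )
        {
            int qpel_idx = ((mvy&3)<<2) + (mvx&3);
            if( qpel_idx & 5 )  /* same test as mc_luma() */
                printf( "mv=(%d/4,%d/4): avg planes %d and %d\n",
                        mvx, mvy, hpel_ref0[qpel_idx], hpel_ref1[qpel_idx] );
            else
                printf( "mv=(%d/4,%d/4): plane %d only\n",
                        mvx, mvy, hpel_ref0[qpel_idx] );
        }
    return 0;
}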
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: motion compensation ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -41,6 +41,8 @@ } ALIGNED_16( x264_weight_t ); extern const x264_weight_t x264_weight_none[3]; +extern const uint8_t x264_hpel_ref0[16]; +extern const uint8_t x264_hpel_ref1[16]; #define SET_WEIGHT( w, b, s, d, o )\ {\ @@ -86,6 +88,7 @@ void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); /* may write up to 15 pixels off the end of each plane */
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips
Added
+(directory)
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct-c.c
Added
@@ -0,0 +1,525 @@ +/***************************************************************************** + * dct-c.c: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = in0 + in2; \ + tmp1_m = in0 - in2; \ + tmp2_m = in1 >> 1; \ + tmp2_m = tmp2_m - in3; \ + tmp3_m = in3 >> 1; \ + tmp3_m = in1 + tmp3_m; \ + \ + BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \ +} + +static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, + int32_t i_src_stride ) +{ + v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; + v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; + v4i32 hor_res0, hor_res1, hor_res2, hor_res3; + v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; + + LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); + UNPCK_R_SH_SW( src0, src0_r ); + UNPCK_R_SH_SW( src1, src1_r ); + UNPCK_R_SH_SW( src2, src2_r ); + UNPCK_R_SH_SW( src3, src3_r ); + BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + hor_res0, hor_res3, hor_res2, hor_res1 ); + TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, + hor_res0, hor_res1, hor_res2, hor_res3 ); + BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); + SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); + PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, + ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, + ver_res0, ver_res1, ver_res2, ver_res3 ); + PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); + ST_SH2( ver_res0, ver_res2, p_dst, 8 ); +} + +static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_dst_stride, + int16_t *p_dst ) +{ + uint32_t i_src0, i_src1, i_src2, i_src3; + uint32_t i_ref0, i_ref1, i_ref2, i_ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 inp0, inp1; + v8i16 diff0, diff1, diff2, diff3; + v8i16 temp0, temp1, temp2, temp3; + + LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); + LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); + + INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); + INSERT_W4_SB( 
i_ref0, i_ref1, i_ref2, i_ref3, ref ); + + ILVRL_B2_UB( src, ref, inp0, inp1 ); + + HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); + + diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); + diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); + + BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); + + diff0 = temp0 + temp1; + diff1 = ( temp3 << 1 ) + temp2; + diff2 = temp0 - temp1; + diff3 = temp3 - ( temp2 << 1 ); + + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + temp0, temp1, temp2, temp3 ); + BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); + + temp0 = diff0 + diff1; + temp1 = ( diff3 << 1 ) + diff2; + temp2 = diff0 - diff1; + temp3 = diff3 - ( diff2 << 1 ); + + ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); + ST_UB2( inp0, inp1, p_dst, 8 ); +} + +static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16], + int16_t pi_level[16] ) +{ + v8i16 src0, src1; + v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 }; + v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 }; + + LD_SH2( pi_dct, 8, src0, src1 ); + VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 ); + ST_SH2( mask0, mask1, pi_level, 8 ); +} + +static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3; + v8i16 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v8i16 zeros = { 0 }; + + LD4x4_SH( p_src, src0, src1, src2, src3 ); + AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 ); + TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3, + hres0, hres1, hres2, hres3 ); + AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 ); + SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 ); + ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride ); + ST_SH2( zeros, zeros, p_src, 8 ); +} + +static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + int16_t i_dc; + uint32_t i_src0, i_src1, i_src2, i_src3; + v16u8 pred = { 0 }; + v16i8 out; + v8i16 input_dc, pred_r, pred_l; + + i_dc = ( p_src[0] + 32 ) >> 6; + input_dc = __msa_fill_h( i_dc ); + p_src[ 0 ] = 0; + + LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 ); + INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred ); + UNPCK_UB_SH( pred, pred_r, pred_l ); + + pred_r += input_dc; + pred_l += input_dc; + + CLIP_SH2_0_255( pred_r, pred_l ); + out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r ); + ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride ); +} + +static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 vec0, vec1, vec2, vec3; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; + v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; + v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; + v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; + v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 zeros = { 0 }; + + p_src[ 0 ] += 32; + + LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); + + vec0 = src0 + src4; + vec1 = src0 - src4; + vec2 = src2 >> 1; + vec2 = vec2 - src6; + vec3 = src6 >> 1; + vec3 = src2 + vec3; + + BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); 
+ + vec0 = src7 >> 1; + vec0 = src5 - vec0 - src3 - src7; + vec1 = src3 >> 1; + vec1 = src1 - vec1 + src7 - src3; + vec2 = src5 >> 1; + vec2 = vec2 - src1 + src7 + src5; + vec3 = src1 >> 1; + vec3 = vec3 + src3 + src5 + src1; + tmp4 = vec3 >> 2; + tmp4 += vec0; + tmp5 = vec2 >> 2; + tmp5 += vec1; + tmp6 = vec1 >> 2; + tmp6 -= vec2; + tmp7 = vec0 >> 2; + tmp7 = vec3 - tmp7; + + BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + res0, res1, res2, res3, res4, res5, res6, res7 ); + TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, + res0, res1, res2, res3, res4, res5, res6, res7 ); + UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); + UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); + UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); + UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); + UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); + UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); + UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); + UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); + BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, + vec0_r, vec0_l, vec1_l, vec1_r ); + + vec2_r = tmp2_r >> 1; + vec2_l = tmp2_l >> 1; + vec2_r -= tmp6_r; + vec2_l -= tmp6_l; + vec3_r = tmp6_r >> 1; + vec3_l = tmp6_l >> 1; + vec3_r += tmp2_r; + vec3_l += tmp2_l; + + BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, + tmp0_r, tmp2_r, tmp4_r, tmp6_r ); + BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, + tmp0_l, tmp2_l, tmp4_l, tmp6_l ); + + vec0_r = tmp7_r >> 1; + vec0_l = tmp7_l >> 1; + vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; + vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; + vec1_r = tmp3_r >> 1; + vec1_l = tmp3_l >> 1; + vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; + vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; + vec2_r = tmp5_r >> 1; + vec2_l = tmp5_l >> 1; + vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; + vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; + vec3_r = tmp1_r >> 1; + vec3_l = tmp1_l >> 1; + vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; + vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; + tmp1_r = vec3_r >> 2; + tmp1_l = vec3_l >> 2; + tmp1_r += vec0_r; + tmp1_l += vec0_l; + tmp3_r = vec2_r >> 2; + tmp3_l = vec2_l >> 2; + tmp3_r += vec1_r; + tmp3_l += vec1_l; + tmp5_r = vec1_r >> 2; + tmp5_l = vec1_l >> 2; + tmp5_r -= vec2_r; + tmp5_l -= vec2_l; + tmp7_r = vec0_r >> 2; + tmp7_l = vec0_l >> 2; + tmp7_r = vec3_r - tmp7_r; + tmp7_l = vec3_l - tmp7_l; + + BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, + res0_r, res0_l, res7_l, res7_r ); + BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, + res1_r, res1_l, res6_l, res6_r ); + BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, + res2_r, res2_l, res5_l, res5_r ); + BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, + res3_r, res3_l, res4_l, res4_r ); + SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); + SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); + SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); + SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); + PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, + res0, res1, res2, res3 ); + PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, + res4, res5, res6, res7 ); + LD_SB8( p_dst, i_dst_stride, + dst0, dst1, dst2, dst3, + dst4, dst5, dst6, dst7 ); + ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, + tmp0, tmp1, tmp2, tmp3 ); + ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, + tmp4, tmp5, tmp6, tmp7 ); + ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, + res0, res1, res2, res3 ); + ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, + res4, res5, res6, res7 ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + CLIP_SH4_0_255( res4, res5, res6, res7 ); + 
PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, + dst0, dst1, dst2, dst3 ); + ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); +} + +static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride, + int16_t *p_dst, int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3; + v4i32 src0_r, src1_r, src2_r, src3_r; + v4i32 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v2i64 res0, res1; + + LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); + UNPCK_R_SH_SW( src0, src0_r ); + UNPCK_R_SH_SW( src1, src1_r ); + UNPCK_R_SH_SW( src2, src2_r ); + UNPCK_R_SH_SW( src3, src3_r ); + BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 ); + BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 ); + TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3, + hres0, hres1, hres2, hres3 ); + BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 ); + BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 ); + PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + vres0, vres1, vres2, vres3 ); + PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 ); + ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 ); +} + +static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *pred_ptr, int32_t i_pred_stride ) +{ + int16_t i_sum; + uint32_t i_src0, i_src1, i_src2, i_src3; + uint32_t i_pred0, i_pred1, i_pred2, i_pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); + LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 ); + INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); + INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred ); + ILVRL_B2_UB( src, pred, src_l0, src_l1 ); + HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 ); + i_sum = HADD_UH_U32( diff0 + diff1 ); + + return i_sum; +} + +void x264_dct4x4dc_msa( int16_t d[16] ) +{ + avc_dct4x4dc_msa( d, d, 4 ); +} + +void x264_idct4x4dc_msa( int16_t d[16] ) +{ + avc_idct4x4dc_msa( d, 4, d, 4 ); +} + +void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ) +{ + avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); +} + +void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ) +{ + avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0], + &pi_dct[2][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4], + &pi_dct[3][0], FDEC_STRIDE ); +} + +void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ) +{ + x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] ); + x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] ); + x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] ); + x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] ); +} + +void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ) +{ + avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); +} + +void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ) +{ + avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0], + &pi_dct[2][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8], + &pi_dct[3][0], FDEC_STRIDE ); +} + +void 
x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ) +{ + avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0], + &pi_dct[2], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4], + &pi_dct[3], FDEC_STRIDE ); +} + +void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ) +{ + for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE ) + { + avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE ); + } +} + +void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, + uint8_t *p_ref ) +{ + avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst ); +} + +void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, + uint8_t *p_ref ) +{ + avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE, + &p_ref[0], FDEC_STRIDE, p_dst[0] ); + avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4], + FDEC_STRIDE, p_dst[1] ); + avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0], + FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0], + FDEC_STRIDE, p_dst[2] ); + avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], + FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4], + FDEC_STRIDE, p_dst[3] ); +} + +void x264_sub16x16_dct_msa( int16_t p_dst[16][16], + uint8_t *p_src, + uint8_t *p_ref ) +{ + x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] ); + x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] ); + x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0], + &p_ref[8*FDEC_STRIDE+0] ); + x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8], + &p_ref[8*FDEC_STRIDE+8] ); +} + +void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], + uint8_t *p_pix1, uint8_t *p_pix2 ) +{ + int32_t d0, d1, d2, d3; + + pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE, + &p_pix2[0], FDEC_STRIDE ); + pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE, + &p_pix2[4], FDEC_STRIDE ); + pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[4 * FDEC_STRIDE + 0], + FDEC_STRIDE ); + pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[4 * FDEC_STRIDE + 4], + FDEC_STRIDE ); + + BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 ); + BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] ); +} + +void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], + uint8_t *p_pix1, uint8_t *p_pix2 ) +{ + int32_t a0, a1, a2, a3, a4, a5, a6, a7; + int32_t b0, b1, b2, b3, b4, b5, b6, b7; + + a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE, + 
&p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE ); + + BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1, + b0, b1, b2, b3, b7, b6, b5, b4 ); + BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1, + a0, a1, a2, a3, a7, a6, a5, a4 ); + BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1, + pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7], + pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] ); +} + +void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] ) +{ + avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level ); +} +#endif
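In avc_zigzag_scan_4x4_frame_msa above, the two VSHF_H masks {0,4,1,2,5,8,12,9} and {6,3,7,10,13,14,11,15} are simply a 16-element gather order applied as a vector shuffle; it should correspond to x264's 4x4 frame zigzag. The scalar equivalent below (illustrative only, with the mask values copied from the code) shows what the shuffle computes.

#include <stdint.h>
#include <stdio.h>

/* Concatenation of the two 8-element VSHF masks used above. */
static const uint8_t scan_order[16] =
    { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 };

static void zigzag_scan_4x4_frame_scalar( int16_t level[16], const int16_t dct[16] )
{
    for( int i = 0; i < 16; i++ )
        level[i] = dct[scan_order[i]];
}

int main( void )
{
    int16_t dct[16], level[16];
    for( int i = 0; i < 16; i++ )
        dct[i] = i;                 /* easy-to-check input */
    zigzag_scan_4x4_frame_scalar( level, dct );
    for( int i = 0; i < 16; i++ )
        printf( "%d ", level[i] );  /* prints the scan order itself */
    printf( "\n" );
    return 0;
}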
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct.h
Added
@@ -0,0 +1,49 @@ +/***************************************************************************** + * dct.h: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_DCT_H +#define X264_MIPS_DCT_H + +void x264_dct4x4dc_msa( int16_t d[16] ); +void x264_idct4x4dc_msa( int16_t d[16] ); +void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ); +void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ); +void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ); +void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ); +void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ); +void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref ); +void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/deblock-c.c
Added
@@ -0,0 +1,2010 @@ +/***************************************************************************** + * deblock-c.c: msa deblocking + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \ + q3_or_p3_org_in, p1_or_q1_org_in, \ + p2_or_q2_org_in, q1_or_p1_org_in, \ + p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \ +{ \ + v8i16 threshold; \ + v8i16 const3 = __msa_ldi_h( 3 ); \ + \ + threshold = p0_or_q0_org_in + q3_or_p3_org_in; \ + threshold += p1_or_q1_org_in; \ + \ + p0_or_q0_out = threshold << 1; \ + p0_or_q0_out += p2_or_q2_org_in; \ + p0_or_q0_out += q1_or_p1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \ + \ + p1_or_q1_out = p2_or_q2_org_in + threshold; \ + p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \ + \ + p2_or_q2_out = p2_or_q2_org_in * const3; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += threshold; \ + p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \ +} + +/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */ +#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \ + p1_or_q1_org_in, p0_or_q0_out ) \ +{ \ + p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \ +} + +#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \ + p1_or_q1_org_in, p2_or_q2_org_in, \ + negate_tc_in, tc_in, p1_or_q1_out ) \ +{ \ + v8i16 clip3, temp; \ + \ + clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \ + ( v8u16 ) q0_or_p0_org_in ); \ + temp = p1_or_q1_org_in << 1; \ + clip3 -= temp; \ + clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \ + clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \ + p1_or_q1_out = p1_or_q1_org_in + clip3; \ +} + +#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \ + p1_or_q1_org_in, q1_or_p1_org_in, \ + negate_threshold_in, threshold_in, \ + p0_or_q0_out, q0_or_p0_out ) \ +{ \ + v8i16 q0_sub_p0, p1_sub_q1, delta; \ + \ + q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ + p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ + q0_sub_p0 <<= 2; \ + p1_sub_q1 += 4; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta >>= 3; \ + \ + delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \ + \ + p0_or_q0_out = p0_or_q0_org_in + delta; \ + q0_or_p0_out = q0_or_p0_org_in 
- delta; \ + \ + CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \ +} + +static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0; + v16u8 alpha, beta; + v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta; + v16u8 p2, p1, p0, q0, q1, q2; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16u8 tmp_flag; + v16i8 zero = { 0 }; + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_data - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + q2_org = LD_UB( p_data + ( 2 * u_img_width ) ); + p3_org = LD_UB( p_data - ( u_img_width << 2 ) ); + p2_org = LD_UB( p_data - ( 3 * u_img_width ) ); + + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = ( p0_asub_q0 < tmp_flag ); + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + v8i16 p3_org_r; + + ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, + q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r ); + } + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + v8i16 p3_org_l; + + ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, + q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l ); + } + } + /* combine and store */ + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); + + ST_UB( p1_org, p_data - ( 2 * u_img_width ) ); + ST_UB( p2_org, p_data - ( 3 * 
u_img_width ) ); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + + negate_is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + } + + negate_is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); + p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); + } + + ST_UB( p0_org, p_data - u_img_width ); + + q3_org = LD_UB( p_data + ( 3 * u_img_width ) ); + q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( q2_asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + v8i16 q3_org_r; + + ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, + p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r ); + } + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + v8i16 q3_org_l; + + ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, + p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); + + ST_UB( q1_org, p_data + u_img_width ); + ST_UB( q2_org, p_data + 2 * u_img_width ); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + negate_is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + negate_is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); + q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); + } + + ST_UB( q0_org, p_data ); + } +} + +static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + uint8_t *p_src; + v16u8 alpha, beta, p0_asub_q0; + v16u8 is_less_than_alpha, is_less_than; + v16u8 is_less_than_beta, negate_is_less_than_beta; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, 
q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16i8 zero = { 0 }; + v16u8 tmp_flag; + + p_src = p_data - 4; + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + LD_UB8( p_src, u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + LD_UB8( p_src + ( 8 * u_img_width ), u_img_width, + row8, row9, row10, row11, row12, row13, row14, row15 ); + + TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, + row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org ); + } + + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + { + v16u8 p1_asub_p0, q1_asub_q0; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = ( p0_asub_q0 < tmp_flag ); + + { + v16u8 p2_asub_p0; + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + } + is_less_than_beta = tmp_flag & is_less_than_beta; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + v8i16 p3_org_r; + + ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, + q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r ); + } + } + + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + v8i16 p3_org_l; + + ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, + q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + v16u8 p0, p2, p1; + + PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); + } + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + + if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + } + } + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) 
negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + } + } + + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + v16u8 p0; + + p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); + p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); + } + + { + v16u8 q2_asub_q0; + + q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( q2_asub_q0 < beta ); + } + + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + v8i16 q3_org_r; + + ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, + p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r ); + } + } + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + v8i16 q3_org_l; + + ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, + p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + v16u8 q0, q1, q2; + + PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); + } + + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + } + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + v16u8 q0; + + q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); + q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); + } + } + { + v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 ); + ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 ); + ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); + + ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 ); + ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 ); + + p_src = p_data - 3; + ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp2, 0, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp2, 4, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + + ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp5, 0, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp5, 4, p_src + 4, u_img_width ); + } +} + +static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta, 
is_less_than; + v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_less_than_beta; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 zero = { 0 }; + v16u8 is_less_than_r, is_less_than_l; + + is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_r ) ) + { + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_r, p0_org_r, q0_org_r, + q1_org_r ); + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( is_less_than_l ) ) + { + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_l, p0_org_l, q0_org_l, + q1_org_l ); + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, ( p_chroma - u_img_width ) ); + ST_UB( q0_org, p_chroma ); + } +} + +static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 is_less_than; + v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org; + v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; + v16i8 tmp0, tmp1, tmp2, tmp3; + v4i32 vec0, vec1; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + + LD_UB8( ( p_chroma - 4 ), u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + + TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, + p1_u_org, p1_v_org, p0_u_org, p0_v_org, + q0_u_org, q0_v_org, q1_u_org, q1_v_org ); + + ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, + q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( 
is_less_than ) ) + { + v16u8 is_less_than_r, is_less_than_l; + v16i8 zero = { 0 }; + + is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_r ) ) + { + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r ); + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( is_less_than_l ) ) + { + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l ); + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); + ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); + ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); + ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); + + ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); + } +} + +static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + uint8_t *p_src; + v16u8 beta, tmp_vec, bs = { 0 }; + v16u8 tc = { 0 }; + v16u8 is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p0_r, q0_r, p1_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 p0_l, q0_l, p1_l = { 0 }; + v8i16 q1_l = { 0 }; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v8i16 tc_r, tc_l; + v16i8 zero = { 0 }; + v16u8 is_bs_greater_than0; + + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec ); + + if( !__msa_test_bz_v( bs ) ) + { + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); + + is_bs_greater_than0 = ( zero < bs ); + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + p_src = p_data; + p_src -= 4; + + LD_UB8( p_src, u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + p_src += ( 8 * u_img_width ); + LD_UB8( p_src, u_img_width, + row8, row9, row10, row11, row12, row13, row14, row15 ); + + TRANSPOSE16x8_UB_UB( row0, row1, row2, 
row3, row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org ); + } + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha; + v16u8 is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 negate_tc, sign_negate_tc; + v8i16 negate_tc_r, i16_negatetc_l; + + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + { + v16u8 p2_asub_p0; + v16u8 is_less_than_beta_r, is_less_than_beta_l; + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r ); + } + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + is_less_than_beta; + } + + { + v16u8 u8_q2asub_q0; + v16u8 is_less_than_beta_l, is_less_than_beta_r; + + u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( u8_q2asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org ); + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r ); + } + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + + is_less_than_beta = __msa_andi_b( 
is_less_than_beta, 1 ); + tc = tc + is_less_than_beta; + } + + { + v8i16 threshold_r, negate_thresh_r; + v8i16 threshold_l, negate_thresh_l; + v16i8 negate_thresh, sign_negate_thresh; + + negate_thresh = zero - ( v16i8 ) tc; + sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); + + ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r ); + + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r ); + + threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc ); + negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, + negate_thresh ); + + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + } + { + v16i8 tp0, tp1, tp2, tp3; + v8i16 tmp2, tmp5; + v4i32 tmp3, tmp4, tmp6, tmp7; + uint32_t u_out0, u_out2; + uint16_t u_out1, u_out3; + + p_src = p_data - 3; + + ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 ); + ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 ); + ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); + + ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 ); + ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 ); + + u_out0 = __msa_copy_u_w( tmp3, 0 ); + u_out1 = __msa_copy_u_h( tmp2, 0 ); + u_out2 = __msa_copy_u_w( tmp3, 1 ); + u_out3 = __msa_copy_u_h( tmp2, 1 ); + + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp3, 2 ); + u_out1 = __msa_copy_u_h( tmp2, 2 ); + u_out2 = __msa_copy_u_w( tmp3, 3 ); + u_out3 = __msa_copy_u_h( tmp2, 3 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp4, 0 ); + u_out1 = __msa_copy_u_h( tmp2, 4 ); + u_out2 = __msa_copy_u_w( tmp4, 1 ); + u_out3 = __msa_copy_u_h( tmp2, 5 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp4, 2 ); + u_out1 = __msa_copy_u_h( tmp2, 6 ); + u_out2 = __msa_copy_u_w( tmp4, 3 ); + u_out3 = __msa_copy_u_h( tmp2, 7 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp6, 0 ); + u_out1 = __msa_copy_u_h( tmp5, 0 ); + u_out2 = __msa_copy_u_w( tmp6, 1 ); + u_out3 = __msa_copy_u_h( tmp5, 1 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp6, 2 ); + u_out1 = __msa_copy_u_h( tmp5, 2 ); + u_out2 = __msa_copy_u_w( tmp6, 3 ); + u_out3 = __msa_copy_u_h( tmp5, 3 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp7, 0 ); + u_out1 = __msa_copy_u_h( tmp5, 4 ); + u_out2 = __msa_copy_u_w( tmp7, 1 ); + u_out3 = __msa_copy_u_h( tmp5, 5 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp7, 2 ); + u_out1 = __msa_copy_u_h( tmp5, 6 ); + u_out2 = __msa_copy_u_w( tmp7, 3 ); + 
u_out3 = __msa_copy_u_h( tmp5, 7 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + } + } +} + +static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_image_width ) +{ + v16u8 p2_asub_p0, u8_q2asub_q0; + v16u8 alpha, beta, is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p1_r = { 0 }; + v8i16 p0_r, q0_r, q1_r = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l, q0_l, q1_l = { 0 }; + v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v16i8 zero = { 0 }; + v16u8 tmp_vec; + v16u8 bs = { 0 }; + v16i8 tc = { 0 }; + + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec ); + + if( !__msa_test_bz_v( bs ) ) + { + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB5( p_data - ( 3 * u_image_width ), u_image_width, + p2_org, p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_bs_greater_than0; + + is_bs_greater_than0 = ( ( v16u8 ) zero < bs ); + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 sign_negate_tc, negate_tc; + v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r; + + q2_org = LD_UB( p_data + ( 2 * u_image_width ) ); + negate_tc = zero - tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, + negate_tc_r, i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( 
!__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r ); + } + + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + ST_UB( p1_org, p_data - ( 2 * u_image_width ) ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + ( v16i8 ) is_less_than_beta; + } + + u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( u8_q2asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r ); + } + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + ST_UB( q1_org, p_data + u_image_width ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + ( v16i8 ) is_less_than_beta; + } + { + v16i8 negate_thresh, sign_negate_thresh; + v8i16 threshold_r, threshold_l; + v8i16 negate_thresh_l, negate_thresh_r; + + negate_thresh = zero - tc; + sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); + + ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r ); + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r ); + + threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc ); + negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, + negate_thresh ); + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, ( p_data - u_image_width ) ); + ST_UB( q0_org, p_data ); + } + } +} + +static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta; + v4i32 tmp_vec, bs = { 0 }; + v4i32 tc = { 0 }; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than; + v8i16 is_less_than_r, is_less_than_l; + v16u8 is_less_than_beta, 
is_less_than_alpha, is_bs_greater_than0; + v16u8 p0, q0; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_org, p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v16i8 negate_tc, sign_negate_tc; + v8i16 negate_tc_r, i16_negatetc_l; + v8i16 tc_r, tc_l; + v16i8 zero = { 0 }; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 ); + bs = __msa_insve_w( bs, 0, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 ); + bs = __msa_insve_w( bs, 1, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 ); + bs = __msa_insve_w( bs, 2, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 ); + bs = __msa_insve_w( bs, 3, tmp_vec ); + + if( !__msa_test_bz_v( ( v16u8 ) bs ) ) + { + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 ); + tc = __msa_insve_w( tc, 0, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 ); + tc = __msa_insve_w( tc, 1, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 ); + tc = __msa_insve_w( tc, 2, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 ); + tc = __msa_insve_w( tc, 3, tmp_vec ); + + is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + + is_less_than = is_less_than & is_bs_greater_than0; + + if( !__msa_test_bz_v( is_less_than ) ) + { + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + is_less_than_r = + ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) + { + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_tc_r, tc_r, p0_r, q0_r ); + } + + is_less_than_l = + ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) + { + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + i16_negatetc_l, tc_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, p_chroma - u_img_width ); + ST_UB( q0_org, p_chroma ); + } + } +} + +static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta; + v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than, is_less_than1; + v8i16 is_less_than_r, is_less_than_l; + v16u8 is_less_than_beta, is_less_than_alpha; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_org, 
p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v16u8 is_bs_less_than4, is_bs_greater_than0; + v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l; + v16u8 const4; + v16i8 zero = { 0 }; + v8i16 tmp_vec, bs = { 0 }; + v8i16 tc = { 0 }; + v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org; + v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; + v16i8 tmp0, tmp1, tmp2, tmp3; + v4i32 vec0, vec1; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16i8 negate_tc, sign_negate_tc; + + const4 = ( v16u8 ) __msa_ldi_b( 4 ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 ); + bs = __msa_insve_h( bs, 0, tmp_vec ); + bs = __msa_insve_h( bs, 4, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 ); + bs = __msa_insve_h( bs, 1, tmp_vec ); + bs = __msa_insve_h( bs, 5, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 ); + bs = __msa_insve_h( bs, 2, tmp_vec ); + bs = __msa_insve_h( bs, 6, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 ); + bs = __msa_insve_h( bs, 3, tmp_vec ); + bs = __msa_insve_h( bs, 7, tmp_vec ); + + if( !__msa_test_bz_v( ( v16u8 ) bs ) ) + { + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 ); + tc = __msa_insve_h( tc, 0, tmp_vec ); + tc = __msa_insve_h( tc, 4, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 ); + tc = __msa_insve_h( tc, 1, tmp_vec ); + tc = __msa_insve_h( tc, 5, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 ); + tc = __msa_insve_h( tc, 2, tmp_vec ); + tc = __msa_insve_h( tc, 6, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 ); + tc = __msa_insve_h( tc, 3, tmp_vec ); + tc = __msa_insve_h( tc, 7, tmp_vec ); + + is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); + + LD_UB8( ( p_chroma - 4 ), u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + + TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, + row4, row5, row6, row7, + p1_u_org, p1_v_org, p0_u_org, p0_v_org, + q0_u_org, q0_v_org, q1_u_org, q1_v_org ); + + ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, + q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_bs_greater_than0 & is_less_than; + + if( !__msa_test_bz_v( is_less_than ) ) + { + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + is_bs_less_than4 = ( ( v16u8 ) bs < const4 ); + + is_less_than1 = is_less_than & is_bs_less_than4; + if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) ) + { + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + + is_less_than_r = + ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) + { + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_tc_r, tc_r, p0_r, q0_r ); + } + + is_less_than_l = + ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 ); + if( 
!__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) + { + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + i16_negatetc_l, tc_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 ); + } + + SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); + ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); + ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); + ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); + ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); + } + } +} + +static void avc_deblock_strength_msa( uint8_t *nnz, + int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE], + int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], + uint8_t pu_bs[2][8][4], + int32_t i_mvy_limit ) +{ + uint32_t u_tmp; + v16u8 nnz0, nnz1, nnz2, nnz3, nnz4; + v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 }; + v16i8 ref0, ref1, ref2, ref3, ref4; + v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5; + v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b; + v8u16 four, mvy_limit_vec, sub0, sub1; + + nnz0 = LD_UB( nnz + 4 ); + nnz2 = LD_UB( nnz + 20 ); + nnz4 = LD_UB( nnz + 36 ); + + ref0 = LD_SB( pi_ref[0] + 4 ); + ref2 = LD_SB( pi_ref[0] + 20 ); + ref4 = LD_SB( pi_ref[0] + 36 ); + + mv0 = LD_SH( ( pi_mv[0] + 4 )[0] ); + mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); + mv2 = LD_SH( ( pi_mv[0] + 20 )[0] ); + mv3 = LD_SH( ( pi_mv[0] + 28 )[0] ); + mv4 = LD_SH( ( pi_mv[0] + 36 )[0] ); + + mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); + four = ( v8u16 ) __msa_fill_h( 4 ); + mask = ( v16u8 ) __msa_ldi_b( 0 ); + one = ( v16u8 ) __msa_ldi_b( 1 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv0, mv0 ); + mv6 = __msa_pckod_h( mv1, mv1 ); + mv_a = __msa_pckev_h( mv0, mv0 ); + mv_b = __msa_pckev_h( mv1, mv1 ); + nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 ); + ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 ); + nnz_mask = nnz0 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][0] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv1, mv1 ); + mv6 = __msa_pckod_h( mv2, mv2 ); + mv_a = __msa_pckev_h( mv1, mv1 ); + mv_b = __msa_pckev_h( mv2, mv2 ); + + nnz_mask = nnz2 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( 
two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][1] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv2, mv2 ); + mv6 = __msa_pckod_h( mv3, mv3 ); + mv_a = __msa_pckev_h( mv2, mv2 ); + mv_b = __msa_pckev_h( mv3, mv3 ); + + nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 ); + ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 ); + + nnz_mask = nnz3 | nnz2; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][2] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv3, mv3 ); + mv6 = __msa_pckod_h( mv4, mv4 ); + mv_a = __msa_pckev_h( mv3, mv3 ); + mv_b = __msa_pckev_h( mv4, mv4 ); + + nnz_mask = nnz4 | nnz3; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][3] ); + + nnz0 = LD_UB( nnz + 8 ); + nnz2 = LD_UB( nnz + 24 ); + + ref0 = LD_SB( pi_ref[0] + 8 ); + ref2 = LD_SB( pi_ref[0] + 24 ); + + mv0 = LD_SH( ( pi_mv[0] + 8 )[0] ); + mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); + mv2 = LD_SH( ( pi_mv[0] + 16 )[0] ); + mv3 = LD_SH( ( pi_mv[0] + 20 )[0] ); + mv4 = LD_SH( ( pi_mv[0] + 24 )[0] ); + mv7 = LD_SH( ( pi_mv[0] + 28 )[0] ); + mv8 = LD_SH( ( pi_mv[0] + 32 )[0] ); + mv9 = LD_SH( ( pi_mv[0] + 36 )[0] ); + + nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 ); + nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 ); + + ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 ); + + ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 ); + + nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); + nnz1 = ( v16u8 ) temp_vec4; + nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 ); + nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 ); + nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 ); + + ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 ); + ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 ); + + ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 ); + + ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 ); + + ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); + + ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 ); + ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 ); + ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 ); + 
+ TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 ); + TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 ); + + mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); + four = ( v8u16 ) __msa_fill_h( 4 ); + mask = ( v16u8 ) __msa_ldi_b( 0 ); + one = ( v16u8 ) __msa_ldi_b( 1 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); + mv_a = mv0; + mv_b = mv1; + + nnz_mask = nnz0 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][0] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); + mv_a = mv1; + mv_b = mv2; + + nnz_mask = nnz1 | nnz2; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][1] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); + mv_a = mv2; + mv_b = mv3; + + nnz_mask = nnz2 | nnz3; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][2] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 ); + mv_a = mv3; + mv_b = mv4; + + nnz_mask = nnz3 | nnz4; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) 
__msa_ceq_b( ref3, ref4 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][3] ); +} + +void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_loopfilter_luma_inter_edge_ver_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_loopfilter_luma_inter_edge_hor_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE], + int8_t 
pi_ref[2][X264_SCAN8_LUMA_SIZE], + int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], + uint8_t pu_bs[2][8][4], int32_t i_mvy_limit, + int32_t i_bframe ) +{ + if( i_bframe ) + { + for( int32_t i_dir = 0; i_dir < 2; i_dir++ ) + { + int32_t s1 = i_dir ? 1 : 8; + int32_t s2 = i_dir ? 8 : 1; + + for( int32_t i_edge = 0; i_edge < 4; i_edge++ ) + { + for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4; + i++, loc += s1 ) + { + int32_t locn = loc - s2; + if( u_nnz[loc] || u_nnz[locn] ) + { + pu_bs[i_dir][i_edge][i] = 2; + } + else if( pi_ref[0][loc] != pi_ref[0][locn] || + abs( pi_mv[0][loc][0] - + pi_mv[0][locn][0] ) >= 4 || + abs( pi_mv[0][loc][1] - + pi_mv[0][locn][1] ) >= i_mvy_limit || + ( i_bframe && + ( pi_ref[1][loc] != pi_ref[1][locn] || + abs( pi_mv[1][loc][0] - + pi_mv[1][locn][0] ) >= 4 || + abs( pi_mv[1][loc][1] - + pi_mv[1][locn][1] ) >= i_mvy_limit ) ) + ) + { + pu_bs[i_dir][i_edge][i] = 1; + } + else + { + pu_bs[i_dir][i_edge][i] = 0; + } + } + } + } + } + else + { + avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit ); + } +} +#endif
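For readers not fluent in MSA intrinsics, here is a minimal scalar sketch of the per-pixel p0/q0 update that the vectorized AVC_LPF_P0Q0 path in the functions above performs on bS < 4 edges. This is only an illustration of the standard H.264 clipping step; the function and variable names are hypothetical and do not appear in the snapshot.

#include <stdint.h>

/* Hypothetical scalar illustration of the bS < 4 luma edge update:
 * the delta is clamped to +/-tc, then applied symmetrically to p0 and q0. */
static inline uint8_t clip_u8( int v )
{
    return ( uint8_t )( v < 0 ? 0 : ( v > 255 ? 255 : v ) );
}

static void lpf_p0q0_scalar( uint8_t *p0, uint8_t *q0,
                             uint8_t p1, uint8_t q1, int tc )
{
    /* delta = ( 4*(q0 - p0) + (p1 - q1) + 4 ) >> 3, clipped to [-tc, tc] */
    int delta = ( ( ( *q0 - *p0 ) * 4 ) + ( p1 - q1 ) + 4 ) >> 3;

    if( delta < -tc )     delta = -tc;
    else if( delta > tc ) delta = tc;

    uint8_t new_p0 = clip_u8( *p0 + delta );
    uint8_t new_q0 = clip_u8( *q0 - delta );
    *p0 = new_p0;
    *q0 = new_q0;
}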
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/macros.h
Added
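The new header below collects the generic MSA load/store and vector-manipulation macros used by the MIPS code paths above. As a hedged usage sketch only (hypothetical helper name and include path, assuming a MIPS toolchain with MSA support), a 16x8 byte block copy built from the LD_UB8/ST_UB8 macros defined further down could look like this:

#include <stdint.h>
#include "common/mips/macros.h"   /* illustrative include path; pulls in <msa.h> */

/* Hypothetical example: load eight 16-byte rows and store them back out. */
static void copy_16x8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride )
{
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8( p_src, i_src_stride, row0, row1, row2, row3,
            row4, row5, row6, row7 );
    ST_UB8( row0, row1, row2, row3, row4, row5, row6, row7,
            p_dst, i_dst_stride );
}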
@@ -0,0 +1,1952 @@ +/***************************************************************************** + * macros.h: msa macros + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MACROS_H +#define X264_MIPS_MACROS_H + +#include <stdint.h> +#include <msa.h> + +#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ ) +#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ ) + +#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ ) + +#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ ) + +#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ ) +#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ ) + +#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ ) +#define ST_SH( ... 
) ST_H( v8i16, __VA_ARGS__ ) + +#if ( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "lh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "lw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "ld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } ) + #else // !( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val0_m, u_val1_m; \ + uint64_t u_val_d_m = 0; \ + \ + u_val0_m = LW( p_src_m ); \ + u_val1_m = LW( p_src_m + 4 ); \ + \ + u_val_d_m = ( uint64_t ) ( u_val1_m ); \ + u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ + 0xFFFFFFFF00000000 ); \ + u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ + \ + u_val_d_m; \ + } ) + #endif // ( __mips == 64 ) + + #define SH( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint16_t u_val_h_m = ( u_val ); \ + \ + asm volatile ( \ + "sh %[u_val_h_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_h_m] "r" ( u_val_h_m ) \ + ); \ + } + + #define SW( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val_w_m = ( u_val ); \ + \ + asm volatile ( \ + "sw %[u_val_w_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_w_m] "r" ( u_val_w_m ) \ + ); \ + } + + #define SD( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint64_t u_val_d_m = ( u_val ); \ + \ + asm volatile ( \ + "sd %[u_val_d_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_d_m] "r" ( u_val_d_m ) \ + ); \ + } + +#else // !( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "ulh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "ulw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "uld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } ) + #else // !( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \ + uint32_t u_val0_m, u_val1_m; \ + uint64_t u_val_d_m = 0; \ + \ + u_val0_m = LW( psrc_m1 ); \ + u_val1_m = LW( psrc_m1 + 4 ); \ + \ + u_val_d_m = ( uint64_t ) ( u_val1_m ); \ + u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ + 0xFFFFFFFF00000000 ); \ + u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ + \ + u_val_d_m; \ + } ) + 
#endif // ( __mips == 64 ) + + #define SH( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint16_t u_val_h_m = ( u_val ); \ + \ + asm volatile ( \ + "ush %[u_val_h_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_h_m] "r" ( u_val_h_m ) \ + ); \ + } + + #define SW( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val_w_m = ( u_val ); \ + \ + asm volatile ( \ + "usw %[u_val_w_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_w_m] "r" ( u_val_w_m ) \ + ); \ + } + + #define SD( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val0_m, u_val1_m; \ + \ + u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \ + u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \ + \ + SW( u_val0_m, p_dst_m1 ); \ + SW( u_val1_m, p_dst_m1 + 4 ); \ + } + +#endif // ( __mips_isa_rev >= 6 ) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4( p_src, stride, out0, out1, out2, out3 ) \ +{ \ + out0 = LW( ( p_src ) ); \ + out1 = LW( ( p_src ) + stride ); \ + out2 = LW( ( p_src ) + 2 * stride ); \ + out3 = LW( ( p_src ) + 3 * stride ); \ +} + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + SW( in0, ( p_dst ) ) \ + SW( in1, ( p_dst ) + stride ); \ + SW( in2, ( p_dst ) + 2 * stride ); \ + SW( in3, ( p_dst ) + 3 * stride ); \ +} + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + SD( in0, ( p_dst ) ) \ + SD( in1, ( p_dst ) + stride ); \ + SD( in2, ( p_dst ) + 2 * stride ); \ + SD( in3, ( p_dst ) + 3 * stride ); \ +} + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2( RTYPE, p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_B( RTYPE, ( p_src ) ); \ + out1 = LD_B( RTYPE, ( p_src ) + stride ); \ +} +#define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ ) +#define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ ) + +#define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \ +{ \ + LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ + out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \ +} +#define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ ) +#define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ ) + +#define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ +{ \ + LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ + LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 ); \ +} +#define LD_UB4( ... 
) LD_B4( v16u8, __VA_ARGS__ ) +#define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ ) + +#define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \ +{ \ + LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \ +} +#define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ ) +#define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ ) + +#define LD_B8( RTYPE, p_src, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ +} +#define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ ) +#define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ ) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2( RTYPE, p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_H( RTYPE, ( p_src ) ); \ + out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \ +} +#define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ ) + +#define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ +{ \ + LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \ + LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \ +} +#define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ ) + +#define LD_H8( RTYPE, p_src, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ +} +#define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ ) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Inputs - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH( p_src, out0, out1, out2, out3 ) \ +{ \ + out0 = LD_SH( p_src ); \ + out2 = LD_SH( p_src + 8 ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \ +} + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2( p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_SW( ( p_src ) ); \ + out1 = LD_SW( ( p_src ) + stride ); \ +} + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, stride + - pdst (destination pointer to store to) + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2( RTYPE, in0, in1, p_dst, stride ) \ +{ \ + ST_B( RTYPE, in0, ( p_dst ) ); \ + ST_B( RTYPE, in1, ( p_dst ) + stride ); \ +} +#define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ ) + +#define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ +{ \ + ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \ + ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ +} +#define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ ) +#define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ ) + +#define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + p_dst, stride ) \ +{ \ + ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \ + ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ +} +#define ST_UB8( ... 
) ST_B8( v16u8, __VA_ARGS__ ) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, stride + - pdst (destination pointer to store to) + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2( RTYPE, in0, in1, p_dst, stride ) \ +{ \ + ST_H( RTYPE, in0, ( p_dst ) ); \ + ST_H( RTYPE, in1, ( p_dst ) + stride ); \ +} +#define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ ) + +#define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ +{ \ + ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \ + ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ +} +#define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ ) + +#define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \ +{ \ + ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \ + ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ +} +#define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ ) + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB( in, stidx, p_dst, stride ) \ +{ \ + uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) ); \ + u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \ + u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \ + u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \ + \ + SH( u_out0_m, pblk_2x4_m ); \ + SH( u_out1_m, pblk_2x4_m + stride ); \ + SH( u_out2_m, pblk_2x4_m + 2 * stride ); \ + SH( u_out3_m, pblk_2x4_m + 3 * stride ); \ +} + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride ) \ +{ \ + uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 ); \ + u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 ); \ + u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 ); \ + u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 ); \ + \ + SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \ +} + +#define ST4x8_UB( in0, in1, p_dst, stride ) \ +{ \ + uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst ); \ + \ + ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride ); \ + ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \ +} + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double 
word element from 'in' vector is copied to + GP register and stored to (pdst) +*/ +#define ST8x1_UB( in, p_dst ) \ +{ \ + uint64_t u_out0_m; \ + u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 ); \ + SD( u_out0_m, p_dst ); \ +} + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB( in0, in1, p_dst, stride ) \ +{ \ + uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \ + u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \ + u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \ + u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \ + \ + SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \ +} + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. + Average with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \ + out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \ +} +#define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ ) + +#define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ + AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 ) \ +} +#define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ ) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slide into 'in0' by + value specified in 'slide_val' +*/ +#define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \ +{ \ + v16i8 zero_m = { 0 }; \ + out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ + ( v16i8 ) in0, slide_val ); \ + out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ + ( v16i8 ) in1, slide_val ); \ +} +#define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ ) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by + value specified in 'slide_val' +*/ +#define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \ +{ \ + out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \ + slide_val ); \ + out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \ + slide_val ); \ +} +#define SLDI_B2_UB( ... 
) SLDI_B2( v16u8, __VA_ARGS__ ) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from 'in0' & 'in1' are copied to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \ + ( v16i8 ) in1, ( v16i8 ) in0 ); \ + out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \ + ( v16i8 ) in3, ( v16i8 ) in2 ); \ +} +#define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ ) +#define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ ) + +/* Description : Shuffle halfword vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from 'in0' & 'in1' are copied to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0, \ + ( v8i16 ) in1, ( v8i16 ) in0 ); \ + out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1, \ + ( v8i16 ) in3, ( v8i16 ) in2 ); \ +} +#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ ) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \ +} +#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ ) + +#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3 ) \ +{ \ + DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ + DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ +} +#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ ) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0, \ + ( v16i8 ) mult0, ( v16i8 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1, \ + ( v16i8 ) mult1, ( v16i8 ) cnst1 ); \ +} +#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ ) + +#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \ +{ \ + DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ + DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ +} +#define DPADD_SB4_SH( ... 
) DPADD_SB4( v8i16, __VA_ARGS__ ) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0, \ + ( v8i16 ) mult0, ( v8i16 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1, \ + ( v8i16 ) mult1, ( v8i16 ) cnst1 ); \ +} +#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ ) + +/* Description : Clips all halfword elements of input vector between min & max + out = (in < min) ? min : ((in > max) ? max : in) + Arguments : Inputs - in, min, max + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH( in, min, max ) \ +( { \ + v8i16 out_m; \ + \ + out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in ); \ + out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \ + out_m; \ +} ) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255( in ) \ +( { \ + v8i16 max_m = __msa_ldi_h( 255 ); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 ); \ + out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \ + out_m; \ +} ) +#define CLIP_SH2_0_255( in0, in1 ) \ +{ \ + in0 = CLIP_SH_0_255( in0 ); \ + in1 = CLIP_SH_0_255( in1 ); \ +} +#define CLIP_SH4_0_255( in0, in1, in2, in3 ) \ +{ \ + CLIP_SH2_0_255( in0, in1 ); \ + CLIP_SH2_0_255( in2, in3 ); \ +} + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32( in ) \ +( { \ + v2i64 res0_m, res1_m; \ + int32_t i_sum_m; \ + \ + res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \ + res1_m = __msa_splati_d( res0_m, 1 ); \ + res0_m = res0_m + res1_m; \ + i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 ); \ + i_sum_m; \ +} ) + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_UH_U32( in ) \ +( { \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t u_sum_m; \ + \ + res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in ); \ + res0_m = __msa_hadd_u_d( res_m, res_m ); \ + res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \ + res0_m = res0_m + res1_m; \ + u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 ); \ + u_sum_m; \ +} ) + +/* Description : Horizontal addition of signed byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd byte element from 'in0' is added to + even signed byte element from 'in0' (pairwise) and the + halfword result is written in 'out0' +*/ +#define HADD_SB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) 
in0 ); \ + out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \ +} +#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HADD_SB2( RTYPE, in0, in1, out0, out1 ); \ + HADD_SB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ ) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ + out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ +} +#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ ) + +#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HADD_UB2( RTYPE, in0, in1, out0, out1 ); \ + HADD_UB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ ) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ + out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ +} +#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ ) + +#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HSUB_UB2( RTYPE, in0, in1, out0, out1 ); \ + HSUB_UB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ ) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. +*/ +#define SAD_UB2_UH( in0, in1, ref0, ref1 ) \ +( { \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 ); \ + diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 ); \ + \ + sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \ + sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \ + \ + sad_m; \ +} ) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) + Output - out (output vector) + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2( RTYPE, in0, in1, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ +} +#define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ ) + +#define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \ +} +#define INSERT_W4_UB( ... 
) INSERT_W4( v16u8, __VA_ARGS__ ) +#define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ ) + +#define INSERT_D2( RTYPE, in0, in1, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \ +} +#define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \ + out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \ +} +#define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \ + out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \ +} +#define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ ) +#define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ ) + +#define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ ) +#define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ ) +#define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ ) +#define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ ) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ ) +#define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ ) + +#define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ ) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. 
+*/ +#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ +} +#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ ) +#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ ) +#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ ) + +#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ ) +#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ ) +#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ ) +#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ ) +#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ ) + +#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ ) +#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ ) + +#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ +} +#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) ); \ + out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) ); \ +} +#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ ) +#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ ) +#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ ) + +#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_D4_UB( ... 
) ILVR_D4( v16u8, __VA_ARGS__ ) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ +} +#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ ) +#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ ) +#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ ) +#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ ) +#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ ) + +#define ILVRL_H2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ +} +#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ ) +#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ ) + +#define ILVRL_W2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ +} +#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ ) +#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ ) + +/* Description : Maximum values between signed elements of vector and + 5-bit signed immediate value are copied to the output vector + Arguments : Inputs - in0, in1, in2, in3, max_val + Outputs - in place operation + Return Type - unsigned halfword + Details : Maximum of signed halfword element values from 'in0' and + 'max_val' are written in place +*/ +#define MAXI_SH2( RTYPE, in0, in1, max_val ) \ +{ \ + in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \ + in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \ +} +#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ ) +#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ ) + +#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \ +{ \ + MAXI_SH2( RTYPE, in0, in1, max_val ); \ + MAXI_SH2( RTYPE, in2, in3, max_val ); \ +} +#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ ) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range. + The results are written in place +*/ +#define SAT_UH2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \ +} +#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ ) + +#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \ +{ \ + SAT_UH2( RTYPE, in0, in1, sat_val ); \ + SAT_UH2( RTYPE, in2, in3, sat_val ) \ +} +#define SAT_UH4_UH( ... 
) SAT_UH4( v8u16, __VA_ARGS__ ) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + The results are written in place +*/ +#define SAT_SH2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \ +} +#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ ) + +#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \ +{ \ + SAT_SH2( RTYPE, in0, in1, sat_val ); \ + SAT_SH2( RTYPE, in2, in3, sat_val ); \ +} +#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ ) + +/* Description : Saturate the word element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned word element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + The results are written in place +*/ +#define SAT_SW2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \ +} +#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ ) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ ) +#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ ) +#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ ) +#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ ) + +#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \ +{ \ + PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 ); \ +} +#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ ) + +#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ ) +#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ ) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define PCKEV_H2_SH( ... 
) PCKEV_H2( v8i16, __VA_ARGS__ ) + +#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ ) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ +} +#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ ) + +#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ ) + +/* Description : Pack odd byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Odd byte elements of 'in0' are copied to the left half of + 'out0' & odd byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ ) + +#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ ) + +/* Description : Pack odd double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Odd double word elements of 'in0' are copied to the left half + of 'out0' & odd double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ +} +#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ ) +#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ ) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128( RTYPE, in0, in1 ) \ +{ \ + in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \ + in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \ +} +#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ ) +#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ ) + +#define XORI_B3_128( RTYPE, in0, in1, in2 ) \ +{ \ + XORI_B2_128( RTYPE, in0, in1 ); \ + in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \ +} +#define XORI_B3_128_SB( ... 
) XORI_B3_128( v16i8, __VA_ARGS__ ) + +#define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \ +{ \ + XORI_B2_128( RTYPE, in0, in1 ); \ + XORI_B2_128( RTYPE, in2, in3 ); \ +} +#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ ) +#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ ) + +#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \ +{ \ + XORI_B3_128( RTYPE, in0, in1, in2 ); \ + XORI_B2_128( RTYPE, in3, in4 ); \ +} +#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ ) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ ) + +#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ ) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V( in0, in1, in2, in3, shift ) \ +{ \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V( in0, in1, in2, in3, shift ) \ +{ \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ +} + +/* Description : Shift right arithmetic rounded halfwords + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetic by + number of bits respective element holds in vector 'shift'. + The last discarded bit is added to shifted value for rounding + and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_H2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ + in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ +} +#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ ) + +#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRAR_H2( RTYPE, in0, in1, shift ) \ + SRAR_H2( RTYPE, in2, in3, shift ) \ +} +#define SRAR_H4_SH( ... 
) SRAR_H4( v8i16, __VA_ARGS__ ) + +/* Description : Shift right logical all halfword elements of vector + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right logical by + number of bits respective element holds in vector 'shift' and + the result is stored in-place.'shift' is a vector. +*/ +#define SRL_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ + in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ + in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \ + in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \ +} +#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ ) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetic by + value in 'shift'. The last discarded bit is added to shifted + value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \ + in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \ +} +#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ ) +#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ ) + +#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRARI_H2( RTYPE, in0, in1, shift ); \ + SRARI_H2( RTYPE, in2, in3, shift ); \ +} +#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ ) +#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ ) + +#define SRARI_W2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \ + in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \ +} +#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ ) + +#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRARI_W2( RTYPE, in0, in1, shift ); \ + SRARI_W2( RTYPE, in2, in3, shift ); \ +} +#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ ) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2( in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + MUL2( in0, in1, in2, in3, out0, out1 ); \ + MUL2( in4, in5, in6, in7, out2, out3 ); \ +} + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. 
+*/ +#define ADD2( in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ADD2( in0, in1, in2, in3, out0, out1 ); \ + ADD2( in4, in5, in6, in7, out2, out3 ); \ +} + +#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ +} + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW( in, out ) \ +{ \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ + out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in ); \ +} + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH( in, out0, out1 ) \ +{ \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH( zero_m, in, out0, out1 ); \ +} + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW( in, out0, out1 ) \ +{ \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ + ILVRL_H2_SW( tmp_m, in, out0, out1 ); \ +} + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. 
out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ +} + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m ); \ + ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m ); \ + ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m ); \ + ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 ); \ + ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 ); \ + SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 ); \ + SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 ); \ +} +#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ ) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 ); \ + ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 ); \ + ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 ); \ + ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 ); \ + \ + tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ + tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ + tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ + tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ + out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ + tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ + out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ + tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ + \ + ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m ); \ + out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ + tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \ + out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m ); \ + out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \ +
tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \ + out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ +} + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m ); \ + ILVRL_W2_SH( s1_m, s0_m, out0, out2 ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 ); \ +} + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6, \ + tmp0_n, tmp1_n, tmp2_n, tmp3_n ); \ + ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m ); \ + ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m ); \ + \ + out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ + out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ +} + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ + ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m ); \ + ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 ); \ + ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 ); \ +} + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m ); \ + ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m ); \ + ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m ); \ + ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m ); \ + PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ + tmp3_m, tmp7_m, out0, out2, out4, out6 ); \ + out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \ + out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \ + out5 = ( RTYPE ) __msa_pckod_d( 
( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \ + out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \ +} +#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ ) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW( in1, in0, s0_m, s1_m ); \ + ILVRL_W2_SW( in3, in2, s2_m, s3_m ); \ + \ + out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ + out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ + out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ + out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ +} + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. +*/ +#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m ) \ + LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m ); \ + INSERT_W2_SB( src0_m, src1_m, dst0_m ); \ + INSERT_W2_SB( src2_m, src3_m, dst1_m ); \ + ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m ); \ + ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m ); \ + CLIP_SH2_0_255( res0_m, res1_m ); \ + PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m ); \ + \ + out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 ); \ + out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 ); \ + out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 ); \ + out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 ); \ + SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ +} + +/* Description : Dot product and addition of 3 signed halfword input vectors + Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 + Output - out0_m + Return Type - signed halfword + Details : Dot product of 'in0' with 'coeff0' + Dot product of 'in1' with 'coeff1' + Dot product of 'in2' with 'coeff2' + Addition of all the 3 vector results + out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) +*/ +#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 ) \ +( { \ + v8i16 tmp1_m; \ + v8i16 out0_m; \ + \ + out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 ); \ + out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 ); \ + tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 ); \ + out0_m = __msa_adds_s_h( out0_m, tmp1_m ); \ + \ + out0_m; \ +} ) + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB( in0, in1 ) \ +( { \ + v16u8 out_m; \ + out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ + out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 ); \ + out_m; \ +} ) + +/* Description : Pack even byte elements, extract 0 & 2 index words from pair + of results and store 4 words in destination memory as per 
+ stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride +*/ +#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v16i8 tmp0_m, tmp1_m; \ + \ + PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ + \ + out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 ); \ + out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 ); \ + out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 ); \ + out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 ); \ + \ + SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ +} + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB( in0, in1, p_dst ) \ +{ \ + v16i8 tmp_m; \ + tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ + ST_SB( tmp_m, ( p_dst ) ); \ +} + +#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 ) \ +( { \ + v4i32 tmp0_m, tmp1_m; \ + v8i16 out0_m, out1_m, out2_m, out3_m; \ + v8i16 minus5h_m = __msa_ldi_h( -5 ); \ + v8i16 plus20h_m = __msa_ldi_h( 20 ); \ + \ + ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m ); \ + \ + tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m ); \ + tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m ); \ + \ + ILVRL_H2_SH( in1, in4, out0_m, out1_m ); \ + DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m ); \ + ILVRL_H2_SH( in2, in3, out2_m, out3_m ); \ + DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m ); \ + \ + SRARI_W2_SW( tmp0_m, tmp1_m, 10 ); \ + SAT_SW2_SW( tmp0_m, tmp1_m, 7 ); \ + out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ + \ + out0_m; \ +} ) + +#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 ) \ +( { \ + v8i16 out0_m, out1_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 minus5b = __msa_ldi_b( -5 ); \ + v16i8 plus20b = __msa_ldi_b( 20 ); \ + \ + tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in ); \ + out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m ); \ + \ + tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in ); \ + out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m ); \ + \ + tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in ); \ + out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m ); \ + \ + out1_m; \ +} ) + +#endif /* X264_MIPS_MACROS_H */
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc-c.c
Added
@@ -0,0 +1,3807 @@ +/***************************************************************************** + * mc-c.c: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "mc.h" + +#if !HIGH_BIT_DEPTH +static const uint8_t pu_luma_mask_arr[16 * 8] = +{ + /* 8 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, + 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + /* 4 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, + 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, + 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26 +}; + +static const uint8_t pu_chroma_mask_arr[16 * 5] = +{ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 +}; + +void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ); +void x264_memzero_aligned_msa( void *p_dst, size_t n ); + +void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + 
int32_t i_weight ); +void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); + +void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); + +weight_fn_t x264_mc_weight_wtab_msa[6] = +{ + x264_mc_weight_w4_msa, + x264_mc_weight_w4_msa, + x264_mc_weight_w8_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w20_msa, +}; + +void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ); +void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, + uint8_t *p_dstc, uint8_t *p_src, + intptr_t i_stride, int32_t i_width, + int32_t i_height, int16_t *p_buf ); + +void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, intptr_t i_src_stride0, + uint8_t *p_src1, intptr_t i_src_stride1, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, + uint8_t *p_dst1, intptr_t i_dst_stride1, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, + intptr_t i_dst_stride0, + uint8_t *p_dst1, + intptr_t i_dst_stride1, + uint8_t *p_dst2, + intptr_t i_dst_stride2, + uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_src_width, int32_t i_width, + int32_t i_height ); +void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, uint8_t *p_src1, + int32_t i_height ); +void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, + uint8_t *p_dst1, uint8_t 
*p_dst2, + uint8_t *p_dst3, intptr_t i_src_stride, + intptr_t i_dst_stride, int32_t i_width, + int32_t i_height ); + +static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b( -5 ); + v16i8 plus20b = __msa_ldi_b( 20 ); + + u_h4w = i_height % 4; + LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB2( p_src, 8, src0, src1 ); + p_src += i_src_stride; + LD_SB2( p_src, 8, src2, src3 ); + p_src += i_src_stride; + + XORI_B4_128_SB( src0, src1, src2, src3 ); + VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 ); + VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 ); + VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); + VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 ); + HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 ); + DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3 ); + DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3 ); + + LD_SB2( p_src, 8, src4, src5 ); + p_src += i_src_stride; + LD_SB2( p_src, 8, src6, src7 ); + p_src += i_src_stride; + + XORI_B4_128_SB( src4, src5, src6, src7 ); + VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 ); + VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 ); + VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 ); + VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 ); + HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 ); + DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res4, res5, res6, res7 ); + DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res4, res5, res6, res7 ); + SRARI_H4_SH( res0, res1, res2, res3, 5 ); + SRARI_H4_SH( res4, res5, res6, res7, 5 ); + SAT_SH4_SH( res0, res1, res2, res3, 7 ); + SAT_SH4_SH( res4, res5, res6, res7, 7 ); + PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, + vec0, vec1, vec2, vec3 ); + XORI_B4_128_SB( vec0, vec1, vec2, vec3 ); + + ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + LD_SB2( p_src, 8, src0, src1 ); + p_src += i_src_stride; + + XORI_B2_128_SB( src0, src1 ); + VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); + res0 = __msa_hadd_s_h( vec0, vec0 ); + DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 ); + res1 = __msa_hadd_s_h( vec3, vec3 ); + DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 ); + SRARI_H2_SH( res0, res1, 5 ); + SAT_SH2_SH( res0, res1, 7 ); + dst0 = PCKEV_XORI128_UB( res0, res1 ); + ST_UB( dst0, p_dst ); + p_dst += i_dst_stride; + } +} + +static void 
avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + const int16_t i_filt_const0 = 0xfb01; + const int16_t i_filt_const1 = 0x1414; + const int16_t i_filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16u8 res0, res1, res2, res3; + v16i8 filt0, filt1, filt2; + + u_h4w = i_height % 4; + filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 ); + filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 ); + filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 ); + + LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + p_src += ( 5 * i_src_stride ); + + XORI_B5_128_SB( src0, src1, src2, src3, src4 ); + ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r ); + ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 ); + p_src += ( 4 * i_src_stride ); + + XORI_B4_128_SB( src5, src6, src7, src8 ); + ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r ); + ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l ); + out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, + filt0, filt1, filt2 ); + out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r, + filt0, filt1, filt2 ); + out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r, + filt0, filt1, filt2 ); + out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r, + filt0, filt1, filt2 ); + out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, + filt0, filt1, filt2 ); + out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l, + filt0, filt1, filt2 ); + out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l, + filt0, filt1, filt2 ); + out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l, + filt0, filt1, filt2 ); + SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 ); + SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 ); + SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 ); + SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 ); + PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, res0, res1, res2, res3 ); + XORI_B4_128_UB( res0, res1, res2, res3 ); + + ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + src5 = LD_SB( p_src ); + p_src += ( i_src_stride ); + src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 ); + ILVRL_B2_SB( src5, src4, src54_r, src54_l ); + out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, + filt0, filt1, filt2 ); + out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, + filt0, filt1, filt2 ); + SRARI_H2_SH( out0_r, out0_l, 5 ); + SAT_SH2_SH( out0_r, out0_l, 7 ); + out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r ); + res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 ); + ST_UB( res0, p_dst ); + p_dst += i_dst_stride; + + src10_r = src21_r; + src21_r = src32_r; + src32_r = src43_r; + src43_r = src54_r; + + src10_l = src21_l; + src21_l = src32_l; 
+ src32_l = src43_l; + src43_l = src54_l; + + src4 = src5; + } +} + +static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + uint64_t u_out0; + v16i8 tmp0; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3; + v16u8 out0, out1; + + u_h4w = i_height % 4; + LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); + + LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + XORI_B5_128_SB( src0, src1, src2, src3, src4 ); + p_src += ( 5 * i_src_stride ); + + hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); + hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); + hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); + hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); + XORI_B4_128_SB( src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); + hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); + hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5 ); + dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6 ); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7 ); + dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8 ); + out0 = PCKEV_XORI128_UB( dst0, dst1 ); + out1 = PCKEV_XORI128_UB( dst2, dst3 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + + p_dst += ( 4 * i_dst_stride ); + hz_out3 = hz_out7; + hz_out1 = hz_out5; + hz_out5 = hz_out4; + hz_out4 = hz_out8; + hz_out2 = hz_out6; + hz_out0 = hz_out5; + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + src0 = LD_SB( p_src ); + p_src += i_src_stride; + + src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 ); + hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, + hz_out2, hz_out3, + hz_out4, hz_out5 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) ); + tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 ); + u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( u_out0, p_dst ); + p_dst += i_dst_stride; + + hz_out0 = hz_out1; + hz_out1 = hz_out2; + hz_out2 = hz_out3; + hz_out3 = hz_out4; + hz_out4 = hz_out5; + } +} + +static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_multiple8_cnt; + + for( u_multiple8_cnt = 2; u_multiple8_cnt--; ) + { + avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height ); + p_src += 8; + p_dst += 8; + } +} + +static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint16_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, 
res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + v8i16 res0, res1; + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src3, src4 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); + SRARI_H2_UH( res_vt0, res_vt2, 6 ); + SAT_UH2_UH( res_vt0, res_vt2, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out1, p_dst_u ); + + SH( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint16_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v8i16 res0, res1; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src2, src3, src3, src4, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); + DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, + res_hz1, res_hz2, res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out1, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out2, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out3, p_dst_u ); + + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, 
+ res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out1, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + if( 2 == i_height ) + { + avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } + else if( 4 == i_height ) + { + avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } +} + +static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint32_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + v4i32 res0, res1; + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src3, src4 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); + SRARI_H2_UH( res_vt0, res_vt2, 6 ); + SAT_UH2_UH( res_vt0, res_vt2, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); + + u_out0 = __msa_copy_u_w( res0, 0 ); + u_out1 = __msa_copy_u_w( res0, 1 ); + u_out2 = __msa_copy_u_w( res1, 0 ); + u_out3 = __msa_copy_u_w( res1, 1 ); + SW( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SW( u_out1, p_dst_u ); + SW( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SW( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + uint32_t u_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; 
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v4i32 res0, res1; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + src0 = LD_UB( p_src ); + p_src += i_src_stride; + + for( u_row = ( i_height >> 2 ); u_row--; ) + { + LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); + p_src += ( 4 * i_src_stride ); + + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src2, src3, src3, src4, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); + DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride ); + p_dst_u += ( 4 * i_dst_stride ); + + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride ); + p_dst_v += ( 4 * i_dst_stride ); + src0 = src4; + } +} + +static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + if( 2 == i_height ) + { + avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } + else + { + avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1, + i_height ); + } +} + +static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + uint32_t u_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, src14; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 }; + v16i8 coeff_hz_vec0, coeff_hz_vec1; + v16i8 tmp0, tmp1; + v16u8 coeff_hz_vec; + v8u16 coeff_vt_vec0, coeff_vt_vec1; + + coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + 
coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + LD_UB2( p_src, 16, src0, src13 ); + p_src += i_src_stride; + + VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 ); + DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 ); + + for( u_row = ( i_height >> 2 ); u_row--; ) + { + LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); + LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 ); + p_src += ( 4 * i_src_stride ); + + VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 ); + VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 ); + DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4 ); + MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + + res_vt0 += ( res_hz0 * coeff_vt_vec1 ); + res_vt1 += ( res_hz1 * coeff_vt_vec1 ); + res_vt2 += ( res_hz2 * coeff_vt_vec1 ); + res_vt3 += ( res_hz3 * coeff_vt_vec1 ); + + SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); + SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); + PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); + ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride ); + p_dst_u += ( 4 * i_dst_stride ); + res_hz0 = res_hz4; + + VSHF_B2_UB( src1, src5, src2, src6, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src3, src7, src4, src8, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4 ); + MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + + res_vt0 += ( res_hz5 * coeff_vt_vec1 ); + res_vt1 += ( res_hz1 * coeff_vt_vec1 ); + res_vt2 += ( res_hz2 * coeff_vt_vec1 ); + res_vt3 += ( res_hz3 * coeff_vt_vec1 ); + + SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); + SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); + PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); + ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride ); + p_dst_v += ( 4 * i_dst_stride ); + res_hz5 = res_hz4; + } +} + +static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_log2_denom, int32_t i_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v16u8 zero = { 0 }; + v16u8 src0, src1; + v4i32 dst0, dst1; + v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1; + v8i16 vec0, vec1; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + u_load0 = LW( p_src ); + p_src += i_src_stride; + u_load1 = LW( p_src ); + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + + ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 ); + MUL2( wgt, temp0, wgt, temp1, temp0, temp1 ); + ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 ); + MAXI_SH2_SH( vec0, vec1, 0 ); + + tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom ); + tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom ); + + SAT_UH2_UH( tp0, tp1, 7 ); + PCKEV_B2_SW( tp0, tp0, tp1, 
tp1, dst0, dst1 ); + + u_out0 = __msa_copy_u_w( dst0, 0 ); + u_out1 = __msa_copy_u_w( dst1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 wgt, denom, offset; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src += 4 * i_src_stride; + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_log2_denom, i_weight, i_offset_in ); + } + else + { + avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_weight, i_offset_in ); + } +} + +static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 wgt, denom, offset; + v16i8 out0, out1; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void 
avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16i8 zero = { 0 }; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8u16 wgt, denom, offset; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp2, temp4, temp6 ); + ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp1, temp3, temp5, temp7 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7, + temp4, temp5, temp6, temp7 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset, + temp4, temp5, temp6, temp7 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SRL_H4_UH( temp4, temp5, temp6, temp7, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 ); + PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3 ); + + ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v8i16 src1_wgt, src2_wgt; + v16u8 in0, in1, in2, in3; + v8i16 temp0, temp1, temp2, temp3; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + u_load0 = LW( p_src1_in ); + u_load1 = LW( p_src1_in + i_src1_stride ); + in0 = ( v16u8 ) __msa_fill_w( u_load0 ); + in1 = ( v16u8 ) __msa_fill_w( u_load1 ); + u_load0 = LW( p_src2_in ); + u_load1 = LW( p_src2_in + i_src2_stride ); + in2 = ( v16u8 ) __msa_fill_w( u_load0 ); + in3 = ( v16u8 ) __msa_fill_w( u_load1 ); + ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3, + temp0, temp1, temp2, temp3 ); + temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt ); + temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt ); + SRAR_H2_SH( temp0, temp1, denom ); + CLIP_SH2_0_255( temp0, temp1 ); + PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 ); + u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t 
u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src1_in += ( 4 * i_src1_stride ); + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src2_in += ( 4 * i_src2_stride ); + src4 = ( v16u8 ) __msa_fill_w( u_load0 ); + src5 = ( v16u8 ) __msa_fill_w( u_load1 ); + src6 = ( v16u8 ) __msa_fill_w( u_load2 ); + src7 = ( v16u8 ) __msa_fill_w( u_load3 ); + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, + temp4, temp5, temp6, temp7 ); + temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt ); + temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt ); + temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt ); + temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt ); + SRAR_H4_SH( temp0, temp1, temp2, temp3, denom ); + CLIP_SH4_0_255( temp0, temp1, temp2, temp3 ); + PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_log2_denom, i_src1_weight, + i_src2_weight, i_offset_in ); + } + else + { + avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_src1_weight, i_src2_weight, + i_offset_in ); + } +} + +static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3; + v8i16 res0, res1, res2, res3; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); + p_src2_in += ( 4 * i_src2_stride ); + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3, + res0, res1, res2, res3 ); + res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); + res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); + res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); + res3 = ( temp3 
* src1_wgt ) + ( res3 * src2_wgt ); + SRAR_H4_SH( res0, res1, res2, res3, denom ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3, + dst0, dst1, dst2, dst3 ); + ST8x1_UB( dst0, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst1, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst2, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst3, p_dst ); + p_dst += i_dst_stride; + } +} + +static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); + p_src2_in += ( 4 * i_src2_stride ); + ILVRL_B2_SH( zero, src0, temp1, temp0 ); + ILVRL_B2_SH( zero, src1, temp3, temp2 ); + ILVRL_B2_SH( zero, src2, temp5, temp4 ); + ILVRL_B2_SH( zero, src3, temp7, temp6 ); + ILVRL_B2_SH( zero, dst0, res1, res0 ); + ILVRL_B2_SH( zero, dst1, res3, res2 ); + ILVRL_B2_SH( zero, dst2, res5, res4 ); + ILVRL_B2_SH( zero, dst3, res7, res6 ); + res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); + res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); + res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); + res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt ); + res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt ); + res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt ); + res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt ); + res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt ); + SRAR_H4_SH( res0, res1, res2, res3, denom ); + SRAR_H4_SH( res4, res5, res6, res7, denom ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + CLIP_SH4_0_255( res4, res5, res6, res7 ); + PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7, + dst0, dst1, dst2, dst3 ); + ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v16u8 src1_wgt, src2_wgt, wgt; + v16i8 in0, in1, in2, in3; + v8u16 temp0, temp1, denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + u_load0 = LW( p_src1_in ); + u_load1 = LW( p_src1_in + i_src1_stride ); + in0 = ( v16i8 ) __msa_fill_w( u_load0 ); + in1 = ( v16i8 ) __msa_fill_w( u_load1 ); + + u_load0 = LW( p_src2_in ); + u_load1 = LW( p_src2_in + i_src2_stride ); + in2 = ( v16i8 ) __msa_fill_w( u_load0 ); + 
in3 = ( v16i8 ) __msa_fill_w( u_load1 ); + + ILVR_B2_SB( in2, in0, in3, in1, in0, in1 ); + + temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 ); + temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 ); + temp0 >>= denom; + temp1 >>= denom; + MAXI_SH2_UH( temp0, temp1, 0 ); + SAT_UH2_UH( temp0, temp1, 7 ); + PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 ); + + u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3; + v8u16 res0, res1, res2, res3; + v8u16 denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src1_in += ( 4 * i_src1_stride ); + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + + LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src2_in += ( 4 * i_src2_stride ); + + src4 = ( v16u8 ) __msa_fill_w( u_load0 ); + src5 = ( v16u8 ) __msa_fill_w( u_load1 ); + src6 = ( v16u8 ) __msa_fill_w( u_load2 ); + src7 = ( v16u8 ) __msa_fill_w( u_load3 ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp1, temp2, temp3 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + SRA_4V( res0, res1, res2, res3, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_log2_denom, i_src1_weight, + i_src2_weight, i_offset_in ); + } + else + { + avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_src1_weight, + i_src2_weight, i_offset_in ); + } +} + + +static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, 
+ int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3; + v8u16 res0, res1, res2, res3; + v8u16 denom, offset; + v16i8 out0, out1; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + + LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); + p_src2_in += ( 4 * i_src2_stride ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp1, temp2, temp3 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + SRA_4V( res0, res1, res2, res3, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7; + v8u16 denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + + LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); + p_src2_in += ( 4 * i_src2_stride ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp2, temp4, temp6 ); + ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp1, temp3, temp5, temp7 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt, + res4, res5, res6, res7 ); + ADD4( res4, offset, res5, offset, res6, offset, res7, offset, + res4, res5, res6, res7 ); + SRA_4V( res0, res1, res2, res3, denom ); + SRA_4V( res4, res5, res6, res7, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + MAXI_SH4_UH( res4, res5, res6, res7, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + SAT_UH4_UH( res4, res5, res6, res7, 7 ); + PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6, + temp0, temp1, temp2, temp3 ); + ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + 
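+/* Descriptive note on the plain block-copy helpers that follow (wording added
+ * for readability, not upstream): copy_width4_msa moves two rows per
+ * iteration; copy_width8_msa and copy_width16_msa pick the largest of
+ * 12/8/4(/2) rows that divides i_height; copy_16multx8mult_msa copies any
+ * multiple-of-16 width eight rows at a time. */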
+static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint32_t u_src0, u_src1; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + u_src0 = LW( p_src ); + p_src += i_src_stride; + u_src1 = LW( p_src ); + p_src += i_src_stride; + + SW( u_src0, p_dst ); + p_dst += i_dst_stride; + SW( u_src1, p_dst ); + p_dst += i_dst_stride; + } +} + +static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if( 0 == i_height % 12 ) + { + for( i_cnt = ( i_height / 12 ); i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); + u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); + u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); + u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 8 ) + { + for( i_cnt = i_height >> 3; i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); + u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); + u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); + u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 4 ) + { + for( i_cnt = ( i_height / 4 ); i_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 2 ) + { + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + + SD( u_out0, p_dst ); + p_dst += i_dst_stride; + SD( u_out1, p_dst ); + p_dst += i_dst_stride; + } + } +} + + +static void copy_16multx8mult_msa( uint8_t *p_src, 
int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_width ) +{ + int32_t i_cnt, i_loop_cnt; + uint8_t *p_src_tmp, *p_dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for( i_cnt = ( i_width >> 4 ); i_cnt--; ) + { + p_src_tmp = p_src; + p_dst_tmp = p_dst; + + for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; ) + { + LD_UB8( p_src_tmp, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src_tmp += ( 8 * i_src_stride ); + + ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, + p_dst_tmp, i_dst_stride ); + p_dst_tmp += ( 8 * i_dst_stride ); + } + + p_src += 16; + p_dst += 16; + } +} + +static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if( 0 == i_height % 12 ) + { + for( i_cnt = ( i_height / 12 ); i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, + p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 8 ) + { + copy_16multx8mult_msa( p_src, i_src_stride, + p_dst, i_dst_stride, i_height, 16 ); + } + else if( 0 == i_height % 4 ) + { + for( i_cnt = ( i_height >> 2 ); i_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } +} + +static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint32_t u_out0, u_out1; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + LD_UB2( p_src1, i_src1_stride, src0, src1 ); + p_src1 += ( 2 * i_src1_stride ); + LD_UB2( p_src2, i_src2_stride, src2, src3 ); + p_src2 += ( 2 * i_src2_stride ); + + AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 ); + + u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); + p_dst += i_dst_stride; + } +} + +static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint64_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3; + + for( i_cnt = ( i_height / 4 ); i_cnt--; ) + { + LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 ); + p_src1 += ( 4 * i_src1_stride ); + LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 ); + p_src2 += ( 4 * i_src2_stride ); + + AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7, + dst0, dst1, dst2, dst3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avg_src_width16_msa( uint8_t *p_src1, int32_t 
i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for( i_cnt = ( i_height / 8 ); i_cnt--; ) + { + LD_UB8( p_src1, i_src1_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src1 += ( 8 * i_src1_stride ); + LD_UB8( p_src2, i_src2_stride, + dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); + p_src2 += ( 8 * i_src2_stride ); + + AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3 ); + AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7 ); + + ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, + p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + } +} + +static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride, + int32_t i_height ) +{ + int8_t i_cnt; + v16u8 zero = { 0 }; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + ST_UB( zero, p_src ); + p_src += i_stride; + ST_UB( zero, p_src ); + p_src += i_stride; + } +} + +static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride, + uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3; + v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3; + + i_w_mul8 = i_width - i_width % 8; + i_h4w = i_height - i_height % 4; + + for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; ) + { + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 ); + LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); + ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 ); + ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, + p_dst, i_dst_stride ); + ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3, + ( p_dst + 16 ), i_dst_stride ); + p_src0 += 16; + p_src1 += 16; + p_dst += 32; + } + + for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) + { + LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 ); + LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); + ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, + p_dst, i_dst_stride ); + p_src0 += 8; + p_src1 += 8; + p_dst += 16; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst[0] = p_src0[0]; + p_dst[1] = p_src1[0]; + p_dst[i_dst_stride] = p_src0[i_src0_stride]; + p_dst[i_dst_stride + 1] = p_src1[i_src1_stride]; + p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride]; + p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride]; + p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride]; + p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride]; + p_src0 += 1; + p_src1 += 1; + p_dst += 2; + } + + p_src0 += ( ( 4 * i_src0_stride ) - i_width ); + p_src1 += ( ( 4 * i_src1_stride ) - i_width ); + p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) ); + } + + for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) + { + for( i_loop_width = ( i_width >> 4 ); 
i_loop_width--; ) + { + src0 = LD_UB( p_src0 ); + src4 = LD_UB( p_src1 ); + ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 ); + ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 ); + p_src0 += 16; + p_src1 += 16; + p_dst += 32; + } + + for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) + { + src0 = LD_UB( p_src0 ); + src4 = LD_UB( p_src1 ); + vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4, + ( v16i8 ) src0 ); + ST_UB( vec_ilv_r0, p_dst ); + p_src0 += 8; + p_src1 += 8; + p_dst += 16; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst[0] = p_src0[0]; + p_dst[1] = p_src1[0]; + p_src0 += 1; + p_src1 += 1; + p_dst += 2; + } + + p_src0 += ( i_src0_stride - i_width ); + p_src1 += ( i_src1_stride - i_width ); + p_dst += ( i_dst_stride - ( i_width * 2 ) ); + } +} + +static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst0, int32_t dst0_stride, + uint8_t *p_dst1, int32_t dst1_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w; + uint32_t u_res_w0, u_res_w1; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3; + v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3; + uint8_t *p_dst; + + i_w_mul8 = i_width - i_width % 8; + i_w_mul4 = i_width - i_width % 4; + i_h4w = i_height - i_height % 8; + + for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; ) + { + for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) + { + LD_UB8( p_src, i_src_stride, + in0, in1, in2, in3, in4, in5, in6, in7 ); + p_src += 16; + PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); + PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); + ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride ); + p_dst = p_dst0 + 4 * dst0_stride; + ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride ); + ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride ); + p_dst = p_dst1 + 4 * dst1_stride; + ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride ); + p_dst0 += 8; + p_dst1 += 8; + } + + for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) + { + LD_UB8( p_src, i_src_stride, + in0, in1, in2, in3, in4, in5, in6, in7 ); + p_src += 8; + PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); + PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); + ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride ); + p_dst = p_dst0 + 4 * dst0_stride; + ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride ); + ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride ); + p_dst = p_dst1 + 4 * dst1_stride; + ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride ); + p_dst0 += 4; + p_dst1 += 4; + } + + for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0[0] = p_src[0]; + p_dst1[0] = p_src[1]; + p_dst0[dst0_stride] = p_src[i_src_stride]; + p_dst1[dst1_stride] = p_src[i_src_stride + 1]; + p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride]; + p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1]; + p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride]; + p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1]; + p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride]; + p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1]; + p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride]; + p_dst1[5 * 
dst1_stride] = p_src[5 * i_src_stride + 1]; + p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride]; + p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1]; + p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride]; + p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1]; + p_dst0 += 1; + p_dst1 += 1; + p_src += 2; + } + + p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) ); + p_dst0 += ( ( 8 * dst0_stride ) - i_width ); + p_dst1 += ( ( 8 * dst1_stride ) - i_width ); + } + + for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) + { + for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) + { + in0 = LD_UB( p_src ); + p_src += 16; + vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + ST8x1_UB( vec_pckev0, p_dst0 ); + ST8x1_UB( vec_pckod0, p_dst1 ); + p_dst0 += 8; + p_dst1 += 8; + } + + for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) + { + in0 = LD_UB( p_src ); + p_src += 8; + vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 ); + SW( u_res_w0, p_dst0 ); + u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 ); + SW( u_res_w1, p_dst1 ); + p_dst0 += 4; + p_dst1 += 4; + } + + for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0[0] = p_src[0]; + p_dst1[0] = p_src[1]; + p_dst0 += 1; + p_dst1 += 1; + p_src += 2; + } + + p_src += ( ( i_src_stride ) - ( i_width << 1 ) ); + p_dst0 += ( ( dst0_stride ) - i_width ); + p_dst1 += ( ( dst1_stride ) - i_width ); + } +} + + +static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst0, + int32_t i_dst0_stride, + uint8_t *p_dst1, + int32_t i_dst1_stride, + uint8_t *p_dst2, + int32_t i_dst2_stride, + int32_t i_width, + int32_t i_height ) +{ + uint8_t *p_src_orig = p_src; + uint8_t *p_dst0_orig = p_dst0; + uint8_t *p_dst1_orig = p_dst1; + uint8_t *p_dst2_orig = p_dst2; + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; + v16i8 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 temp0, temp1, temp2, temp3; + v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 }; + + i_w_mul8 = i_width - i_width % 8; + i_h_mul4 = i_height - i_height % 4; + + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); + + VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 ); + ST8x1_UB( temp0, p_dst0 ); + ST8x1_UB( temp1, p_dst0 + i_dst0_stride ); + ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride ); + ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride ); + + VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 ); + ST8x1_UB( temp0, p_dst1 ); + ST8x1_UB( temp1, p_dst1 + i_dst1_stride ); + ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride ); + ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride ); + + VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, 
mask2, mask2, temp2, temp3 ); + ST8x1_UB( temp0, p_dst2 ); + ST8x1_UB( temp1, p_dst2 + i_dst2_stride ); + ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride ); + ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride ); + + p_src += 8 * 3; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width]; + p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + i_dst0_stride] = + p_src_orig[0 + i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + i_dst1_stride] = + p_src_orig[1 + i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + i_dst2_stride] = + p_src_orig[2 + i_src_stride + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + 2 * i_dst0_stride] = + p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + 2 * i_dst1_stride] = + p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + 2 * i_dst2_stride] = + p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + 3 * i_dst0_stride] = + p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + 3 * i_dst1_stride] = + p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + 3 * i_dst2_stride] = + p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width]; + } + + p_src_orig += ( 4 * i_src_stride ); + p_dst0_orig += ( 4 * i_dst0_stride ); + p_dst1_orig += ( 4 * i_dst1_stride ); + p_dst2_orig += ( 4 * i_dst2_stride ); + } + + for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) + { + in0 = LD_SB( p_src ); + in4 = LD_SB( p_src + 16 ); + temp0 = __msa_vshf_b( mask0, in4, in0 ); + ST8x1_UB( temp0, p_dst0 ); + temp0 = __msa_vshf_b( mask1, in4, in0 ); + ST8x1_UB( temp0, p_dst1 ); + temp0 = __msa_vshf_b( mask2, in4, in0 ); + ST8x1_UB( temp0, p_dst2 ); + + p_src += 8 * 3; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2]; + } + + p_src_orig += ( i_src_stride ); + p_dst0_orig += ( i_dst0_stride ); + p_dst1_orig += ( i_dst1_stride ); + p_dst2_orig += ( i_dst2_stride ); + } +} + +static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst0, + int32_t i_dst0_stride, + uint8_t *p_dst1, + int32_t i_dst1_stride, + uint8_t *p_dst2, + int32_t i_dst2_stride, + int32_t i_width, + int32_t i_height ) +{ + uint8_t *p_src_orig = p_src; + uint8_t *p_dst0_orig = p_dst0; + uint8_t *p_dst1_orig = p_dst1; + uint8_t *p_dst2_orig = p_dst2; + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; + v16i8 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; + + i_w_mul8 = i_width - i_width % 8; + i_h_mul4 = i_height - i_height % 4; + + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = 
p_dst2_orig; + + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); + LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 ); + LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 ); + + PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); + PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 ); + temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); + temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 ); + PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 ); + temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); + temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 ); + PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 ); + temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); + temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); + PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 ); + in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 ); + PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 ); + in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 ); + PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 ); + in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 ); + ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride ); + ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride ); + ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride ); + + p_src += 16 * 4; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + } + + for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 ); + + PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); + + PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 ); + temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); + temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); + + PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); + PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 ); + in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 ); + PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 ); + in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 ); + PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 ); + in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 ); + + ST8x1_UB( in0, p_dst0 ); + ST8x1_UB( in4, p_dst0 + i_dst0_stride ); + ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride ); + ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride ); + + ST8x1_UB( in1, p_dst2 ); + ST8x1_UB( in5, p_dst2 + i_dst2_stride ); + ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride ); + ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride ); + + ST8x1_UB( in2, p_dst1 ); + ST8x1_UB( in6, p_dst1 + i_dst1_stride ); + ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride ); + ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride ); + + p_src += 8 * 4; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; + + 
p_dst0_orig[i_dst0_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width]; + p_dst1_orig[i_dst1_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[i_dst2_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width + 2]; + + p_dst0_orig[2 * i_dst0_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width]; + p_dst1_orig[2 * i_dst1_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[2 * i_dst2_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2]; + + p_dst0_orig[3 * i_dst0_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width]; + p_dst1_orig[3 * i_dst1_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[3 * i_dst2_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2]; + } + + p_src_orig += ( 4 * i_src_stride ); + p_dst0_orig += ( 4 * i_dst0_stride ); + p_dst1_orig += ( 4 * i_dst1_stride ); + p_dst2_orig += ( 4 * i_dst2_stride ); + } + + for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_SB4( p_src, 16, in0, in4, in8, in12 ); + + PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); + ST_SB( in0, p_dst0 ); + ST_SB( in0, p_dst0 ); + ST_SB( in1, p_dst2 ); + ST_SB( in1, p_dst2 ); + ST_SB( in2, p_dst1 ); + ST_SB( in2, p_dst1 ); + + p_src += 16 * 4; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + } + + for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) + { + in0 = LD_SB( p_src ); + in4 = LD_SB( p_src + 16 ); + + temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); + ST8x1_UB( in0, p_dst0 ); + ST8x1_UB( in1, p_dst2 ); + ST8x1_UB( in2, p_dst1 ); + + p_src += 8 * 4; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; + } + + p_src_orig += ( i_src_stride ); + p_dst0_orig += ( i_dst0_stride ); + p_dst1_orig += ( i_dst1_stride ); + p_dst2_orig += ( i_dst2_stride ); + } +} + +static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride, + uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_loop_height, i_h4w; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3; + + i_h4w = i_height % 4; + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 ); + p_src0 += ( 4 * i_src0_stride ); + LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 ); + p_src1 += ( 4 * i_src1_stride ); + ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3, + ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 ); + ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3, + p_dst, 
i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + + for( i_loop_height = i_h4w; i_loop_height--; ) + { + in0 = LD_UB( p_src0 ); + p_src0 += ( i_src0_stride ); + in1 = LD_UB( p_src1 ); + p_src1 += ( i_src1_stride ); + ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 ); + ST_UB( ilvr_vec0, p_dst ); + p_dst += ( i_dst_stride ); + } +} + +static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst0, int32_t dst0_stride, + uint8_t *p_dst1, int32_t dst1_stride, + uint8_t *p_dst2, int32_t dst2_stride, + uint8_t *p_dst3, int32_t dst3_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w16_mul; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5; + v16u8 pckev_vec0, pckev_vec1, pckev_vec2; + v16u8 pckod_vec0, pckod_vec1, pckod_vec2; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 res0, res1; + + i_w16_mul = i_width - i_width % 16; + for( i_loop_height = i_height; i_loop_height--; ) + { + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + p_src += 16; + for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ ) + { + LD_UB3( p_src, i_src_stride, src3, src4, src5 ); + p_src += 16; + LD_UB3( p_src, i_src_stride, src6, src7, src8 ); + p_src += 16; + PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); + PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); + pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST_UB( res0, p_dst0 ); + ST_UB( res1, p_dst2 ); + + SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); + SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 ); + SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 ); + PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, + pckev_vec0, pckev_vec1 ) + pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, + ( v16i8 ) sld1_vec2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST_UB( res0, p_dst1 ); + ST_UB( res1, p_dst3 ); + + src0 = src6; + src1 = src7; + src2 = src8; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + p_dst3 += 16; + } + + for( i_loop_width = i_w16_mul; i_loop_width < i_width; + i_loop_width += 8 ) + { + LD_UB3( p_src, i_src_stride, src3, src4, src5 ); + p_src += 16; + PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); + PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); + pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST8x1_UB( res0, p_dst0 ); + ST8x1_UB( res1, p_dst2 ); + + SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); + SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 ); + SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 ); + PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, + pckev_vec0, 
pckev_vec1 ) + pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, + ( v16i8 ) sld1_vec2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST8x1_UB( res0, p_dst1 ); + ST8x1_UB( res1, p_dst3 ); + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + p_dst3 += 8; + } + + p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) ); + p_dst0 += ( dst0_stride - i_width ); + p_dst1 += ( dst1_stride - i_width ); + p_dst2 += ( dst2_stride - i_width ); + p_dst3 += ( dst3_stride - i_width ); + } +} + +void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ) +{ + copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ) +{ + copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ) +{ + copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + 
else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 4 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 4 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 2 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, 
pix1_stride, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + + +void x264_memzero_aligned_msa( void *p_dst, size_t n ) +{ + uint32_t u_tot32_mul_lines = n >> 5; + uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 ); + + memset_zero_16width_msa( p_dst, 16, ( n / 16 ) ); + + if( u_remaining ) + { + memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining ); + } +} + +void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride, + pWeight, i_height ); + x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride, + pWeight, i_height ); +} + +void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx&3 ) ); + + if( 16 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + else if( 8 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + else if( 4 == i_width ) + { + avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + + if( pWeight->weightfn ) + { + if( 16 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, i_dst_stride, + p_dst, i_dst_stride, + pWeight, i_height ); + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, + pWeight, i_height ); + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, + pWeight, i_height ); + } + } + } + else if( pWeight->weightfn ) + { + if( 16 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + else if( 
8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + } + else + { + if( 16 == i_width ) + { + copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + else if( 8 == i_width ) + { + copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + else if( 4 == i_width ) + { + copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + } +} + +void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ) +{ + int32_t i_d8x = m_vx & 0x07; + int32_t i_d8y = m_vy & 0x07; + int32_t i_coeff_horiz1 = ( 8 - i_d8x ); + int32_t i_coeff_vert1 = ( 8 - i_d8y ); + int32_t i_coeff_horiz0 = i_d8x; + int32_t i_coeff_vert0 = i_d8y; + + p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2; + + if( 2 == i_width ) + { + avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } + else if( 4 == i_width ) + { + avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } + else if( 8 == i_width ) + { + avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } +} + +void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, + uint8_t *p_dstc, uint8_t *p_src, + intptr_t i_stride, int32_t i_width, + int32_t i_height, int16_t *p_buf ) +{ + for( int32_t i = 0; i < ( i_width / 16 ); i++ ) + { + avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride, + p_dst_v - 2, i_stride, i_height ); + avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ) , i_stride, + p_dstc, i_stride, i_height ); + avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height ); + + p_src += 16; + p_dst_v += 16; + p_dsth += 16; + p_dstc += 16; + } +} + +void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, intptr_t i_src_stride0, + uint8_t *p_src1, intptr_t i_src_stride1, + int32_t i_width, int32_t i_height ) +{ + plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1, + p_dst, i_dst_stride, i_width, i_height ); +} + +void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, + uint8_t *p_dst1, intptr_t i_dst_stride1, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_width, int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, i_width, i_height ); +} + +void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, + intptr_t i_dst_stride0, + uint8_t *p_dst1, + intptr_t i_dst_stride1, + uint8_t *p_dst2, + intptr_t i_dst_stride2, + uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_src_width, + int32_t i_width, + int32_t i_height ) +{ + if( 3 == i_src_width ) + { + plane_copy_deinterleave_rgb_msa( p_src, i_src_stride, + p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, + p_dst2, i_dst_stride2, + i_width, i_height ); + } + else if( 4 == i_src_width ) + { + plane_copy_deinterleave_rgba_msa( p_src, i_src_stride, + p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, + p_dst2, 
i_dst_stride2, + i_width, i_height ); + } +} + +void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, uint8_t *p_src1, + int32_t i_height ) +{ + store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE, + p_dst, i_dst_stride, i_height ); +} + +void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE, + ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE, + 8, i_height ); +} + +void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE, + ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE, + 8, i_height ); +} + +void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, + uint8_t *p_dst1, uint8_t *p_dst2, + uint8_t *p_dst3, intptr_t i_src_stride, + intptr_t i_dst_stride, int32_t i_width, + int32_t i_height ) +{ + frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride, + p_dst1, i_dst_stride, p_dst2, i_dst_stride, + p_dst3, i_dst_stride, i_width, i_height ); +} + +uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx, i_cnt, i_h4w; + int32_t i_offset; + uint8_t *p_src1, *src1_org; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + i_h4w = i_height - i_height%4; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx & 3 ) ); + + if( 16 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + avg_src_width4_msa( p_src1 + 16, i_src_stride, + p_src2 + 16, i_src_stride, + p_dst + 16, *p_dst_stride, i_h4w ); + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16u8 src_vec1, src_vec2, src_vec3, src_vec4; + v16u8 dst_vec0, dst_vec1; + uint32_t temp0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 ); + src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 ); + + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 ); + + ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 12 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + avg_src_width4_msa( p_src1 + 8, i_src_stride, + p_src2 + 8, i_src_stride, + p_dst + 8, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + 
uint32_t temp0; + uint64_t dst0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 ); + + SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 ); + } + } + else if( 8 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint64_t dst0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); + + SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + avg_src_width4_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 ); + + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + + if( pWeight->weightfn ) + { + int32_t i_log2_denom; + int32_t i_offset_val; + int32_t i_weight; + + i_log2_denom = pWeight->i_denom; + i_offset_val = pWeight->i_offset; + i_weight = pWeight->i_scale; + + if( 16 == i_width || 12 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt, offset_val0; + v8i16 denom; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = + ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + ( v8i16 ) offset_val0 ); + temp_vec1 = + ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + ( v8i16 ) offset_val0 ); + + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + x264_mc_weight_w20_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + 
i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, + denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ ) + { + uint64_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val = i_offset_val << i_log2_denom; + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + temp0 = LW( p_dst + 
i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, + denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + } + + return p_dst; + } + else if( pWeight->weightfn ) + { + int32_t i_offset_val, i_log2_denom, i_weight; + + i_log2_denom = pWeight->i_denom; + i_offset_val = pWeight->i_offset; + i_weight = pWeight->i_scale; + + i_h4w = i_height - i_height%4; + + src1_org = p_src1; + + if( 16 == i_width || 12 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + p_src1 += i_src_stride; + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + temp0 = LW( p_src1 + 16 ); + p_src1 += i_src_stride; + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = ( v8u16 ) 
__msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0,p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ ) + { + uint64_t u_temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val = i_offset_val << i_log2_denom; + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + p_src1 += i_src_stride; + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t u_temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + u_temp0 = LW( p_src1 ); + p_src1 += i_src_stride; + + src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + + return p_dst; + } + else + { + *p_dst_stride = i_src_stride; + return p_src1; + } +} + +void x264_mc_init_mips( int32_t 
cpu, x264_mc_functions_t *pf ) +{ + if( cpu & X264_CPU_MSA ) + { + pf->mc_luma = x264_mc_luma_msa; + pf->mc_chroma = x264_mc_chroma_msa; + pf->get_ref = x264_get_ref_msa; + + pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa; + + pf->weight = x264_mc_weight_wtab_msa; + pf->offsetadd = x264_mc_weight_wtab_msa; + pf->offsetsub = x264_mc_weight_wtab_msa; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_msa; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa; + + pf->store_interleave_chroma = x264_store_interleave_chroma_msa; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa; + + pf->plane_copy_interleave = x264_plane_copy_interleave_msa; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa; + + pf->hpel_filter = x264_hpel_filter_msa; + + pf->memcpy_aligned = memcpy; + pf->memzero_aligned = x264_memzero_aligned_msa; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa; + } +} +#endif
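Note on the weighting arithmetic above: the x264_mc_weight_w* wrappers and the tail loops of x264_get_ref_msa all vectorize the same per-pixel operation with __msa_fill_h, __msa_adds_s_h, __msa_srl_h and __msa_sat_u_h. The scalar sketch below is for reference only and is not part of the patch; the helper names weight_pixel and avg_pixel are hypothetical, scale/offset/denom stand for the i_scale, i_offset and i_denom fields of x264_weight_t, 8-bit samples are assumed (hence the clamp to 255, matching __msa_sat_u_h( ..., 7 )), and the 16-bit saturation details of the vector path are ignored.

#include <stdint.h>

/* Hypothetical scalar model of the vector weighting path: the offset is
 * pre-shifted by the denominator and a rounding term is added, the weighted
 * sample is clamped at zero before the right shift, and the result is
 * saturated to 8 bits. */
static uint8_t weight_pixel( uint8_t pix, int32_t scale, int32_t offset,
                             int32_t denom )
{
    int32_t off = ( offset << denom ) + ( denom ? ( 1 << ( denom - 1 ) ) : 0 );
    int32_t val = pix * scale + off;

    if( val < 0 )
        val = 0;
    val >>= denom;
    return ( val > 255 ) ? 255 : ( uint8_t )val;
}

/* Hypothetical scalar model of __msa_aver_u_b, used by the x264_pixel_avg_*
 * wrappers when i_weight == 32: a rounded average of the two references. */
static uint8_t avg_pixel( uint8_t a, uint8_t b )
{
    return ( uint8_t )( ( a + b + 1 ) >> 1 );
}

With denom equal to zero the rounding term vanishes and weight_pixel reduces to pix * scale + offset clamped to the 0..255 range.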
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc.h
Added
@@ -0,0 +1,31 @@ +/***************************************************************************** + * mc.h: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MC_H +#define X264_MIPS_MC_H + +void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf ); + +#endif
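The pixel-c.c file added below builds its SAD primitives from __msa_asub_u_b (per-byte absolute difference) followed by __msa_hadd_u_h and HADD_UH_U32 (horizontal accumulation). As a reference point only, a scalar equivalent of one block SAD might look like the sketch below; the name sad_block_c and its parameters are illustrative and do not come from the patch.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar counterpart of the sad_*width_msa routines: sum of
 * absolute differences over a width x height block, walking the source and
 * reference buffers with their own strides. */
static uint32_t sad_block_c( const uint8_t *src, int32_t src_stride,
                             const uint8_t *ref, int32_t ref_stride,
                             int32_t width, int32_t height )
{
    uint32_t sad = 0;

    for( int32_t y = 0; y < height; y++ )
    {
        for( int32_t x = 0; x < width; x++ )
            sad += abs( src[x] - ref[x] );
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}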
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel-c.c
Added
@@ -0,0 +1,1491 @@ +/***************************************************************************** + * pixel-c.c: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "pixel.h" +#include "predict.h" + +#if !HIGH_BIT_DEPTH +#define CALC_MSE_B( src, ref, var ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ +} + +#define CALC_MSE_AVG_B( src, ref, var, sub ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ + \ + sub += res_l0_m + res_l1_m; \ +} + +#define VARIANCE_WxH( sse, diff, shift ) \ + ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) ) + +static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); + p_src += ( 4 * i_src_stride ); + LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); + p_ref += ( 4 * i_ref_stride ); + + INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); + INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); + + diff = __msa_asub_u_b( src, ref ); + sad += __msa_hadd_u_h( diff, diff ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( 
HADD_UH_U32( sad ) ); +} + +static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src = { 0 }; + uint32_t src0, src1, src2, src3, load0, load1, load2, load3; + v16u8 ref0 = { 0 }; + v16u8 ref1 = { 0 }; + v16u8 ref2 = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, src0, src1, src2, src3 ); + INSERT_W4_UB( src0, src1, src2, src3, src ); + p_src += ( 4 * i_src_stride ); + + LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref0 ); + p_ref0 += ( 4 * i_ref_stride ); + + LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref1 ); + p_ref1 += ( 4 * i_ref_stride ); + + LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref2 ); + p_ref2 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref0 += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22, + src0, src1, ref0, ref1 ); + sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref1 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); + sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref2 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); + sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_16width_x3d_msa( uint8_t *p_src, int32_t 
i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src, ref; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + + ref = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + + ref = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + int32_t i_ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, src0, src1, src2, src3 ); + INSERT_W4_UB( src0, src1, src2, src3, src ); + p_src += ( 4 * i_src_stride ); + + LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref0 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref1 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref2 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref3 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad3 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = 
{ 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref0 += ( 4 * i_ref_stride ); + LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 ); + p_ref1 += ( 4 * i_ref_stride ); + LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 ); + p_ref2 += ( 4 * i_ref_stride ); + LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 ); + p_ref3 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 ); + PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 ); + sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 ); + sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 ); + sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 ); + sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + ref0 = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + ref1 = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + ref2 = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + ref3 = LD_UB( p_ref3 ); + p_ref3 += i_ref_stride; + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref3 ); + sad3 += __msa_hadd_u_h( diff, diff ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref0 = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + ref1 = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + ref2 = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + ref3 = LD_UB( p_ref3 ); + p_ref3 += i_ref_stride; + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref3 ); + sad3 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride, + uint8_t i_height ) +{ + uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; + v16i8 pix, zero = { 0 }; + v8u16 add, pix_r, pix_l; + v4u32 sqr = { 0 }; + + for ( u_cnt = i_height; u_cnt--; ) + { + pix = LD_SB( p_pix ); + p_pix += i_stride; + add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix ); + u_sum += HADD_UH_U32( add ); + ILVRL_B2_UH( zero, pix, pix_r, pix_l ); + sqr = __msa_dpadd_u_w( 
sqr, pix_r, pix_r ); + sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l ); + } + + u_sqr_out = HADD_SW_S32( sqr ); + + return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); +} + +static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride, + uint8_t i_height ) +{ + uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; + v16i8 pix, zero = { 0 }; + v8u16 add, pix_r; + v4u32 sqr = { 0 }; + + for ( u_cnt = i_height; u_cnt--; ) + { + pix = LD_SB( p_pix ); + p_pix += i_stride; + pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix ); + add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r ); + u_sum += HADD_UH_U32( add ); + sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r ); + } + + u_sqr_out = HADD_SW_S32( sqr ); + + return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); +} + +static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height, int32_t *p_diff ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + CALC_MSE_AVG_B( src0, ref0, var, avg ); + CALC_MSE_AVG_B( src1, ref1, var, avg ); + } + + vec = __msa_hadd_s_w( avg, avg ); + *p_diff = HADD_SW_S32( vec ); + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + uint32_t u_src0, u_src1, u_src2, u_src3; + uint32_t u_ref0, u_ref1, u_ref2, u_ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); + p_src += ( 4 * i_src_stride ); + LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); + p_ref += ( 4 * i_ref_stride ); + + INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); + INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); + CALC_MSE_B( src, ref, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + CALC_MSE_B( src0, ref0, var ); + CALC_MSE_B( src1, ref1, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src, ref; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( 
p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride, + const uint8_t *p_ref, int32_t i_ref_stride, + int32_t pi_sum_array[2][4] ) +{ + v16i8 zero = { 0 }; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0; + v4i32 tmp2, tmp3; + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + ILVR_D2_UB( src1, src0, src3, src2, src0, src2 ); + ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 ); + HADD_UB2_UH( src0, src2, temp0, temp1 ); + + temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + + pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 ); + pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 ); + + HADD_UB2_UH( ref0, ref2, temp0, temp1 ); + + temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + + pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 ); + pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 ); + + ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2, + vec4, vec6 ); + ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3, + vec5, vec7 ); + + tmp0 = __msa_dotp_u_w( vec0, vec0 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 ); + + tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); + tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); + + pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 ); + pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 ); + + tmp0 = __msa_dotp_u_w( vec4, vec0 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 ); + + tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); + tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); + + pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 ); + pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 ); +} + +static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + uint8_t i_height ) +{ + int32_t cnt; + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3; + v16i8 ref0, ref1, ref2, ref3; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3; + v8i16 temp0, temp1, temp2, temp3; + + for ( cnt = i_height >> 2; cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, 
src2, src3 ); + p_src += 4 * i_src_stride; + LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += 4 * i_ref_stride; + + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, + diff0, diff1, diff2, diff3 ); + HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff1, diff2, diff3 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff1, diff2, diff3 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + + diff0 = __msa_add_a_h( diff0, zero ); + diff1 = __msa_add_a_h( diff1, zero ); + diff2 = __msa_add_a_h( diff2, zero ); + diff3 = __msa_add_a_h( diff3, zero ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 ); + diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 ); + u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 ); + } + + return ( u_sum >> 1 ); +} + +static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + uint8_t i_height ) +{ + int32_t cnt; + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3; + v16i8 ref0, ref1, ref2, ref3; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 temp0, temp1, temp2, temp3; + + for ( cnt = i_height >> 2; cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += 4 * i_ref_stride; + + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, + diff0, diff1, diff2, diff3 ); + HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); + TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff2, diff4, diff6 ); + + diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 ); + diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 ); + diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 ); + diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 ); + + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, + diff7, diff0, diff1, diff2, diff3, diff4, diff5, + diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + + diff0 = __msa_add_a_h( diff0, zero ); + diff1 = __msa_add_a_h( diff1, zero ); + diff2 = __msa_add_a_h( diff2, zero ); + diff3 = __msa_add_a_h( diff3, zero ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + u_sum += HADD_UH_U32( diff0 ); + } + + return ( u_sum >> 1 ); +} + +static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride ) +{ + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; + v8i16 temp0, temp1, temp2, temp3; + + LD_SB8( p_src, 
i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); + LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 ); + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1, + sub2, sub3 ); + ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5, + sub6, sub7 ); + HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 ); + HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 ); + TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); + BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 ); + BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + temp0 = diff0 + diff4; + temp1 = diff1 + diff5; + temp2 = diff2 + diff6; + temp3 = diff3 + diff7; + + temp0 = __msa_add_a_h( temp0, zero ); + temp1 = __msa_add_a_h( temp1, zero ); + temp2 = __msa_add_a_h( temp2, zero ); + temp3 = __msa_add_a_h( temp3, zero ); + + diff0 = temp0 + __msa_asub_s_h( diff0, diff4 ); + diff1 = temp1 + __msa_asub_s_h( diff1, diff5 ); + diff2 = temp2 + __msa_asub_s_h( diff2, diff6 ); + diff3 = temp3 + __msa_asub_s_h( diff3, diff7 ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + + u_sum = HADD_UH_U32( diff0 ); + + return u_sum; +} + +static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride ) +{ + int16_t tmp0, tmp1, tmp2, tmp3; + uint32_t u_sum4 = 0, u_sum8 = 0, u_dc; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; + v8i16 temp0, temp1, temp2, temp3; + + LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); + + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1, + diff2, diff3 ); + ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5, + diff6, diff7 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, + temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, + diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, + temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, + diff4, diff5, diff7, diff6 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + tmp0 = 
diff0[0]; + tmp1 = diff0[4]; + tmp2 = diff4[0]; + tmp3 = diff4[4]; + + sub0 = __msa_add_a_h( diff0, zero ); + sub1 = __msa_add_a_h( diff1, zero ); + sub2 = __msa_add_a_h( diff2, zero ); + sub3 = __msa_add_a_h( diff3, zero ); + sub4 = __msa_add_a_h( diff4, zero ); + sub5 = __msa_add_a_h( diff5, zero ); + sub6 = __msa_add_a_h( diff6, zero ); + sub7 = __msa_add_a_h( diff7, zero ); + + sub0 = ( sub0 + sub1 + sub2 + sub3 ); + sub1 = ( sub4 + sub5 + sub6 + sub7 ); + sub0 += sub1; + + u_sum4 += HADD_UH_U32( sub0 ); + + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, + sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); + + ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 ); + ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 ); + + diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 ); + diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 ); + diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 ); + diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 ); + + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + sub0 = __msa_add_a_h( diff0, zero ); + sub1 = __msa_add_a_h( diff1, zero ); + sub2 = __msa_add_a_h( diff2, zero ); + sub3 = __msa_add_a_h( diff3, zero ); + sub4 = __msa_add_a_h( diff4, zero ); + sub5 = __msa_add_a_h( diff5, zero ); + sub6 = __msa_add_a_h( diff6, zero ); + sub7 = __msa_add_a_h( diff7, zero ); + + sub0 = ( sub0 + sub1 + sub2 + sub3 ); + sub1 = ( sub4 + sub5 + sub6 + sub7 ); + sub0 += sub1; + + u_sum8 += HADD_UH_U32( sub0 ); + + u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 ); + u_sum4 = u_sum4 - u_dc; + u_sum8 = u_sum8 - u_dc; + + return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4; +} + +int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t 
*p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 16, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 16, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t 
i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 4, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 4, ( uint32_t * ) p_sad_array ); +} + +int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_4x4_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_4x4_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_4x4_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_16x16_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_16x16_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_16x16_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_intra_predict_v_8x8_msa( pix, p_edge ); + p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, 
FENC_STRIDE ); + + x264_intra_predict_h_8x8_msa( pix, p_edge ); + p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_8x8_msa( pix, p_edge ); + p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_dc_4blk_8x8_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_8x8_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_vert_8x8_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, + const uint8_t *p_pix2, intptr_t i_stride2, + int32_t i_sums[2][4] ) +{ + ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums ); +} + +uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); +} + +int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); +} + +int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); +} + +int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); +} + +int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); +} + +int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); +} + +int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + 
uint32_t u32Sum = 0; + + u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); + u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2, 8 ); + + return u32Sum; +} + +int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + uint32_t u32Sum = 0; + + u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); + u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2, 16 ); + + return u32Sum; +} + +int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ); + + return ( i32Sum + 2 ) >> 2; +} + +int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride, + p_pix2 + 8 * i_stride2, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride, + p_pix2 + 8 + 8 * i_stride2, i_stride2 ); + + return ( i32Sum + 2 ) >> 2; +} + +void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_4x4_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_4x4_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_4x4_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_16x16_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_16x16_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_16x16_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_intra_predict_v_8x8_msa( pix, p_edge ); + p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_h_8x8_msa( pix, p_edge ); + p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_8x8_msa( pix, p_edge ); + p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_dc_4blk_8x8_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_8x8_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_vert_8x8_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return avc_pixel_var16width_msa( p_pix, i_stride, 16 ); +} + +uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return 
avc_pixel_var8width_msa( p_pix, i_stride, 16 ); +} + +uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return avc_pixel_var8width_msa( p_pix, i_stride, 8 ); +} + +int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ) +{ + int32_t i_var = 0, i_diff = 0, i_sqr = 0; + + i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16, + &i_diff ); + i_var = VARIANCE_WxH( i_sqr, i_diff, 7 ); + *p_ssd = i_sqr; + + return i_var; +} + +int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ) +{ + int32_t i_var = 0, i_diff = 0, i_sqr = 0; + + i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, + p_pix2, i_stride2, 8, &i_diff ); + i_var = VARIANCE_WxH( i_sqr, i_diff, 6 ); + *p_ssd = i_sqr; + + return i_var; +} +#endif
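The MSA kernels in the file above are vector versions of x264's plain-C pixel metrics: the SAD routines accumulate absolute differences over a block, the _x3d/_x4d variants score one source block against three or four reference candidates in a single pass, and the var routines pack two 32-bit partial results into one 64-bit return value. The following scalar sketch restates those two conventions for reference only; it is not part of the snapshot, and the function names (sad_ref, var_ref) are illustrative.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the SAD kernels: sum of absolute differences over a
 * width x height block; the _x3d/_x4d variants above simply run this against
 * three or four candidate references that share one source block. */
static uint32_t sad_ref( const uint8_t *p_src, intptr_t i_src_stride,
                         const uint8_t *p_ref, intptr_t i_ref_stride,
                         int i_width, int i_height )
{
    uint32_t u_sad = 0;
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
            u_sad += abs( p_src[x] - p_ref[x] );
        p_src += i_src_stride;
        p_ref += i_ref_stride;
    }
    return u_sad;
}

/* Scalar reference for the var kernels: the pixel sum goes in the low 32 bits
 * and the sum of squares in the high 32 bits, matching the return value of
 * avc_pixel_var16width_msa / avc_pixel_var8width_msa above. */
static uint64_t var_ref( const uint8_t *p_pix, intptr_t i_stride,
                         int i_width, int i_height )
{
    uint32_t u_sum = 0, u_sqr = 0;
    for( int y = 0; y < i_height; y++, p_pix += i_stride )
        for( int x = 0; x < i_width; x++ )
        {
            u_sum += p_pix[x];
            u_sqr += p_pix[x] * p_pix[x];
        }
    return u_sum + ( ( uint64_t ) u_sqr << 32 );
}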
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel.h
Added
@@ -0,0 +1,170 @@ +/***************************************************************************** + * pixel.h: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_SAD_H +#define X264_MIPS_SAD_H + +int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t 
*p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, + const uint8_t *p_pix2, intptr_t i_stride2, + int32_t i_sums[2][4] ); +uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t 
x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); +int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); + +#endif
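The x3/x4 prototypes above take the source block with an implicit stride (the wrappers in pixel.c pass FENC_STRIDE) and several candidate reference pointers that share a single i_ref_stride, writing one SAD per candidate into the output array. A minimal usage sketch follows, assuming the declarations above are in scope; score_four_candidates and its buffer arguments are hypothetical names, not part of the snapshot.

#include <stdint.h>

/* Declaration as given in pixel.h above. */
void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] );

/* Hypothetical helper: score one encode-ordered 16x16 block against four
 * candidate positions in a single call. p_fenc is assumed to be laid out
 * with x264's fixed encode-block stride (FENC_STRIDE, 16 bytes per row in
 * the 8-bit build); all four candidates share i_ref_stride. */
static void score_four_candidates( uint8_t *p_fenc, uint8_t *p_cand[4],
                                   intptr_t i_ref_stride, int32_t p_sad[4] )
{
    /* Each p_sad[i] receives the 16x16 SAD of p_fenc against p_cand[i]. */
    x264_pixel_sad_x4_16x16_msa( p_fenc, p_cand[0], p_cand[1], p_cand[2],
                                 p_cand[3], i_ref_stride, p_sad );
}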
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict-c.c
Added
@@ -0,0 +1,607 @@ +/***************************************************************************** + * predict-c.c: msa intra prediction + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_src_data; + + u_src_data = LW( p_src ); + + SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint64_t u_out; + + u_out = LD( p_src ); + + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + v16u8 src0 = LD_UB( p_src ); + + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); +} + +static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint32_t u_out0, u_out1, u_out2, u_out3; + + u_out0 = p_src[0 * i_src_stride] * 0x01010101; + u_out1 = p_src[1 * i_src_stride] * 0x01010101; + u_out2 = p_src[2 * i_src_stride] * 0x01010101; + u_out3 = p_src[3 * i_src_stride] * 0x01010101; + + SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; + + u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull; + u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull; + u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull; + u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull; + u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull; + u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull; + u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull; + u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull; + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_16x16_msa( 
uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_row; + uint8_t u_inp0, u_inp1, u_inp2, u_inp3; + v16u8 src0, src1, src2, src3; + + for ( u_row = 4; u_row--; ) + { + u_inp0 = p_src[0]; + p_src += i_src_stride; + u_inp1 = p_src[0]; + p_src += i_src_stride; + u_inp2 = p_src[0]; + p_src += i_src_stride; + u_inp3 = p_src[0]; + p_src += i_src_stride; + + src0 = ( v16u8 ) __msa_fill_b( u_inp0 ); + src1 = ( v16u8 ) __msa_fill_b( u_inp1 ); + src2 = ( v16u8 ) __msa_fill_b( u_inp2 ); + src3 = ( v16u8 ) __msa_fill_b( u_inp3 ); + + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left, + int32_t i_src_stride_left, + uint8_t *p_dst, int32_t i_dst_stride, + uint8_t is_above, uint8_t is_left ) +{ + uint32_t u_row; + uint32_t u_out, u_addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum; + + if ( is_left && is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 4 ) >> 3; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_left ) + { + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 2 ) >> 2; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 ); + store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); + } + else + { + store = ( v16u8 ) __msa_ldi_b( 128 ); + } + + u_out = __msa_copy_u_w( ( v4i32 ) store, 0 ); + + SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_val0, u_val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + u_val0 = LD( p_src_top ); + u_val1 = LD( p_src_left ); + INSERT_D2_UB( u_val0, u_val1, src ); + sum_h = __msa_hadd_u_h( src, src ); + sum_w = __msa_hadd_u_w( sum_h, sum_h ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 ); + store = __msa_splati_b( ( v16i8 ) sum_w, 0 ); + u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 ); + + SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); +} + +static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left, + int32_t i_src_stride_left, + uint8_t *p_dst, int32_t i_dst_stride, + uint8_t is_above, uint8_t is_left ) +{ + uint32_t u_row; + uint32_t u_addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if ( is_left && is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum_top = __msa_hadd_u_w( sum_above, sum_above ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); + 
sum = __msa_hadd_u_d( sum_top, sum_top ); + u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + + for ( u_row = 0; u_row < 16; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 16 ) >> 5; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_left ) + { + for ( u_row = 0; u_row < 16; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 8 ) >> 4; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum_top = __msa_hadd_u_w( sum_above, sum_above ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 ); + store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); + } + else + { + store = ( v16u8 ) __msa_ldi_b( 128 ); + } + + ST_UB8( store, store, store, store, store, store, store, store, p_dst, + i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( store, store, store, store, store, store, store, store, p_dst, + i_dst_stride ); +} + +static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lpcnt; + int32_t i_res, i_res0, i_res1, i_res2, i_res3; + uint64_t u_out0, u_out1; + v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 }; + v4i32 int_multiplier = { 0, 1, 2, 3 }; + v16u8 p_src_top; + v8i16 vec9, vec10, vec11; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8; + v2i64 sum; + + p_src_top = LD_UB( p_src - ( i_stride + 1 ) ); + p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, + ( v16i8 ) p_src_top ); + + vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w( vec9, vec9 ); + sum = __msa_hadd_s_d( vec8, vec8 ); + + i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 ); + + i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) + + 2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) + + 3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) + + 4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] ); + + i_res0 *= 17; + i_res1 *= 17; + i_res0 = ( i_res0 + 16 ) >> 5; + i_res1 = ( i_res1 + 16 ) >> 5; + + i_res3 = 3 * ( i_res0 + i_res1 ); + i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 ); + i_res = i_res2 - i_res3; + + vec8 = __msa_fill_w( i_res0 ); + vec4 = __msa_fill_w( i_res ); + vec2 = __msa_fill_w( i_res1 ); + vec5 = vec8 * int_multiplier; + vec3 = vec8 * 4; + + for ( u_lpcnt = 4; u_lpcnt--; ) + { + vec0 = vec5; + vec0 += vec4; + vec1 = vec0 + vec3; + vec6 = vec5; + vec4 += vec2; + vec6 += vec4; + vec7 = vec6 + vec3; + + SRA_4V( vec0, vec1, vec6, vec7, 5 ); + PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 ); + CLIP_SH2_0_255( vec10, vec11 ); + PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 ); + + u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 ); + u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 ); + SD( u_out0, p_src ); + p_src += i_stride; + SD( u_out1, p_src ); + p_src += i_stride; + + vec4 += vec2; + } +} + +static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lpcnt; + int32_t i_res0, i_res1, i_res2, i_res3; + uint64_t u_load0, u_load1; + v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v4i32 
int_multiplier = { 0, 1, 2, 3 }; + v16u8 p_src_top = { 0 }; + v8i16 vec9, vec10; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add; + + u_load0 = LD( p_src - ( i_stride + 1 ) ); + u_load1 = LD( p_src - ( i_stride + 1 ) + 9 ); + + INSERT_D2_UB( u_load0, u_load1, p_src_top ); + + p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, + ( v16i8 ) p_src_top ); + + vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w( vec9, vec9 ); + res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 ); + + i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 ); + + i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) + + 2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) + + 3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) + + 4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) + + 5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) + + 6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) + + 7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) + + 8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] ); + + i_res0 *= 5; + i_res1 *= 5; + i_res0 = ( i_res0 + 32 ) >> 6; + i_res1 = ( i_res1 + 32 ) >> 6; + + i_res3 = 7 * ( i_res0 + i_res1 ); + i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 ); + i_res2 -= i_res3; + + vec8 = __msa_fill_w( i_res0 ); + vec4 = __msa_fill_w( i_res2 ); + vec5 = __msa_fill_w( i_res1 ); + vec6 = vec8 * 4; + vec7 = vec8 * int_multiplier; + + for ( u_lpcnt = 16; u_lpcnt--; ) + { + vec0 = vec7; + vec0 += vec4; + vec1 = vec0 + vec6; + vec2 = vec1 + vec6; + vec3 = vec2 + vec6; + + SRA_4V( vec0, vec1, vec2, vec3, 5 ); + PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 ); + CLIP_SH2_0_255( vec9, vec10 ); + PCKEV_ST_SB( vec9, vec10, p_src ); + p_src += i_stride; + + vec4 += vec5; + } +} + +static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lp_cnt; + uint32_t u_src0, u_src1, u_src3, u_src2 = 0; + uint32_t u_out0, u_out1, u_out2, u_out3; + v16u8 p_src_top; + v8u16 add; + v4u32 sum; + + p_src_top = LD_UB( p_src - i_stride ); + add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top ); + sum = __msa_hadd_u_w( add, add ); + u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 ); + + for ( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ ) + { + u_src0 += p_src[u_lp_cnt * i_stride - 1]; + u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1]; + } + + u_src0 = ( u_src0 + 4 ) >> 3; + u_src3 = ( u_src1 + u_src2 + 4 ) >> 3; + u_src1 = ( u_src1 + 2 ) >> 2; + u_src2 = ( u_src2 + 2 ) >> 2; + + u_out0 = u_src0 * 0x01010101; + u_out1 = u_src1 * 0x01010101; + u_out2 = u_src2 * 0x01010101; + u_out3 = u_src3 * 0x01010101; + + for ( u_lp_cnt = 4; u_lp_cnt--; ) + { + SW( u_out0, p_src ); + SW( u_out1, ( p_src + 4 ) ); + SW( u_out2, ( p_src + 4 * i_stride ) ); + SW( u_out3, ( p_src + 4 * i_stride + 4 ) ); + p_src += i_stride; + } +} + +static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint8_t u_src_val = p_src[15]; + uint64_t u_out0, u_out1, u_out2, u_out3; + v16u8 src, vec4, vec5, res0; + v8u16 vec0, vec1, vec2, vec3; + v2i64 res1, res2, res3; + + src = LD_UB( p_src ); + + vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 ); + vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 ); + vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val ); + ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 ); + ILVL_B2_UH( vec5, src, vec4, 
vec4, vec2, vec3 ); + HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 ); + + vec0 += vec1; + vec2 += vec3; + vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 ); + vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 ); + + res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 ); + res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); + res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); + res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); + u_out1 = __msa_copy_u_d( res1, 0 ); + u_out2 = __msa_copy_u_d( res2, 0 ); + u_out3 = __msa_copy_u_d( res3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 ); + res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); + res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); + res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); + u_out1 = __msa_copy_u_d( res1, 0 ); + u_out2 = __msa_copy_u_d( res2, 0 ); + u_out3 = __msa_copy_u_d( res3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); +} + +static void intra_predict_128dc_16x16_msa( uint8_t *p_dst, + int32_t i_dst_stride ) +{ + v16u8 out = ( v16u8 ) __msa_ldi_b( 128 ); + + ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); +} + +void x264_intra_predict_dc_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); +} + +void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 ); +} + +void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 ); +} + +void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src ) +{ + intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_hor_16x16_msa( uint8_t *p_src ) +{ + intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_16x16_msa( uint8_t *p_src ) +{ + intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_plane_16x16_msa( uint8_t *p_src ) +{ + intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src ) +{ + intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_hor_8x8_msa( uint8_t *p_src ) +{ + intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_8x8_msa( uint8_t *p_src ) +{ + intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_plane_8x8_msa( uint8_t *p_src ) +{ + intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ), + p_src, FDEC_STRIDE ); +} + +void 
x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_4x4_msa( uint8_t *p_src ) +{ + intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); +} + +void x264_intra_predict_hor_4x4_msa( uint8_t *p_src ) +{ + intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_4x4_msa( uint8_t *p_src ) +{ + intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} +#endif
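The plane predictor added above implements the standard H.264 16x16 plane mode: it derives horizontal and vertical gradients from the top row and left column, scales them by 5 with rounding (>> 6), and fills the block with a clipped linear ramp. The scalar sketch below is added only for illustration and is not part of the patch; the local clip helper and the p_src/i_stride layout (top row at p_src[-i_stride], left column at p_src[-1]) follow the conventions visible in intra_predict_plane_16x16_msa above.

#include <stdint.h>

static inline uint8_t clip_u8( int32_t i_val )
{
    return i_val < 0 ? 0 : ( i_val > 255 ? 255 : (uint8_t)i_val );
}

/* Scalar reference for the arithmetic vectorized by intra_predict_plane_16x16_msa() */
static void intra_predict_plane_16x16_ref( uint8_t *p_src, int32_t i_stride )
{
    int32_t i_h = 0, i_v = 0;

    for( int32_t i = 1; i <= 8; i++ )
    {
        /* gradients of the top row and of the left column */
        i_h += i * ( p_src[-i_stride + 7 + i] - p_src[-i_stride + 7 - i] );
        i_v += i * ( p_src[( 7 + i ) * i_stride - 1] - p_src[( 7 - i ) * i_stride - 1] );
    }

    int32_t i_b = ( 5 * i_h + 32 ) >> 6;
    int32_t i_c = ( 5 * i_v + 32 ) >> 6;
    /* same constant term as i_res2 above: 16*(left_bottom + top_right + 1) - 7*(b + c) */
    int32_t i_a = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 )
                - 7 * ( i_b + i_c );

    for( int32_t y = 0; y < 16; y++ )
        for( int32_t x = 0; x < 16; x++ )
            p_src[y * i_stride + x] = clip_u8( ( i_a + i_b * x + i_c * y ) >> 5 );
}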
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict.h
Added
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * predict.h: msa intra prediction
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Rishikesh More <rishikesh.more@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_PREDICT_H
+#define X264_MIPS_PREDICT_H
+
+void x264_intra_predict_dc_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_hor_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_vert_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_plane_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_hor_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_vert_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_plane_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_dc_4x4_msa( uint8_t *p_src );
+void x264_intra_predict_hor_4x4_msa( uint8_t *p_src );
+void x264_intra_predict_vert_4x4_msa( uint8_t *p_src );
+
+#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant-c.c
Added
@@ -0,0 +1,630 @@ +/***************************************************************************** + * quant-c.c: msa quantization and level-run + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 4; + v8i16 dct0, dct1; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + + if ( q_bits >= 0 ) + { + v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_mf_h0, dequant_mf_h1 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct0 <<= q_bits_vec; + dct1 <<= q_bits_vec; + ST_SH2( dct0, dct1, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct0, dct1 ); + ST_SH2( dct0, dct1, p_dct, 8 ); + } +} + +static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 6; + v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7; + v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11; + v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15; + + LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 ); + + 
LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 ); + LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 ); + LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 ); + LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 ); + LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 ); + LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 ); + + if ( q_bits >= 0 ) + { + v8i16 q_bits_vec; + v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3; + v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6, + dequant_mf_h0, dequant_mf_h1, + dequant_mf_h2, dequant_mf_h3 ); + PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10, + dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14, + dequant_mf_h4, dequant_mf_h5, + dequant_mf_h6, dequant_mf_h7 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct2 *= dequant_mf_h2; + dct3 *= dequant_mf_h3; + dct4 *= dequant_mf_h4; + dct5 *= dequant_mf_h5; + dct6 *= dequant_mf_h6; + dct7 *= dequant_mf_h7; + + SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec ); + SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec ); + + ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; + v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11; + v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 ); + UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 ); + UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 ); + UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w4 *= dequant_m_f4; + dct_signed_w5 *= dequant_m_f5; + dct_signed_w6 *= dequant_m_f6; + dct_signed_w7 *= dequant_m_f7; + dct_signed_w8 *= dequant_m_f8; + dct_signed_w9 *= dequant_m_f9; + dct_signed_w10 *= dequant_m_f10; + dct_signed_w11 *= dequant_m_f11; + dct_signed_w12 *= dequant_m_f12; + dct_signed_w13 *= dequant_m_f13; + dct_signed_w14 *= dequant_m_f14; + dct_signed_w15 *= dequant_m_f15; + + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + dct_signed_w4 += q_bits_vec_add; + dct_signed_w5 += q_bits_vec_add; + dct_signed_w6 += q_bits_vec_add; + dct_signed_w7 += q_bits_vec_add; + dct_signed_w8 += q_bits_vec_add; + dct_signed_w9 += q_bits_vec_add; + dct_signed_w10 += q_bits_vec_add; + dct_signed_w11 += q_bits_vec_add; + dct_signed_w12 += q_bits_vec_add; + dct_signed_w13 += q_bits_vec_add; + dct_signed_w14 += q_bits_vec_add; + dct_signed_w15 += 
q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7, + q_bits_vec ); + SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11, + q_bits_vec ); + SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15, + q_bits_vec ); + PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6, + dct0, dct1, dct2, dct3 ); + PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11, + dct_signed_w10, dct_signed_w13, dct_signed_w12, + dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 ); + ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); + } +} + +static void avc_dequant_4x4_dc_msa( int16_t *p_dct, + int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + const int32_t q_bits = i_qp / 6 - 6; + int32_t i_dmf = pi_dequant_mf[i_qp % 6][0]; + v8i16 dct0, dct1, dequant_mf_h; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + if ( q_bits >= 0 ) + { + i_dmf <<= q_bits; + + dequant_mf_h = __msa_fill_h( i_dmf ); + dct0 = dct0 * dequant_mf_h; + dct1 = dct1 * dequant_mf_h; + + ST_SH2( dct0, dct1, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + dequant_m_f = __msa_fill_w( i_dmf ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + dct_signed_w0 *= dequant_m_f; + dct_signed_w1 *= dequant_m_f; + dct_signed_w2 *= dequant_m_f; + dct_signed_w3 *= dequant_m_f; + + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct0, dct1 ); + ST_SH2( dct0, dct1, p_dct, 8 ); + } +} + +static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, + uint16_t *p_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1; + v8i16 zero = { 0 }; + v8i16 dct0_mask, dct1_mask; + v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3; + v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; + v4i32 bias0, bias1, bias2, bias3; + + LD_SH2( p_dct, 8, dct0, dct1 ); + LD_SH2( p_bias, 8, bias_h0, bias_h1 ); + LD_SH2( p_mf, 8, mf_h0, mf_h1 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 ); + ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 ); + ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 ); + ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 ); + + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + + dct0 = zero - dct_h0; + dct1 
= zero - dct_h1; + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, + ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, + ( v16u8 ) dct1_mask ); + non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); + ST_SH2( dct0, dct1, p_dct, 8 ); + + return !!non_zero; +} + +static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, + uint16_t *p_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1, dct2, dct3; + v8i16 zero = { 0 }; + v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; + v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; + v8i16 bias_h0, bias_h1, bias_h2, bias_h3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; + v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; + v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; + v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; + + LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + dct2_mask = __msa_clei_s_h( dct2, 0 ); + dct3_mask = __msa_clei_s_h( dct3, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); + ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias0, bias2, bias4, bias6 ); + ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias1, bias3, bias5, bias7 ); + LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); + ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); + ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); + dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); + dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); + dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + dct_w4 *= mf_vec4; + dct_w5 *= mf_vec5; + dct_w6 *= mf_vec6; + dct_w7 *= mf_vec7; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); + PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, + dct_h0, dct_h1, dct_h2, dct_h3 ); + SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, + dct0, dct1, dct2, dct3 ); + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, + ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); + dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, + ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); + + non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); + ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); + LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + dct2_mask = __msa_clei_s_h( dct2, 0 ); + dct3_mask = __msa_clei_s_h( 
dct3, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); + ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias0, bias2, bias4, bias6 ); + ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias1, bias3, bias5, bias7 ); + LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); + ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); + ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); + dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); + dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); + dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + dct_w4 *= mf_vec4; + dct_w5 *= mf_vec5; + dct_w6 *= mf_vec6; + dct_w7 *= mf_vec7; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); + SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, + dct0, dct1, dct2, dct3 ); + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, + ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); + dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, + ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); + + non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); + ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); + + return !!non_zero; +} + +static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, + int32_t i_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1, dct0_mask, dct1_mask; + v8i16 zero = { 0 }; + v8i16 dct_h0, dct_h1; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3; + v4i32 mf_vec, bias_vec; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + bias_vec = __msa_fill_w( i_bias ); + mf_vec = __msa_fill_w( i_mf ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec ); + + dct_w0 *= mf_vec; + dct_w1 *= mf_vec; + dct_w2 *= mf_vec; + dct_w3 *= mf_vec; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + + dct0 = zero - dct_h0; + dct1 = zero - dct_h1; + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); + + ST_SH2( dct0, dct1, 
p_dct, 8 ); + + return !!non_zero; +} + +static int32_t avc_coeff_last64_msa( int16_t *p_src ) +{ + uint32_t u_res; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 vec0, vec1, vec2, vec3; + v4i32 out0; + v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); + + tmp_h0 = __msa_ceqi_h( src0, 0 ); + tmp_h1 = __msa_ceqi_h( src1, 0 ); + tmp_h2 = __msa_ceqi_h( src2, 0 ); + tmp_h3 = __msa_ceqi_h( src3, 0 ); + tmp_h4 = __msa_ceqi_h( src4, 0 ); + tmp_h5 = __msa_ceqi_h( src5, 0 ); + tmp_h6 = __msa_ceqi_h( src6, 0 ); + tmp_h7 = __msa_ceqi_h( src7, 0 ); + + PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6, + tmp0, tmp1, tmp2, tmp3 ); + + tmp0 = tmp0 & mask; + tmp1 = tmp1 & mask; + tmp2 = tmp2 & mask; + tmp3 = tmp3 & mask; + + HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 ); + PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 ); + HADD_UB2_UH( tmp0, tmp1, vec0, vec1 ); + + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 ); + vec0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 ); + out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 ); + u_res = __msa_copy_u_w( out0, 0 ); + + return ( 63 - u_res ); +} + +static int32_t avc_coeff_last16_msa( int16_t *p_src ) +{ + uint32_t u_res; + v8i16 src0, src1; + v8u16 tmp_h0; + v16u8 tmp0; + v8i16 out0, out1; + v16i8 res0; + v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + LD_SH2( p_src, 8, src0, src1 ); + + out0 = __msa_ceqi_h( src0, 0 ); + out1 = __msa_ceqi_h( src1, 0 ); + + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 ); + tmp0 = tmp0 & mask; + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + out0 = __msa_nloc_h( ( v8i16 ) res0 ); + u_res = __msa_copy_u_h( out0, 0 ); + + return ( 15 - u_res ); +} + +void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp ); +} + +void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ) +{ + avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp ); +} + +void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp ); +} + +int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) +{ + return avc_quant_4x4_msa( p_dct, p_mf, p_bias ); +} + +int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16], + uint16_t pu_mf[16], uint16_t pu_bias[16] ) +{ + int32_t i_non_zero, i_non_zero_acc = 0; + + for( int32_t j = 0; j < 4; j++ ) + { + i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias ); + + i_non_zero_acc |= ( !!i_non_zero ) << j; + } + + return i_non_zero_acc; +} + +int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) +{ + return avc_quant_8x8_msa( p_dct, p_mf, p_bias ); +} + +int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) +{ + return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias ); +} + +int32_t x264_coeff_last64_msa( int16_t *p_src ) +{ + 
return avc_coeff_last64_msa( p_src ); +} + +int32_t x264_coeff_last16_msa( int16_t *p_src ) +{ + return avc_coeff_last16_msa( p_src ); +} +#endif
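For reference, the scalar arithmetic behind the two central routines above: quantization adds the bias to |coef|, multiplies by the scaling factor, shifts right by 16 and restores the sign; dequantization multiplies by the per-(QP%6) table entry and shifts by QP/6 - 4, rounding when that shift goes to the right. The plain-C sketch below is for illustration only; the function names are hypothetical and the authoritative C fallbacks live in common/quant.c.

#include <stdint.h>
#include <stdlib.h>

/* Mirrors avc_quant_4x4_msa(): returns 1 if any quantized coefficient is non-zero. */
static int32_t quant_4x4_ref( int16_t p_dct[16], const uint16_t p_mf[16],
                              const uint16_t p_bias[16] )
{
    int32_t i_nz = 0;

    for( int32_t i = 0; i < 16; i++ )
    {
        int32_t i_level = ( abs( p_dct[i] ) + p_bias[i] ) * p_mf[i] >> 16;
        p_dct[i] = p_dct[i] > 0 ? i_level : -i_level;
        i_nz |= i_level;
    }
    return !!i_nz;
}

/* Mirrors avc_dequant_4x4_msa(). */
static void dequant_4x4_ref( int16_t p_dct[16], int32_t pi_dequant_mf[6][16], int32_t i_qp )
{
    const int32_t i_mf   = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;

    if( q_bits >= 0 )
        for( int32_t i = 0; i < 16; i++ )
            p_dct[i] = (int16_t)( ( p_dct[i] * pi_dequant_mf[i_mf][i] ) << q_bits );
    else
    {
        const int32_t i_round = 1 << ( -q_bits - 1 );
        for( int32_t i = 0; i < 16; i++ )
            p_dct[i] = (int16_t)( ( p_dct[i] * pi_dequant_mf[i_mf][i] + i_round ) >> -q_bits );
    }
}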
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant.h
Added
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * quant.h: msa quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Rishikesh More <rishikesh.more@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_QUANT_H
+#define X264_MIPS_QUANT_H
+
+void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
+                           int32_t i_qp );
+void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
+                           int32_t i_qp );
+void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
+                              int32_t i_qp );
+int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
+                              uint16_t pu_mf[16], uint16_t pu_bias[16] );
+int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias );
+int32_t x264_coeff_last64_msa( int16_t *p_src );
+int32_t x264_coeff_last16_msa( int16_t *p_src );
+
+#endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20150804-2245.tar.bz2/common/mvpred.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mvpred.c: motion vector prediction
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.c -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * opencl.c: OpenCL initialization and kernel compilation
  *****************************************************************************
- * Copyright (C) 2012-2014 x264 project
+ * Copyright (C) 2012-2015 x264 project
  *
  * Authors: Steve Borho <sborho@multicorewareinc.com>
  *          Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.h -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * opencl.h: OpenCL structures and defines
  *****************************************************************************
- * Copyright (C) 2012-2014 x264 project
+ * Copyright (C) 2012-2015 x264 project
  *
  * Authors: Steve Borho <sborho@multicorewareinc.com>
  *          Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.c -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * osdep.c: platform-specific code ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -94,51 +94,6 @@ } #endif -#if HAVE_MMX -#ifdef __INTEL_COMPILER -/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of - * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30) - * adapted to x264's cpu schema. */ - -// Global variable indicating cpu -int __intel_cpu_indicator = 0; -// CPU dispatcher function -void x264_intel_cpu_indicator_init( void ) -{ - unsigned int cpu = x264_cpu_detect(); - if( cpu&X264_CPU_AVX ) - __intel_cpu_indicator = 0x20000; - else if( cpu&X264_CPU_SSE42 ) - __intel_cpu_indicator = 0x8000; - else if( cpu&X264_CPU_SSE4 ) - __intel_cpu_indicator = 0x2000; - else if( cpu&X264_CPU_SSSE3 ) - __intel_cpu_indicator = 0x1000; - else if( cpu&X264_CPU_SSE3 ) - __intel_cpu_indicator = 0x800; - else if( cpu&X264_CPU_SSE2 && !(cpu&X264_CPU_SSE2_IS_SLOW) ) - __intel_cpu_indicator = 0x200; - else if( cpu&X264_CPU_SSE ) - __intel_cpu_indicator = 0x80; - else if( cpu&X264_CPU_MMX2 ) - __intel_cpu_indicator = 8; - else - __intel_cpu_indicator = 1; -} - -/* __intel_cpu_indicator_init appears to have a non-standard calling convention that - * assumes certain registers aren't preserved, so we'll route it through a function - * that backs up all the registers. */ -void __intel_cpu_indicator_init( void ) -{ - x264_safe_intel_cpu_indicator_init(); -} -#else -void x264_intel_cpu_indicator_init( void ) -{} -#endif -#endif - #ifdef _WIN32 /* Functions for dealing with Unicode on Windows. */ FILE *x264_fopen( const char *filename, const char *mode )
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.h -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * osdep.h: platform-specific code
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -42,6 +42,9 @@ # include "aarch64/pixel.h" # include "aarch64/predict.h" #endif +#if ARCH_MIPS +# include "mips/pixel.h" +#endif /**************************************************************************** @@ -598,8 +601,8 @@ INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) -INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) -INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif @@ -1409,25 +1412,28 @@ #if ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { - INIT7( sad, _neon ); + INIT8( sad, _neon ); // AArch64 has no distinct instructions for aligned load/store - INIT7_NAME( sad_aligned, sad, _neon ); + INIT8_NAME( sad_aligned, sad, _neon ); INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); - INIT7( ssd, _neon ); - INIT7( satd, _neon ); + INIT8( ssd, _neon ); + INIT8( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); INIT4( hadamard_ac, _neon ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->vsad = x264_pixel_vsad_neon; + pixf->asd8 = x264_pixel_asd8_neon; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; @@ -1440,11 +1446,44 @@ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } #endif // ARCH_AARCH64 +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + INIT8( sad, _msa ); + INIT8_NAME( sad_aligned, sad, _msa ); + INIT8( ssd, _msa ); + INIT7( sad_x3, _msa ); + INIT7( sad_x4, _msa ); + INIT8( satd, _msa ); + INIT4( hadamard_ac, _msa ); + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_msa; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_msa; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_msa; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_msa; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_msa; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_msa; + + pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa; + + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; + pixf->sa8d[PIXEL_8x8] = 
x264_pixel_sa8d_8x8; + } +#endif // HAVE_MSA + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC )
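The MSA entries registered above are reached only through x264's runtime dispatch; the flag test is driven by CPU detection at init time. A caller-side sketch (hypothetical wrapper name, x264 internal headers assumed):

#include "common/common.h"

/* Sketch: when x264_cpu_detect() reports X264_CPU_MSA, x264_pixel_init()
 * installs the *_msa pointers shown above; otherwise the C fallbacks remain. */
static void init_pixel_functions( x264_pixel_function_t *pixf )
{
    uint32_t cpu = x264_cpu_detect();
    x264_pixel_init( cpu, pixf );
}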
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.c: pixel metrics
  *****************************************************************************
- * Copyright (C) 2004-2014 x264 project
+ * Copyright (C) 2004-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * Eric Petit <eric.petit@lapsus.org> @@ -264,7 +264,7 @@ vec_u8_t lv = vec_ld(0, dest); \ vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \ /* unaligned store */ \ @@ -384,7 +384,7 @@ vec_u8_t lv = vec_ld( 7, dest ); \ vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ /* unaligned store */ \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * dct.h: ppc transform and zigzag
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/deblock.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * deblock.c: ppc deblocking
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * Guillaume Poirier <gpoirier@mplayerhq.hu> @@ -40,24 +40,19 @@ typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst, int i_height ); - -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - - static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) { return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next]; } + static inline int x264_tapfilter1( uint8_t *pix ) { return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; } - static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) @@ -181,10 +176,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { @@ -229,10 +224,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { case 4: @@ -296,6 +291,12 @@ } } +#ifdef WORDS_BIGENDIAN +#define VSLD(a,b,n) vec_sld(a,b,n) +#else +#define VSLD(a,b,n) vec_sld(b,a,16-n) +#endif + static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) @@ -321,8 +322,13 @@ vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16; vec_u16_t shiftv, k32v; +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13); static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12); + static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14); +#endif coeff0v = vec_ld( 0, coeff ); coeff3v = vec_splat( coeff0v, 3 ); @@ -334,7 +340,7 @@ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); for( int y = 0; y < i_height; y += 2 ) { @@ -342,7 +348,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ 
-364,7 +370,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ -420,12 +426,17 @@ k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); shiftv = vec_splat_u16( 6 ); +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0); static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1); + static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1); +#endif VEC_LOAD( src, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); for( int y = 0; y < i_height; y += 2 ) { @@ -434,7 +445,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -472,7 +483,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -555,11 +566,11 @@ VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \ VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \ \ - src2v = vec_sld( src1v, src6v, 1 ); \ - src3v = vec_sld( src1v, src6v, 2 ); \ - src4v = vec_sld( src1v, src6v, 3 ); \ - src5v = vec_sld( src1v, src6v, 4 ); \ - src6v = vec_sld( src1v, src6v, 5 ); \ + src2v = VSLD( src1v, src6v, 1 ); \ + src3v = VSLD( src1v, src6v, 2 ); \ + src4v = VSLD( src1v, src6v, 3 ); \ + src5v = VSLD( src1v, src6v, 4 ); \ + src6v = VSLD( src1v, src6v, 5 ); \ \ temp1v = vec_u8_to_s16_h( src1v ); \ temp2v = vec_u8_to_s16_h( src2v ); \ @@ -634,12 +645,12 @@ #define HPEL_FILTER_CENTRAL() \ { \ - temp1v = vec_sld( tempav, tempbv, 12 ); \ - temp2v = vec_sld( tempav, tempbv, 14 ); \ + temp1v = VSLD( tempav, tempbv, 12 ); \ + temp2v = VSLD( tempav, tempbv, 14 ); \ temp3v = tempbv; \ - temp4v = vec_sld( tempbv, tempcv, 2 ); \ - temp5v = vec_sld( tempbv, tempcv, 4 ); \ - temp6v = vec_sld( tempbv, tempcv, 6 ); \ + temp4v = VSLD( tempbv, tempcv, 2 ); \ + temp5v = VSLD( tempbv, tempcv, 4 ); \ + temp6v = VSLD( tempbv, tempcv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ @@ -647,12 +658,12 @@ dest1v = vec_add( temp1v, thirtytwov ); \ dest1v = vec_sra( dest1v, sixv ); \ \ - temp1v = vec_sld( tempbv, tempcv, 12 ); \ - temp2v = vec_sld( tempbv, tempcv, 14 ); \ + temp1v = VSLD( tempbv, tempcv, 12 ); \ + temp2v = VSLD( tempbv, tempcv, 14 ); \ temp3v = tempcv; \ - temp4v = vec_sld( tempcv, tempdv, 2 ); \ - temp5v = vec_sld( tempcv, tempdv, 4 ); \ - temp6v = vec_sld( tempcv, tempdv, 6 ); \ + temp4v = VSLD( tempcv, tempdv, 2 ); \ + temp5v = VSLD( tempcv, tempdv, 4 ); \ + temp6v = VSLD( tempcv, tempdv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ @@ -769,6 +780,9 @@ vec_u8_t lv, hv, src1p1v; vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv; static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 
0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E ); +#ifndef WORDS_BIGENDIAN + static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F ); +#endif for( int y = 0; y < height; y++ ) { @@ -793,11 +807,15 @@ src1p1v = vec_ld(16*(x*2+2), src1); avghp1v = vec_avg(lv, src1p1v); - avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v); - avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv); + avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); + avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0); +#ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth); +#else + vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth); +#endif avg0v = avghp1v; @@ -807,11 +825,15 @@ hv = vec_ld(16*(x*2+2), src2); avghp1v = vec_avg(src1p1v, hv); - avgleftv = vec_avg(vec_sld(avg1v, avghv, 1), avg1v); - avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv); + avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v); + avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv); +#ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc); +#else + vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc); +#endif avg1v = avghp1v; @@ -825,11 +847,15 @@ lv = vec_ld(16*(x*2+1), src2); avghp1v = vec_avg(src1v, lv); - avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v); - avgrightv = vec_avg(vec_sld(avg1v, avghp1v, 1), avg1v); + avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); + avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v); lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle); +#ifdef WORDS_BIGENDIAN hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv); +#else + hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1); +#endif vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0); vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
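The VSLD wrapper introduced above is needed because vec_sld() indexes bytes in big-endian element order; on little-endian POWER the same logical window is obtained by swapping the operands and using 16-n, exactly as the macro does. Conceptually the filter code always wants the byte-window operation modelled below in plain C, shown for illustration only (not the intrinsic itself):

#include <stdint.h>
#include <string.h>

/* Model of the intended VSLD(a,b,n) behaviour: a 16-byte window taken n bytes
 * into the concatenation a||b, in the element order the hpel/chroma filters assume. */
static void vsld_ref( const uint8_t a[16], const uint8_t b[16], int n, uint8_t out[16] )
{
    uint8_t cat[32];

    memcpy( cat, a, 16 );
    memcpy( cat + 16, b, 16 );
    memcpy( out, cat + n, 16 );   /* n must be in [0,16] */
}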
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mc.h: ppc motion compensation
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.c: ppc pixel metrics
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.h: ppc pixel metrics
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/ppccommon.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/ppccommon.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ppccommon.h: ppc utility macros ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * @@ -81,10 +81,17 @@ /*********************************************************************** * 8 <-> 16 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) +#else +#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#endif #define vec_u8_to_u16(v) vec_u8_to_u16_h(v) #define vec_u8_to_s16(v) vec_u8_to_s16_h(v) @@ -96,10 +103,17 @@ /*********************************************************************** * 16 <-> 32 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) +#else +#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#endif #define vec_u16_to_u32(v) vec_u16_to_u32_h(v) #define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.c: ppc intra prediction
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.h: ppc intra prediction
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * @@ -251,6 +251,14 @@ vec_st(dctv, 8*y, dct); \ } +#ifdef WORDS_BIGENDIAN +#define VEC_MULE vec_mule +#define VEC_MULO vec_mulo +#else +#define VEC_MULE vec_mulo +#define VEC_MULO vec_mule +#endif + #define DEQUANT_SHR() \ { \ dctv = vec_ld(8*y, dct); \ @@ -259,14 +267,14 @@ mf1v = vec_ld(16*y, dequant_mf[i_mf]); \ mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \ \ - multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \ - multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \ + multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v); \ + multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v); \ temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp1v = vec_add(temp1v, fv); \ temp1v = vec_sra(temp1v, i_qbitsv); \ \ - multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \ - multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \ + multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v); \ + multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v); \ temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp2v = vec_add(temp2v, fv); \ temp2v = vec_sra(temp2v, i_qbitsv); \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * quant.c: ppc quantization
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/predict.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.c: intra prediction
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -43,6 +43,9 @@
 #if ARCH_AARCH64
 # include "aarch64/predict.h"
 #endif
+#if ARCH_MIPS
+# include "mips/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -906,6 +909,21 @@
 #if ARCH_AARCH64
     x264_predict_16x16_init_aarch64( cpu, pf );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_16x16_V ]     = x264_intra_predict_vert_16x16_msa;
+        pf[I_PRED_16x16_H ]     = x264_intra_predict_hor_16x16_msa;
+        pf[I_PRED_16x16_DC]     = x264_intra_predict_dc_16x16_msa;
+        pf[I_PRED_16x16_P ]     = x264_intra_predict_plane_16x16_msa;
+        pf[I_PRED_16x16_DC_LEFT]= x264_intra_predict_dc_left_16x16_msa;
+        pf[I_PRED_16x16_DC_TOP ]= x264_intra_predict_dc_top_16x16_msa;
+        pf[I_PRED_16x16_DC_128 ]= x264_intra_predict_dc_128_16x16_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -934,6 +952,15 @@
 #if ARCH_AARCH64
     x264_predict_8x8c_init_aarch64( cpu, pf );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_CHROMA_P ] = x264_intra_predict_plane_8x8_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -949,6 +976,10 @@
 #if HAVE_MMX
     x264_predict_8x16c_init_mmx( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x16c_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -978,6 +1009,15 @@
 #if ARCH_AARCH64
     x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_8x8_DDL] = x264_intra_predict_ddl_8x8_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/quant.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * quant.c: quantization and level-run
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
@@ -40,6 +40,9 @@
 #if ARCH_AARCH64
 # include "aarch64/quant.h"
 #endif
+#if ARCH_MIPS
+# include "mips/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -714,7 +717,8 @@
 #endif // HAVE_MMX
 
 #if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC ) {
+    if( cpu&X264_CPU_ALTIVEC )
+    {
         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
         pf->quant_4x4 = x264_quant_4x4_altivec;
@@ -753,6 +757,32 @@
     {
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
+        pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
+    }
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
+    }
+#endif
+
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf->quant_4x4 = x264_quant_4x4_msa;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_msa;
+        pf->quant_4x4x4 = x264_quant_4x4x4_msa;
+        pf->quant_8x8 = x264_quant_8x8_msa;
+        pf->dequant_4x4 = x264_dequant_4x4_msa;
+        pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa;
+        pf->dequant_8x8 = x264_dequant_8x8_msa;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa;
     }
 #endif
 #endif // HIGH_BIT_DEPTH
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.c: rectangle filling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.h: rectangle filling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.c -> x264-snapshot-20150804-2245.tar.bz2/common/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.c: quantization init ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.h -> x264-snapshot-20150804-2245.tar.bz2/common/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: quantization init ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.c -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.c: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.h -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.h: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/vlc.c -> x264-snapshot-20150804-2245.tar.bz2/common/vlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * vlc.c : vlc tables ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * win32thread.c: windows threading
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *          Pegasys Inc. <http://www.pegasys-inc.com>
@@ -138,7 +138,7 @@
     if( !win32_cond )
         return -1;
     cond->ptr = win32_cond;
-    win32_cond->semaphore = CreateSemaphore( NULL, 0, 0x7fffffff, NULL );
+    win32_cond->semaphore = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL );
     if( !win32_cond->semaphore )
         return -1;
 
@@ -147,7 +147,7 @@
     if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) )
         return -1;
 
-    win32_cond->waiters_done = CreateEvent( NULL, FALSE, FALSE, NULL );
+    win32_cond->waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL );
    if( !win32_cond->waiters_done )
         return -1;
 
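Switching to the explicitly wide-character entry points does not change behaviour here, because both kernel objects are created unnamed (the name argument is NULL); it simply removes the dependence on whether the UNICODE macro is defined when windows.h is included. A minimal sketch of the same two calls (illustrative only, not the x264 wrapper code):

/* Create the unnamed kernel objects used by the condition-variable
 * emulation, calling the W-suffixed Win32 functions directly. */
#include <windows.h>

static int create_cond_objects( HANDLE *semaphore, HANDLE *waiters_done )
{
    *semaphore    = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL ); /* counting semaphore, initially 0 */
    *waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL );      /* auto-reset event, initially unsignalled */
    return (*semaphore && *waiters_done) ? 0 : -1;
}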
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.h -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.h: windows threading ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* bitstream-a.asm: x86 bitstream functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* cabac-a.asm: x86 cabac ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* cpu-a.asm: x86 cpu utilities
 ;*****************************************************************************
-;* Copyright (C) 2003-2014 x264 project
+;* Copyright (C) 2003-2015 x264 project
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
@@ -145,53 +145,3 @@
 cglobal cpu_sfence
     sfence
     ret
-
-cextern intel_cpu_indicator_init
-
-;-----------------------------------------------------------------------------
-; void safe_intel_cpu_indicator_init( void );
-;-----------------------------------------------------------------------------
-cglobal safe_intel_cpu_indicator_init
-    push r0
-    push r1
-    push r2
-    push r3
-    push r4
-    push r5
-    push r6
-%if ARCH_X86_64
-    push r7
-    push r8
-    push r9
-    push r10
-    push r11
-    push r12
-    push r13
-    push r14
-%endif
-    push rbp
-    mov rbp, rsp
-%if WIN64
-    sub rsp, 32 ; shadow space
-%endif
-    and rsp, ~31
-    call intel_cpu_indicator_init
-    leave
-%if ARCH_X86_64
-    pop r14
-    pop r13
-    pop r12
-    pop r11
-    pop r10
-    pop r9
-    pop r8
-    pop r7
-%endif
-    pop r6
-    pop r5
-    pop r4
-    pop r3
-    pop r2
-    pop r1
-    pop r0
-    ret
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-32.asm: x86_32 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-a.asm: x86 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: x86 transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* deblock-a.asm: x86 deblocking ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com> @@ -40,6 +40,7 @@ deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH +copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 v210_mask: times 4 dq 0xc00ffc003ff003ff v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 @@ -50,6 +51,7 @@ deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else +copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14 deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 @@ -913,64 +915,90 @@ %undef sfence %endif ; !HIGH_BIT_DEPTH +%macro PREFETCHNT_ITER 2 ; src, bytes/iteration + %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? + %rep (%2+63) / 64 ; assume 64 byte cache lines + prefetchnta [%1+%%i] + %assign %%i %%i + 64 + %endrep +%endmacro + ;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, intptr_t i_dst, -; pixel *src, intptr_t i_src, int w, int h ) +; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- -; assumes i_dst and w are multiples of 16, and i_dst>w -INIT_MMX -cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1, r3, r4d -%if HIGH_BIT_DEPTH == 0 +; assumes i_dst and w are multiples of mmsize, and i_dst>w +%macro PLANE_COPY_CORE 1 ; swap +%if %1 +cglobal plane_copy_swap_core, 6,7 + mova m4, [copy_swap_shuf] +%else +cglobal plane_copy_core, 6,7 +%endif + FIX_STRIDES r1, r3 +%if %1 && HIGH_BIT_DEPTH + shl r4d, 2 +%elif %1 || HIGH_BIT_DEPTH + add r4d, r4d +%else movsxdifnidn r4, r4d %endif - sub r1, r4 - sub r3, r4 + add r0, r4 + add r2, r4 + neg r4 .loopy: - lea r6d, [r4-63] + lea r6, [r4+4*mmsize] +%if %1 + test r6d, r6d + jg .skip +%endif .loopx: - prefetchnta [r2+256] - movq m0, [r2 ] - movq m1, [r2+ 8] - movntq [r0 ], m0 - movntq [r0+ 8], m1 - movq m2, [r2+16] - movq m3, [r2+24] - movntq [r0+16], m2 - movntq [r0+24], m3 - movq m4, [r2+32] - movq m5, [r2+40] - movntq [r0+32], m4 - movntq [r0+40], m5 - movq m6, [r2+48] - movq m7, [r2+56] - movntq [r0+48], m6 - movntq [r0+56], m7 - add r2, 64 - add r0, 64 - sub r6d, 64 - jg .loopx - prefetchnta [r2+256] - add r6d, 63 - jle .end16 -.loop16: - movq m0, [r2 ] - movq m1, [r2+8] - movntq [r0 ], m0 - movntq [r0+8], m1 - add r2, 16 - add r0, 16 - sub r6d, 16 - jg .loop16 -.end16: + PREFETCHNT_ITER r2+r6, 4*mmsize + movu m0, [r2+r6-4*mmsize] + movu m1, [r2+r6-3*mmsize] + movu m2, [r2+r6-2*mmsize] + movu m3, [r2+r6-1*mmsize] +%if %1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 +%endif + movnta [r0+r6-4*mmsize], m0 + movnta [r0+r6-3*mmsize], m1 + movnta [r0+r6-2*mmsize], m2 + movnta [r0+r6-1*mmsize], m3 + add r6, 4*mmsize + jle .loopx +.skip: + PREFETCHNT_ITER r2+r6, 4*mmsize + sub r6, 4*mmsize + jz .end +.loop_end: + movu m0, [r2+r6] +%if %1 + pshufb m0, m4 +%endif + movnta [r0+r6], m0 + add r6, mmsize + jl .loop_end +.end: add r0, r1 add r2, r3 - dec r5d + 
dec r5d jg .loopy sfence - emms RET +%endmacro +INIT_XMM sse +PLANE_COPY_CORE 0 +INIT_XMM ssse3 +PLANE_COPY_CORE 1 +INIT_YMM avx +PLANE_COPY_CORE 0 +INIT_YMM avx2 +PLANE_COPY_CORE 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH @@ -2136,7 +2164,7 @@ INIT_YMM avx MBTREE_AVX 8 -INIT_YMM avx2,fma3 +INIT_YMM avx2 MBTREE_AVX 7 %macro MBTREE_PROPAGATE_LIST 0
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -90,8 +90,12 @@ void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); -void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -167,8 +171,8 @@ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ @@ -363,9 +367,6 @@ } #endif // !HIGH_BIT_DEPTH -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - #define MC_LUMA(name,instr1,instr2)\ static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\ pixel *src[4], intptr_t i_src_stride,\ @@ -374,10 +375,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ x264_pixel_avg_wtab_##instr1[i_width>>2](\ dst, i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -412,10 +413,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ 
x264_pixel_avg_wtab_##name[i_width>>2](\ dst, *i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -492,39 +493,94 @@ #endif #endif // HIGH_BIT_DEPTH -static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) -{ - int c_w = 16/sizeof(pixel) - 1; - if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. - x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if( !(w&c_w) ) { - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h ); - } else if( i_src > 0 ) { - // have to use plain memcpy on the last line (in memory order) to avoid overreading src - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 ); - memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) ); - } else { - memcpy( dst, src, w*sizeof(pixel) ); - x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 ); - } +#define PLANE_COPY(align, cpu)\ +static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align) / sizeof(pixel) - 1;\ + if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ + x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ + else if( !(w&c_w) )\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ + memcpy( dst, src, w*sizeof(pixel) );\ + }\ +} + +PLANE_COPY(16, sse) +PLANE_COPY(32, avx) + +#define PLANE_COPY_SWAP(align, cpu)\ +static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ + for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ + {\ + dst[x] = src[x+1];\ + dst[x+1] = src[x];\ + }\ + }\ + else\ + x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } +PLANE_COPY_SWAP(16, ssse3) +PLANE_COPY_SWAP(32, avx2) + #define PLANE_INTERLEAVE(cpu) \ static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ pixel *srcv, intptr_t i_srcv, int w, int h )\ {\ - if( !(w&15) ) {\ + int c_w = 16 / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ - x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( i_srcu > 0 ) {\ - x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ - x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ - } else {\ + else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ + {\ + if( --h > 0 )\ + 
{\ + if( i_srcu > 0 )\ + {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + srcu += i_srcu * h;\ + srcv += i_srcv * h;\ + }\ + else\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ + }\ x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ - x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ }\ + else\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ } PLANE_INTERLEAVE(mmx2) @@ -666,7 +722,6 @@ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2; pf->prefetch_ref = x264_prefetch_ref_mmx2; - pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; @@ -695,6 +750,7 @@ { pf->memcpy_aligned = x264_memcpy_aligned_sse; pf->memzero_aligned = x264_memzero_aligned_sse; + pf->plane_copy = x264_plane_copy_sse; } #if HIGH_BIT_DEPTH @@ -751,6 +807,7 @@ return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -855,6 +912,7 @@ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -932,6 +990,7 @@ if( !(cpu&X264_CPU_AVX) ) return; pf->memzero_aligned = x264_memzero_aligned_avx; + pf->plane_copy = x264_plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx; @@ -940,8 +999,7 @@ if( !(cpu&X264_CPU_AVX2) ) return; + pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->get_ref = get_ref_avx2; - - if( cpu&X264_CPU_FMA3 ) - pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; }
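The rewritten PLANE_COPY wrapper above centres on one guard: for every row except the one at the highest source address, over-reading a few bytes past the row end only spills into the next row, which is valid frame memory, so those rows can be copied with the width rounded up to the SIMD width; the last row in memory order must instead be copied exactly, or the asm core would read past the end of the buffer. A standalone C sketch of that guard, with a plain memcpy standing in for the SIMD core (illustrative only; the real wrapper additionally falls back to the C copy for widths under 256 pixels):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

typedef uint8_t pixel;

/* Stand-in for the aligned SIMD core: copies h rows of w pixels, where w is
 * assumed to be a multiple of the vector width and i_dst is at least w. */
static void plane_copy_core_ref( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h )
{
    for( int y = 0; y < h; y++ )
        memcpy( dst + y*i_dst, src + y*i_src, w );
}

void plane_copy_guarded( pixel *dst, intptr_t i_dst,
                         pixel *src, intptr_t i_src, int w, int h )
{
    const int c_w = 16/sizeof(pixel) - 1;   /* alignment mask for a 16-byte (SSE) core */
    if( !(w&c_w) )
        plane_copy_core_ref( dst, i_dst, src, i_src, w, h );
    else
    {
        if( --h > 0 )
        {
            if( i_src > 0 )
            {   /* positive stride: the bottom row sits highest in memory */
                plane_copy_core_ref( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );
                dst += i_dst * h;
                src += i_src * h;
            }
            else /* negative stride: row 0 sits highest, so skip it here */
                plane_copy_core_ref( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );
        }
        /* exact-width copy of the remaining row avoids reading past the buffer */
        memcpy( dst, src, w*sizeof(pixel) );
    }
}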
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: x86 pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict-c.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: x86 intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* quant-a.asm: x86 quantization and level-run ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: x86 quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad-a.asm: x86 sad functions ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad16-a.asm: x86 high depth sad functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* trellis-64.asm: x86_64 trellis quantization ;***************************************************************************** -;* Copyright (C) 2012-2014 x264 project +;* Copyright (C) 2012-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;*
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/util.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * util.h: x86 inline asm ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> @@ -64,6 +64,15 @@ %endif %endif +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + %ifdef PREFIX %define mangle(x) _ %+ x %else @@ -74,10 +83,6 @@ SECTION .rodata align=%1 %endmacro -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - %if WIN64 %define PIC %elif ARCH_X86_64 == 0 @@ -90,6 +95,10 @@ default rel %endif +%ifdef __NASM_VER__ + %use smartalign +%endif + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -675,7 +684,7 @@ CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %2:function %%VISIBILITY %else global %2 @@ -701,14 +710,16 @@ ; like cextern, but without the prefix %macro cextern_naked 1 - %xdefine %1 mangle(%1) + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %1:data hidden %else global %1 @@ -716,10 +727,9 @@ %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. 
+%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; cpuflags @@ -738,8 +748,8 @@ %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) @@ -789,9 +799,17 @@ %endif %if ARCH_X86_64 || cpuflag(sse2) - CPU amdnop + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif %else - CPU basicnop + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif %endif %endmacro @@ -868,7 +886,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i + CAT_XDEFINE nnymm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 @@ -1070,6 +1088,8 @@ %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function %endif %endif %endif @@ -1206,7 +1226,7 @@ AVX_INSTR minss, sse, 1, 0, 1 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse -AVX_INSTR movd +AVX_INSTR movd, mmx AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 @@ -1222,7 +1242,7 @@ AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse -AVX_INSTR movq +AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR movsldup, sse3 @@ -1468,13 +1488,15 @@ FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif %endif
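One consequence of the cpuflags reordering above is visible earlier in this revision: because cpuflags_avx2 now ORs in cpuflags_fma3, any function declared with an avx2 suffix automatically satisfies the fma3 capability test, which is why the kernel previously declared as avx2,fma3 (and called as _avx2_fma3 from mc-c.c) is now plain avx2. The cpuflag() test in x86inc is effectively a subset check on these masks; a small C sketch of the same arithmetic (flag values copied from the diff, with the lower SSE implication chain omitted for brevity):

#include <stdio.h>

/* Flag values after the reorder: AVX2 now implies FMA3 because its mask
 * includes all of FMA3's bits. */
enum
{
    CPUFLAGS_AVX  = 1 << 11,
    CPUFLAGS_FMA3 = (1 << 14) | CPUFLAGS_AVX,
    CPUFLAGS_AVX2 = (1 << 15) | CPUFLAGS_FMA3,
};

static int cpuflag( unsigned current, unsigned wanted )
{
    return (current & wanted) == wanted;   /* subset test, as in x86inc's cpuflag() */
}

int main( void )
{
    printf( "avx2 implies fma3: %d\n", cpuflag( CPUFLAGS_AVX2, CPUFLAGS_FMA3 ) ); /* prints 1 */
    printf( "fma3 implies avx2: %d\n", cpuflag( CPUFLAGS_FMA3, CPUFLAGS_AVX2 ) ); /* prints 0 */
    return 0;
}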
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/config.guess -> x264-snapshot-20150804-2245.tar.bz2/config.guess
Changed
@@ -979,6 +979,9 @@
     ppc64:Linux:*:*)
 	echo powerpc64-unknown-linux-gnu
 	exit ;;
+    ppc64le:Linux:*:*)
+	echo powerpc64le-unknown-linux-gnu
+	exit ;;
     ppc:Linux:*:*)
 	echo powerpc-unknown-linux-gnu
 	exit ;;
View file
x264-snapshot-20141218-2245.tar.bz2/configure -> x264-snapshot-20150804-2245.tar.bz2/configure
Changed
@@ -77,7 +77,7 @@ # several non gcc compilers issue an incredibly large number of warnings on any warning level, # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them for arg in $*; do - [ $arg = -ffast-math ] && arg= + [ "$arg" = -ffast-math ] && arg= [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= @@ -105,10 +105,10 @@ cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} - [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib - [ ${arg#-L} != $arg ] && arg=-libpath:${arg#-L} - [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware - [ $arg = -s ] && arg= + [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib + [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} + [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware + [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= @@ -119,6 +119,7 @@ arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib + [ "$arg" = x264.lib ] && arg=libx264.lib [ -n "$arg" ] && echo -n "$arg " done @@ -143,7 +144,9 @@ log_check "for $3 in $1"; fi rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" @@ -172,7 +175,9 @@ cpp_check() { log_check "whether $3 is true" rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" @@ -256,6 +261,48 @@ exit 1 } +configure_system_override() { + log_check "system libx264 configuration" + x264_config_path="$1/x264_config.h" + if [ -e "$x264_config_path" ]; then + res=$? + log_ok + arg="$(grep '#define X264_GPL ' $x264_config_path | sed -e 's/#define X264_GPL *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + [ "$arg" != "$gpl" ] && die "Incompatible license with system libx264" + fi + arg="$(grep '#define X264_BIT_DEPTH ' $x264_config_path | sed -e 's/#define X264_BIT_DEPTH *//; s/ *$//')" + if [ -n "$arg" ]; then + if [ "$arg" != "$bit_depth" ]; then + echo "Override output bit depth with system libx264 configuration" + bit_depth="$arg" + fi + fi + arg="$(grep '#define X264_CHROMA_FORMAT ' $x264_config_path | sed -e 's/#define X264_CHROMA_FORMAT *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="all" || arg="${arg#X264_CSP_I}" + if [ "$arg" != "$chroma_format" ]; then + echo "Override output chroma format with system libx264 configuration" + chroma_format="$arg" + fi + fi + arg="$(grep '#define X264_INTERLACED ' $x264_config_path | sed -e 's/#define X264_INTERLACED *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + if [ "$arg" != "$interlaced" ]; then + echo "Override interlaced encoding support with system libx264 configuration" + interlaced="$arg" + fi + fi + else + res=$? 
+ log_fail + log_msg "Failed search path was: $x264_config_path" + fi + return $res +} + rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest* SRCPATH="$(cd $(dirname $0); pwd)" @@ -311,7 +358,8 @@ # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ - LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC" + LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ + MSA" # parse options @@ -458,6 +506,8 @@ host_vendor="${host%%-*}" host_os="${host#*-}" +trap 'rm -f conftest*' EXIT + # test for use of compilers that require specific handling cc_base=`basename "$CC"` QPRE="-" @@ -600,9 +650,9 @@ case $host_cpu in i*86) ARCH="X86" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" - ASFLAGS="$ASFLAGS -O2 -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" + ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" @@ -629,39 +679,39 @@ stack_alignment=4 fi if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho -DPREFIX" + ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf" + ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX" + ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX" if cc_check '' "-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then - ASFLAGS="$ASFLAGS -f win32 -m amd64" + ASFLAGS="$ASFLAGS -f win64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf -m amd64" + ASFLAGS="$ASFLAGS -f elf64" fi ;; - powerpc|powerpc64) + powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC @@ -678,13 +728,15 @@ sparc) ARCH="SPARC" ;; - mips|mipsel|mips64|mips64el) + mips*) ARCH="MIPS" + AS="${AS-${CC}}" + AS_EXT=".c" ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then - AS="${AS-extras/gas-preprocessor.pl $CC}" + AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! 
echo $CFLAGS | grep -Eq '\-arch' ; then @@ -698,7 +750,7 @@ aarch64) ARCH="AARCH64" if [ "$SYS" = MACOSX ] ; then - AS="${AS-extras/gas-preprocessor.pl $CC}" + AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX" else AS="${AS-${CC}}" @@ -788,9 +840,6 @@ fi if [ $asm = auto -a $ARCH = AARCH64 ] ; then - # set flags so neon is built by default - echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-arch)' || CFLAGS="$CFLAGS -arch arm64 -mfpu=neon" - if cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON ASFLAGS="$ASFLAGS -c" else @@ -805,6 +854,20 @@ as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 fi +if [ $asm = auto -a $ARCH = MIPS ] ; then + if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then + cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS" + fi + + if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then + define HAVE_MSA + else + echo "You specified a pre-MSA CPU in your CFLAGS." + echo "If you really want to run on such a CPU, configure with --disable-asm." + exit 1 + fi +fi + [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" @@ -815,16 +878,29 @@ ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well +CPU_ENDIAN="little-endian" if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then define WORDS_BIGENDIAN + CPU_ENDIAN="big-endian" elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then die "endian test failed" fi fi +if [ "$cli_libx264" = "system" -a "$shared" != "yes" ] ; then + [ "$static" = "yes" ] && die "Option --system-libx264 can not be used together with --enable-static" + if ${cross_prefix}pkg-config --exists x264 2>/dev/null; then + X264_LIBS="$(${cross_prefix}pkg-config --libs x264)" + X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$(${cross_prefix}pkg-config --variable=includedir x264)}" + configure_system_override "$X264_INCLUDE_DIR" || die "Detection of system libx264 configuration failed" + else + die "Can not find system libx264" + fi +fi + # autodetect options that weren't forced nor disabled # pthread-win32 is lgpl, prevent its use if --disable-gpl is specified and targeting windows @@ -1044,7 +1120,7 @@ cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT if [ "$pic" = "yes" ] ; then - CFLAGS="$CFLAGS -fPIC" + [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC" ASFLAGS="$ASFLAGS -DPIC" # resolve textrels in the x86 asm cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic" @@ -1093,6 +1169,12 @@ CFLAGS="-Wno-maybe-uninitialized $CFLAGS" fi +if [ $compiler = ICC -o $compiler = ICL ] ; then + if cc_check 'extras/intel_dispatcher.h' '' 'x264_intel_dispatcher_override();' ; then + define HAVE_INTEL_DISPATCHER + fi +fi + if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" @@ -1131,6 +1213,31 @@ grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 done +# generate exported config file + 
+config_chroma_format="X264_CSP_I$chroma_format" +[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0" +cat > x264_config.h << EOF +#define X264_BIT_DEPTH $bit_depth +#define X264_GPL $x264_gpl +#define X264_INTERLACED $x264_interlaced +#define X264_CHROMA_FORMAT $config_chroma_format +EOF + +${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h + +if [ "$cli_libx264" = "system" ] ; then + if [ "$shared" = "yes" ]; then + CLI_LIBX264='$(SONAME)' + else + CLI_LIBX264= + LDFLAGSCLI="$X264_LIBS $LDFLAGSCLI" + cc_check 'stdint.h x264.h' '' 'x264_encoder_open(0);' || die "System libx264 can't be used for compilation of this version" + fi +else + CLI_LIBX264='$(LIBX264)' +fi + DEPMM="${QPRE}MM" DEPMT="${QPRE}MT" if [ $compiler_style = MS ]; then @@ -1183,19 +1290,6 @@ PROF_USE_LD="-fprofile-use" fi -rm -f conftest* - -# generate exported config file - -config_chroma_format="X264_CSP_I$chroma_format" -[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0" -cat > x264_config.h << EOF -#define X264_BIT_DEPTH $bit_depth -#define X264_GPL $x264_gpl -#define X264_INTERLACED $x264_interlaced -#define X264_CHROMA_FORMAT $config_chroma_format -EOF - # generate config files cat > config.mak << EOF @@ -1205,7 +1299,7 @@ bindir=$bindir libdir=$libdir includedir=$includedir -ARCH=$ARCH +SYS_ARCH=$ARCH SYS=$SYS CC=$CC CFLAGS=$CFLAGS @@ -1284,23 +1378,9 @@ echo 'install: install-lib-static' >> config.mak fi -if [ "$cli_libx264" = "system" ] ; then - if [ "$shared" = "yes" ]; then - CLI_LIBX264='$(SONAME)' - elif ${cross_prefix}pkg-config --exists x264 2>/dev/null; then - LDFLAGSCLI="$LDFLAGSCLI $(${cross_prefix}pkg-config --libs x264)" - CLI_LIBX264= - else - die "Can not find system libx264" - fi -else - CLI_LIBX264='$(LIBX264)' -fi echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak -${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h - cat > x264.pc << EOF prefix=$prefix exec_prefix=$exec_prefix @@ -1322,6 +1402,7 @@ cat > conftest.log <<EOF platform: $ARCH +byte order: $CPU_ENDIAN system: $SYS cli: $cli libx264: $cli_libx264 @@ -1348,7 +1429,6 @@ echo >> config.log cat conftest.log >> config.log cat conftest.log -rm conftest.log [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile mkdir -p common/{aarch64,arm,ppc,x86} encoder extras filters/video input output tools
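The new byte-order reporting in the configure summary reuses the existing compile-only endianness probe shown above, which never has to execute target code: it compiles an integer and a double whose big-endian byte images spell out ASCII strings, then greps the object file for them. A standalone C sketch of why those particular constants work (run on the build host purely for illustration; configure itself only inspects the object bytes with strings/grep):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main( void )
{
    const uint32_t imagic = 0x42494745;             /* 'B','I','G','E' in big-endian byte order */
    const double   fmagic = 0x1.0656e6469616ep+102; /* bit pattern 0x4650656E6469616E = "FPendian" */
    char ibuf[5] = {0}, fbuf[9] = {0};

    memcpy( ibuf, &imagic, 4 );
    memcpy( fbuf, &fmagic, 8 );
    /* Big-endian object files contain "BIGE" and "FPendian"; little-endian
     * ones contain the reversed "EGIB" and "naidnePF", which is exactly what
     * the grep in configure distinguishes. */
    printf( "int bytes: %s, double bytes: %s\n", ibuf, fbuf );
    return 0;
}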
View file
x264-snapshot-20141218-2245.tar.bz2/doc/vui.txt -> x264-snapshot-20150804-2245.tar.bz2/doc/vui.txt
Changed
@@ -16,14 +16,14 @@ * How do I use it? You can derive the SAR of an image from the width, height and the display aspect ratio (DAR) of the image as follows: - + SAR_x DAR_x * height ----- = -------------- SAR_y DAR_y * width - + for example: width x height = 704x576, DAR = 4:3 ==> SAR = 2304:2112 or 12:11 - + Please note that if your material is a digitized analog signal, you should not use this equation to calculate the SAR. Refer to the manual of your digitizing equipment or this link instead. @@ -36,7 +36,7 @@ correction of aspect ratios, and there are just few exceptions. You should even use it, if the SAR of your material is 1:1, as the default of x264 is "SAR not defined". - + 2. Overscan ------------ @@ -49,7 +49,7 @@ analog signal. Instead it refers to the "overscan" process on a display that shows only a part of the image. What that part is depends on the display. - + * How do I use this option? As I'm not sure about what part of the image is shown when the display uses an overscan process, I can't provide you with rules or examples. The safe @@ -72,7 +72,7 @@ * What is it? A purely informative setting, that explains what the type of your analog video was, before you digitized it. - + * How do I use this option? Just set it to the desired value. ( e.g. NTSC, PAL ) If you transcode from MPEG2, you may find the value for this option in the @@ -101,11 +101,11 @@ or want to make sure that your material is played back without oversaturation, set if to on. Please note that the default for this option in x264 is off, which is not a safe assumption. - + * Should I use this option? Yes, but there are few decoders/ media players that distinguish between the two options. - + 5. Color Primaries, Transfer Characteristics, Matrix Coefficients ------------------------------------------------------------------- @@ -120,7 +120,7 @@ profile of the digitizing equipment is known, it is possible to correct the colors and gamma of the decoded h264 stream in a way that the video stream looks the same, regardless of the digitizing equipment used. - + * How do I use these options? If you are able to find out which characteristics your digitizing equipment uses, (see the equipment documentation or make reference measurements) @@ -170,9 +170,8 @@ chroma sample location in that direction is equal to one of the luma samples. H264 Annex E contains images that tell you how to "transform" your Chroma Sample Location into a value of 0 to 5 that you can pass to x264. - + * Should I use this option? Unless you are a perfectionist, don't bother. Media players ignore this setting, and favor their own (fixed) assumed Chroma Sample Location. -
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.c: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.h: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: cabac bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cavlc.c: cavlc bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/encoder.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * encoder.c: top-level encoder functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -32,6 +32,9 @@ #include "ratecontrol.h" #include "macroblock.h" #include "me.h" +#if HAVE_INTEL_DISPATCHER +#include "extras/intel_dispatcher.h" +#endif //#define DEBUG_MB_TYPE @@ -471,12 +474,12 @@ int i_csp = h->param.i_csp & X264_CSP_MASK; #if X264_CHROMA_FORMAT - if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp <= X264_CSP_NV12 ) + if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" ); return -1; } - else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 ) + else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" ); return -1; @@ -489,36 +492,41 @@ #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } - if( i_csp < X264_CSP_I444 && h->param.i_width % 2 ) + int w_mod = i_csp < X264_CSP_I444 ? 2 : 1; + int h_mod = (i_csp < X264_CSP_I422 ? 2 : 1) << PARAM_INTERLACED; + if( h->param.i_width % w_mod ) { - x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "width not divisible by %d (%dx%d)\n", + w_mod, h->param.i_width, h->param.i_height ); return -1; } - - if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 ) + if( h->param.i_height % h_mod ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "height not divisible by %d (%dx%d)\n", + h_mod, h->param.i_width, h->param.i_height ); return -1; } - if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 ) + if( h->param.crop_rect.i_left >= h->param.i_width || + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top >= h->param.i_height || + h->param.crop_rect.i_bottom >= h->param.i_height || + h->param.crop_rect.i_left + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top + h->param.crop_rect.i_bottom >= h->param.i_height ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom ); return -1; } - - if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width || - (h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height ) + if( h->param.crop_rect.i_left % w_mod || h->param.crop_rect.i_right % w_mod || + h->param.crop_rect.i_top % h_mod || h->param.crop_rect.i_bottom % h_mod ) { - x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, - h->param.crop_rect.i_top, h->param.crop_rect.i_right, 
h->param.crop_rect.i_bottom ); + x264_log( h, X264_LOG_ERROR, "crop-rect %u,%u,%u,%u not divisible by %dx%d\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom, w_mod, h_mod ); return -1; } @@ -529,7 +537,13 @@ } if( h->param.i_threads == X264_THREADS_AUTO ) + { h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; + /* Avoid too many threads as they don't improve performance and + * complicate VBV. Capped at an arbitrary 2 rows per thread. */ + int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 2 ); + h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); + } int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -583,7 +597,20 @@ h->param.i_dpb_size = 1; } - h->param.i_frame_packing = x264_clip3( h->param.i_frame_packing, -1, 5 ); + if( h->param.i_frame_packing < -1 || h->param.i_frame_packing > 7 ) + { + x264_log( h, X264_LOG_WARNING, "ignoring unknown frame packing value\n" ); + h->param.i_frame_packing = -1; + } + if( h->param.i_frame_packing == 7 && + ((h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right) % 3 || + (h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom) % 3) ) + { + x264_log( h, X264_LOG_ERROR, "cropped resolution %dx%d not compatible with tile format frame packing\n", + h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right, + h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom ); + return -1; + } /* Detect default ffmpeg settings and terminate with an error. */ if( b_open ) @@ -1050,7 +1077,7 @@ h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); - h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 ); + h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 3 ); h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 ); if( h->param.rc.f_aq_strength == 0 ) h->param.rc.i_aq_mode = 0; @@ -1390,6 +1417,10 @@ if( param->param_free ) param->param_free( param ); +#if HAVE_INTEL_DISPATCHER + x264_intel_dispatcher_override(); +#endif + if( x264_threading_init() ) { x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" ); @@ -1676,6 +1707,7 @@ else if( !x264_is_regular_file( f ) ) { x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv ); + fclose( f ); goto fail; } fclose( f ); @@ -3213,6 +3245,12 @@ /* ------------------- Setup new frame from picture -------------------- */ if( pic_in != NULL ) { + if( h->lookahead->b_exit_thread ) + { + x264_log( h, X264_LOG_ERROR, "lookahead thread is already stopped\n" ); + return -1; + } + /* 1: Copy the picture to a frame and move it to a buffer */ x264_frame_t *fenc = x264_frame_pop_unused( h, 0 ); if( !fenc ) @@ -4087,14 +4125,14 @@ if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, "mb I %s\n", buf ); } if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; + double i_count = 
(double)h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P]; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, @@ -4110,7 +4148,7 @@ if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; double i_mb_list_count; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B]; int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
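Among the larger changes in the encoder.c hunk above: the hard-coded "width divisible by 2 / height divisible by 4" checks are replaced by per-colorspace moduli that also cover the crop rectangle, frame packing value 7 (tile format) is accepted, AQ mode 3 is allowed, and automatic thread counts are capped at roughly one thread per two macroblock rows (so a 1080p encode, 68 rows, now gets at most 34 frame threads regardless of core count). A minimal standalone sketch of the new divisibility rule, with an invented helper name — the real check lives inside encoder parameter validation and uses PARAM_INTERLACED:

    #include <x264.h>

    /* Width must be even for 4:2:0/4:2:2 input, height must be even for 4:2:0
     * (doubled again for interlaced streams), mirroring the hunk above. */
    static int csp_mod_ok( int csp, int width, int height, int interlaced )
    {
        int i_csp = csp & X264_CSP_MASK;
        int w_mod = i_csp < X264_CSP_I444 ? 2 : 1;
        int h_mod = (i_csp < X264_CSP_I422 ? 2 : 1) << !!interlaced;
        return width % w_mod == 0 && height % h_mod == 0;
    }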
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * lookahead.c: high-level lookahead functions ***************************************************************************** - * Copyright (C) 2010-2014 Avail Media and x264 project + * Copyright (C) 2010-2015 Avail Media and x264 project * * Authors: Michael Kazmier <mkazmier@availmedia.com> * Alex Giladi <agiladi@availmedia.com>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.c: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.h: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.c: ratecontrol ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Michael Niedermayer <michaelni@gmx.at> @@ -96,6 +96,7 @@ /* VBV stuff */ double buffer_size; int64_t buffer_fill_final; + int64_t buffer_fill_final_min; double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */ double buffer_rate; /* # of bits added to buffer_fill after each frame */ double vbv_max_rate; /* # of bits added to buffer_fill per second */ @@ -301,10 +302,6 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets ) { - /* constants chosen to result in approximately the same overall bitrate as without AQ. - * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ - float strength; - float avg_adj = 0.f; /* Initialize frame stats */ for( int i = 0; i < 3; i++ ) { @@ -348,23 +345,30 @@ /* Actual adaptive quantization */ else { - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + /* constants chosen to result in approximately the same overall bitrate as without AQ. + * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ + float strength; + float avg_adj = 0.f; + float bias_strength = 0.f; + + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE || h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) { - float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f); + float bit_depth_correction = 1.f / (1 << (2*(BIT_DEPTH-8))); float avg_adj_pow2 = 0.f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) { uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame ); - float qp_adj = powf( energy + 1, 0.125f ); + float qp_adj = powf( energy * bit_depth_correction + 1, 0.125f ); frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj; avg_adj += qp_adj; avg_adj_pow2 += qp_adj * qp_adj; } avg_adj /= h->mb.i_mb_count; avg_adj_pow2 /= h->mb.i_mb_count; - strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction; - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj; + strength = h->param.rc.f_aq_strength * avg_adj; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj; + bias_strength = h->param.rc.f_aq_strength; } else strength = h->param.rc.f_aq_strength * 1.0397f; @@ -374,7 +378,12 @@ { float qp_adj; int mb_xy = mb_x + mb_y*h->mb.i_mb_stride; - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) + { + qp_adj = frame->f_qp_offset[mb_xy]; + qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 14.f / (qp_adj * qp_adj)); + } + else if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) { qp_adj = frame->f_qp_offset[mb_xy]; qp_adj = strength * (qp_adj - avg_adj); @@ -724,7 +733,8 @@ if( h->param.rc.f_vbv_buffer_init > 1. 
) h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 ); h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1); - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; + rc->buffer_fill_final = + rc->buffer_fill_final_min = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; rc->b_vbv = 1; rc->b_vbv_min_rate = !rc->b_2pass && h->param.rc.i_rc_method == X264_RC_ABR @@ -776,11 +786,11 @@ if( h->param.i_nal_hrd ) { uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale; - uint64_t num = 180000; + uint64_t num = 90000; x264_reduce_fraction64( &num, &denom ); - rc->hrd_multiply_denom = 180000 / num; + rc->hrd_multiply_denom = 90000 / num; - double bits_required = log2( 180000 / rc->hrd_multiply_denom ) + double bits_required = log2( 90000 / rc->hrd_multiply_denom ) + log2( h->sps->vui.i_time_scale ) + log2( h->sps->vui.hrd.i_cpb_size_unscaled ); if( bits_required >= 63 ) @@ -822,6 +832,7 @@ int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1; CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds ); CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) ); + static const float pred_coeff_table[3] = { 1.0, 1.0, 1.5 }; for( int i = 0; i < 3; i++ ) { rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP ); @@ -829,8 +840,8 @@ rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max ); for( int j = 0; j < num_preds; j++ ) { - rc->pred[i+j*5].coeff_min = 2.0 / 4; - rc->pred[i+j*5].coeff = 2.0; + rc->pred[i+j*5].coeff_min = pred_coeff_table[i] / 2; + rc->pred[i+j*5].coeff = pred_coeff_table[i]; rc->pred[i+j*5].count = 1.0; rc->pred[i+j*5].decay = 0.5; rc->pred[i+j*5].offset = 0.0; @@ -844,7 +855,11 @@ rc->row_preds[i][j].offset = 0.0; } } - *rc->pred_b_from_p = rc->pred[0]; + rc->pred_b_from_p->coeff_min = 0.5 / 2; + rc->pred_b_from_p->coeff = 0.5; + rc->pred_b_from_p->count = 1.0; + rc->pred_b_from_p->decay = 0.5; + rc->pred_b_from_p->offset = 0.0; if( parse_zones( h ) < 0 ) { @@ -1914,15 +1929,16 @@ h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; - double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; if( h->fenc->b_keyframe ) { - rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; - rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; - rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; + rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; + rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; + rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; } - else - cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; + + double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; + if( !h->fenc->b_keyframe ) + cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; if( h->sps->vui.hrd.b_cbr_hrd ) h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time; @@ -2095,7 +2111,7 @@ int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled; x264_ratecontrol_t *rcc = h->rc; x264_ratecontrol_t *rct = h->thread[0]->rc; - uint64_t buffer_size = 
(uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; + int64_t buffer_size = (int64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; if( rcc->last_satd >= h->mb.i_mb_count ) update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits ); @@ -2103,32 +2119,45 @@ if( !rcc->b_vbv ) return filler; - rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale; + uint64_t buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; + rct->buffer_fill_final -= buffer_diff; + rct->buffer_fill_final_min -= buffer_diff; - if( rct->buffer_fill_final < 0 ) + if( rct->buffer_fill_final_min < 0 ) { - double underflow = (double)rct->buffer_fill_final / h->sps->vui.i_time_scale; + double underflow = (double)rct->buffer_fill_final_min / h->sps->vui.i_time_scale; if( rcc->rate_factor_max_increment && rcc->qpm >= rcc->qp_novbv + rcc->rate_factor_max_increment ) x264_log( h, X264_LOG_DEBUG, "VBV underflow due to CRF-max (frame %d, %.0f bits)\n", h->i_frame, underflow ); else x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, underflow ); + rct->buffer_fill_final = + rct->buffer_fill_final_min = 0; } - rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 ); if( h->param.i_avcintra_class ) - rct->buffer_fill_final += buffer_size; + buffer_diff = buffer_size; else - rct->buffer_fill_final += (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; - - if( h->param.rc.b_filler && rct->buffer_fill_final > buffer_size ) - { - int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; - filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale; - bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8; - rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale; + buffer_diff = (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; + rct->buffer_fill_final += buffer_diff; + rct->buffer_fill_final_min += buffer_diff; + + if( rct->buffer_fill_final > buffer_size ) + { + if( h->param.rc.b_filler ) + { + int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; + filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale; + bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8; + buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; + rct->buffer_fill_final -= buffer_diff; + rct->buffer_fill_final_min -= buffer_diff; + } + else + { + rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size ); + rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, buffer_size ); + } } - else - rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size ); return filler; } @@ -2139,23 +2168,27 @@ uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale / rct->hrd_multiply_denom; uint64_t cpb_state = rct->buffer_fill_final; uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; - uint64_t multiply_factor = 180000 / rct->hrd_multiply_denom; + uint64_t multiply_factor = 90000 / rct->hrd_multiply_denom; - if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > cpb_size ) + if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > (int64_t)cpb_size ) { - x264_log( h, X264_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n", - rct->buffer_fill_final < 0 ? 
"underflow" : "overflow", (float)rct->buffer_fill_final/denom, (float)cpb_size/denom ); + x264_log( h, X264_LOG_WARNING, "CPB %s: %.0f bits in a %.0f-bit buffer\n", + rct->buffer_fill_final < 0 ? "underflow" : "overflow", + (double)rct->buffer_fill_final / h->sps->vui.i_time_scale, (double)cpb_size / h->sps->vui.i_time_scale ); } - h->initial_cpb_removal_delay = (multiply_factor * cpb_state + denom) / (2*denom); - h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size + denom) / (2*denom) - h->initial_cpb_removal_delay; + h->initial_cpb_removal_delay = (multiply_factor * cpb_state) / denom; + h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size) / denom - h->initial_cpb_removal_delay; + + int64_t decoder_buffer_fill = h->initial_cpb_removal_delay * denom / multiply_factor; + rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, decoder_buffer_fill ); } // provisionally update VBV according to the planned size of all frames currently in progress static void update_vbv_plan( x264_t *h, int overhead ) { x264_ratecontrol_t *rcc = h->rc; - rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final / h->sps->vui.i_time_scale; + rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final_min / h->sps->vui.i_time_scale; if( h->i_thread_frames > 1 ) { int j = h->rc - h->thread[0]->rc;
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.h: ratecontrol ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/rdo.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rdo.c: rate-distortion optimization ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com> @@ -180,7 +180,7 @@ else { x264_macroblock_size_cavlc( h ); - i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; + i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; } h->mb.b_transform_8x8 = b_transform_bak; @@ -261,7 +261,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; + i_bits = (uint64_t)x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -297,7 +297,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -331,7 +331,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -357,7 +357,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_chroma_size_cavlc( h ) * i_lambda2; + i_bits = (uint64_t)x264_chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; }
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -166,7 +166,7 @@ while( (1 << sps->i_log2_max_frame_num) <= max_frame_num ) sps->i_log2_max_frame_num++; - sps->i_poc_type = param->i_bframe || param->b_interlaced ? 0 : 2; + sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2; if( sps->i_poc_type == 0 ) { int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2; @@ -578,7 +578,7 @@ memcpy( payload, uuid, 16 ); sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - " - "Copy%s 2003-2014 - http://www.videolan.org/x264.html - options: %s", + "Copy%s 2003-2015 - http://www.videolan.org/x264.html - options: %s", X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts ); length = strlen(payload)+1; @@ -663,7 +663,7 @@ bs_write1( &q, quincunx_sampling_flag ); // quincunx_sampling_flag // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right - bs_write ( &q, 6, 1 ); // content_interpretation_type + bs_write ( &q, 6, h->param.i_frame_packing != 6 ); // content_interpretation_type bs_write1( &q, 0 ); // spatial_flipping_flag bs_write1( &q, 0 ); // frame0_flipped_flag
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype-cl.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype-cl.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Steve Borho <sborho@multicorewareinc.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype.c: lookahead analysis ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> @@ -612,7 +612,6 @@ if( b_bidir ) { - int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] ); m[1].i_pixel = PIXEL_8x8; @@ -624,14 +623,20 @@ LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); m[1].p_fref_w = m[1].p_fref[0]; - dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; - dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; - dmv[1][0] = dmv[0][0] - mvr[0]; - dmv[1][1] = dmv[0][1] - mvr[1]; - CLIP_MV( dmv[0] ); - CLIP_MV( dmv[1] ); - if( h->param.analyse.i_subpel_refine <= 1 ) - M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF ) + { + int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; + dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; + dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; + dmv[1][0] = dmv[0][0] - mvr[0]; + dmv[1][1] = dmv[0][1] - mvr[1]; + CLIP_MV( dmv[0] ); + CLIP_MV( dmv[1] ); + if( h->param.analyse.i_subpel_refine <= 1 ) + M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + } + else + M64( dmv ) = 0; TRY_BIDIR( dmv[0], dmv[1], 0 ); if( M64( dmv ) ) @@ -1104,7 +1109,7 @@ if( b_intra ) x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 ); - while( i > 0 && frames[i]->i_type == X264_TYPE_B ) + while( i > 0 && IS_X264_TYPE_B( frames[i]->i_type ) ) i--; last_nonb = i; @@ -1132,7 +1137,7 @@ while( i-- > idx ) { cur_nonb = i; - while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 ) + while( IS_X264_TYPE_B( frames[cur_nonb]->i_type ) && cur_nonb > 0 ) cur_nonb--; if( cur_nonb < idx ) break; @@ -1226,7 +1231,7 @@ int last_nonb = 0, cur_nonb = 1, idx = 0; x264_frame_t *prev_frame = NULL; int prev_frame_idx = 0; - while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb < num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; int next_nonb = keyframe ? last_nonb : cur_nonb; @@ -1278,7 +1283,7 @@ } last_nonb = cur_nonb; cur_nonb++; - while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb <= num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; } frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO; @@ -1288,36 +1293,39 @@ { int loc = 1; int cost = 0; - int cur_p = 0; + int cur_nonb = 0; path--; /* Since the 1st path element is really the second frame */ while( path[loc] ) { - int next_p = loc; - /* Find the location of the next P-frame. */ - while( path[next_p] != 'P' ) - next_p++; - - /* Add the cost of the P-frame found above */ - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 ); + int next_nonb = loc; + /* Find the location of the next non-B-frame. 
*/ + while( path[next_nonb] == 'B' ) + next_nonb++; + + /* Add the cost of the non-B-frame found above */ + if( path[next_nonb] == 'P' ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_nonb, 0 ); + else /* I-frame */ + cost += x264_slicetype_frame_cost( h, a, frames, next_nonb, next_nonb, next_nonb, 0 ); /* Early terminate if the cost we have found is larger than the best path cost so far */ if( cost > threshold ) break; - if( h->param.i_bframe_pyramid && next_p - cur_p > 2 ) + if( h->param.i_bframe_pyramid && next_nonb - cur_nonb > 2 ) { - int middle = cur_p + (next_p - cur_p)/2; - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 ); + int middle = cur_nonb + (next_nonb - cur_nonb)/2; + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, middle, 0 ); for( int next_b = loc; next_b < middle && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 ); - for( int next_b = middle+1; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 ); + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, middle, next_b, 0 ); + for( int next_b = middle+1; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, middle, next_nonb, next_b, 0 ); } else - for( int next_b = loc; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 ); + for( int next_b = loc; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_b, 0 ); - loc = next_p + 1; - cur_p = next_p; + loc = next_nonb + 1; + cur_nonb = next_nonb; } return cost; } @@ -1331,6 +1339,7 @@ char paths[2][X264_LOOKAHEAD_MAX+1]; int num_paths = X264_MIN( h->param.i_bframe+1, length ); int best_cost = COST_MAX; + int best_possible = 0; int idx = 0; /* Iterate over all currently possible paths */ @@ -1342,12 +1351,33 @@ memset( paths[idx]+len, 'B', path ); strcpy( paths[idx]+len+path, "P" ); - /* Calculate the actual cost of the current path */ - int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); - if( cost < best_cost ) + int possible = 1; + for( int i = 1; i <= length; i++ ) { - best_cost = cost; - idx ^= 1; + int i_type = frames[i]->i_type; + if( i_type == X264_TYPE_AUTO ) + continue; + if( IS_X264_TYPE_B( i_type ) ) + possible = possible && (i < len || i == length || paths[idx][i-1] == 'B'); + else + { + possible = possible && (i < len || paths[idx][i-1] != 'B'); + paths[idx][i-1] = IS_X264_TYPE_I( i_type ) ? 
'I' : 'P'; + } + } + + if( possible || !best_possible ) + { + if( possible && !best_possible ) + best_cost = COST_MAX; + /* Calculate the actual cost of the current path */ + int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); + if( cost < best_cost ) + { + best_cost = cost; + best_possible = possible; + idx ^= 1; + } } } @@ -1441,13 +1471,15 @@ return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } +#define IS_X264_TYPE_AUTO_OR_I(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_I(x)) +#define IS_X264_TYPE_AUTO_OR_B(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_B(x)) + void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; int num_frames, orig_num_frames, keyint_limit, framecnt; int i_mb_count = NUM_MBS; - int cost1p0, cost2p0, cost1b1, cost2p1; int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead; /* For determinism we should limit the search to the number of frames lookahead has for sure @@ -1463,7 +1495,7 @@ if( !h->lookahead->last_nonb ) return; frames[0] = h->lookahead->last_nonb; - for( framecnt = 0; framecnt < i_max_search && h->lookahead->next.list[framecnt]->i_type == X264_TYPE_AUTO; framecnt++ ) + for( framecnt = 0; framecnt < i_max_search; framecnt++ ) frames[framecnt+1] = h->lookahead->next.list[framecnt]; x264_lowres_context_init( h, &a ); @@ -1492,12 +1524,11 @@ return; } - int num_bframes = 0; - int num_analysed_frames = num_frames; - int reset_start; - if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) ) + if( IS_X264_TYPE_AUTO_OR_I( frames[1]->i_type ) && + h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) ) { - frames[1]->i_type = X264_TYPE_I; + if( frames[1]->i_type == X264_TYPE_AUTO ) + frames[1]->i_type = X264_TYPE_I; return; } @@ -1505,6 +1536,23 @@ x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda ); #endif + /* Replace forced keyframes with I/IDR-frames */ + for( int j = 1; j <= num_frames; j++ ) + { + if( frames[j]->i_type == X264_TYPE_KEYFRAME ) + frames[j]->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR; + } + + /* Close GOP at IDR-frames */ + for( int j = 2; j <= num_frames; j++ ) + { + if( frames[j]->i_type == X264_TYPE_IDR && IS_X264_TYPE_AUTO_OR_B( frames[j-1]->i_type ) ) + frames[j-1]->i_type = X264_TYPE_P; + } + + int num_analysed_frames = num_frames; + int reset_start; + if( h->param.i_bframe ) { if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) @@ -1518,96 +1566,147 @@ for( int j = 2; j <= num_frames; j++ ) x264_slicetype_path( h, &a, frames, j, best_paths ); - num_bframes = strspn( best_paths[best_path_index], "B" ); /* Load the results of the analysis into the frame types. */ for( int j = 1; j < num_frames; j++ ) - frames[j]->i_type = best_paths[best_path_index][j-1] == 'B' ? 
X264_TYPE_B : X264_TYPE_P; + { + if( best_paths[best_path_index][j-1] != 'B' ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + } + else + { + if( frames[j]->i_type == X264_TYPE_AUTO ) + frames[j]->i_type = X264_TYPE_B; + } + } } - frames[num_frames]->i_type = X264_TYPE_P; } else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST ) { - for( int i = 0; i <= num_frames-2; ) + int last_nonb = 0; + int num_bframes = h->param.i_bframe; + for( int j = 1; j < num_frames; j++ ) { - cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 ); - if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 ) + if( j-1 > 0 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) + num_bframes--; + else { - frames[i+1]->i_type = X264_TYPE_P; - frames[i+2]->i_type = X264_TYPE_P; - i += 2; + last_nonb = j-1; + num_bframes = h->param.i_bframe; + } + if( !num_bframes ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; continue; } -#if HAVE_OPENCL - if( h->param.b_opencl ) + if( frames[j]->i_type != X264_TYPE_AUTO ) + continue; + + if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) { - int b_work_done = 0; - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 ); - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 ); - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 ); - if( b_work_done ) - x264_opencl_flush( h ); + frames[j]->i_type = X264_TYPE_P; + continue; } + + if( j - last_nonb <= 1 ) + { + int cost2p1 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+1, j+1, 1 ); + if( frames[j+1]->i_intra_mbs[2] > i_mb_count / 2 ) + { + frames[j]->i_type = X264_TYPE_P; + continue; + } + +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + int b_work_done = 0; + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+0, j+1, j+0 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+0, j+0, j+0 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+1, j+1, j+1 ); + if( b_work_done ) + x264_opencl_flush( h ); + } #endif - cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 ); - cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 ); - cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 ); + int cost1b1 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+1, j+0, 0 ); + int cost1p0 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+0, j+0, 0 ); + int cost2p0 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+1, j+1, j+1, 0 ); - if( cost1p0 + cost2p0 < cost1b1 + cost2p1 ) - { - frames[i+1]->i_type = X264_TYPE_P; - i += 1; + if( cost1p0 + cost2p0 < cost1b1 + cost2p1 ) + { + frames[j]->i_type = X264_TYPE_P; + continue; + } + frames[j]->i_type = X264_TYPE_B; continue; } // arbitrary and untuned #define INTER_THRESH 300 #define P_SENS_BIAS (50 - h->param.i_bframe_bias) - frames[i+1]->i_type = X264_TYPE_B; - int j; - for( j = i+2; j <= X264_MIN( i+h->param.i_bframe, num_frames-1 ); j++ ) - { - int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10); - int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 ); - if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 ) - break; + int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-last_nonb-1), INTER_THRESH/10); + int pcost = x264_slicetype_frame_cost( h, &a, frames, 
last_nonb, j+1, j+1, 1 ); + if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-last_nonb+1] > i_mb_count/3 ) + frames[j]->i_type = X264_TYPE_P; + else frames[j]->i_type = X264_TYPE_B; - } - frames[j]->i_type = X264_TYPE_P; - i = j; } - frames[num_frames]->i_type = X264_TYPE_P; - num_bframes = 0; - while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B ) - num_bframes++; } else { - num_bframes = X264_MIN(num_frames-1, h->param.i_bframe); + int num_bframes = h->param.i_bframe; for( int j = 1; j < num_frames; j++ ) - frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P; - frames[num_frames]->i_type = X264_TYPE_P; + { + if( !num_bframes ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + } + else if( frames[j]->i_type == X264_TYPE_AUTO ) + { + if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + else + frames[j]->i_type = X264_TYPE_B; + } + if( IS_X264_TYPE_B( frames[j]->i_type ) ) + num_bframes--; + else + num_bframes = h->param.i_bframe; + } } + if( IS_X264_TYPE_AUTO_OR_B( frames[num_frames]->i_type ) ) + frames[num_frames]->i_type = X264_TYPE_P; + + int num_bframes = 0; + while( num_bframes < num_frames && IS_X264_TYPE_B( frames[num_bframes+1]->i_type ) ) + num_bframes++; /* Check scenecut on the first minigop. */ for( int j = 1; j < num_bframes+1; j++ ) - if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) ) + { + if( frames[j]->i_forced_type == X264_TYPE_AUTO && IS_X264_TYPE_AUTO_OR_I( frames[j+1]->i_forced_type ) && + h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) ) { frames[j]->i_type = X264_TYPE_P; num_analysed_frames = j; break; } + } reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 ); } else { for( int j = 1; j <= num_frames; j++ ) - frames[j]->i_type = X264_TYPE_P; + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; reset_start = !keyframe + 1; - num_bframes = 0; } /* Perform the actual macroblock tree analysis. @@ -1617,21 +1716,63 @@ /* Enforce keyframe limit. */ if( !h->param.b_intra_refresh ) - for( int i = keyint_limit+1; i <= num_frames; i += h->param.i_keyint_max ) + { + int last_keyframe = h->lookahead->i_last_keyframe; + int last_possible = 0; + for( int j = 1; j <= num_frames; j++ ) { - frames[i]->i_type = X264_TYPE_I; - reset_start = X264_MIN( reset_start, i+1 ); - if( h->param.b_open_gop && h->param.b_bluray_compat ) - while( IS_X264_TYPE_B( frames[i-1]->i_type ) ) - i--; + x264_frame_t *frm = frames[j]; + int keyframe_dist = frm->i_frame - last_keyframe; + + if( IS_X264_TYPE_AUTO_OR_I( frm->i_forced_type ) ) + { + if( h->param.b_open_gop || !IS_X264_TYPE_B( frames[j-1]->i_forced_type ) ) + last_possible = j; + } + if( keyframe_dist >= h->param.i_keyint_max ) + { + if( last_possible != 0 && last_possible != j ) + { + j = last_possible; + frm = frames[j]; + keyframe_dist = frm->i_frame - last_keyframe; + } + last_possible = 0; + if( frm->i_type != X264_TYPE_IDR ) + frm->i_type = h->param.b_open_gop ? 
X264_TYPE_I : X264_TYPE_IDR; + } + if( frm->i_type == X264_TYPE_I && keyframe_dist >= h->param.i_keyint_min ) + { + if( h->param.b_open_gop ) + { + last_keyframe = frm->i_frame; + if( h->param.b_bluray_compat ) + { + // Use bluray order + int bframes = 0; + while( bframes < j-1 && IS_X264_TYPE_B( frames[j-1-bframes]->i_type ) ) + bframes++; + last_keyframe -= bframes; + } + } + else if( frm->i_forced_type != X264_TYPE_I ) + frm->i_type = X264_TYPE_IDR; + } + if( frm->i_type == X264_TYPE_IDR ) + { + last_keyframe = frm->i_frame; + if( j > 1 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) + frames[j-1]->i_type = X264_TYPE_P; + } } + } if( vbv_lookahead ) x264_vbv_lookahead( h, &a, frames, num_frames, keyframe ); /* Restore frametypes for all frames that haven't actually been decided yet. */ for( int j = reset_start; j <= num_frames; j++ ) - frames[j]->i_type = X264_TYPE_AUTO; + frames[j]->i_type = frames[j]->i_forced_type; #if HAVE_OPENCL x264_opencl_slicetype_end( h ); @@ -1695,6 +1836,14 @@ for( bframes = 0, brefs = 0;; bframes++ ) { frm = h->lookahead->next.list[bframes]; + + if( frm->i_forced_type != X264_TYPE_AUTO && frm->i_type != frm->i_forced_type && + !(frm->i_forced_type == X264_TYPE_KEYFRAME && IS_X264_TYPE_I( frm->i_type )) ) + { + x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d was changed to frame type (%d)\n", + frm->i_forced_type, frm->i_frame, frm->i_type ); + } + if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL && brefs == h->param.i_bframe_pyramid ) {
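A large part of the slicetype.c hunk above is about honouring frame types forced by the caller instead of overwriting them during lookahead, replacing X264_TYPE_KEYFRAME with I or IDR as appropriate, and warning when a forced type cannot be kept. On the API side such forcing is done through x264_picture_t before the picture is submitted; a minimal sketch (surrounding encoder setup omitted):

    #include <x264.h>

    /* Ask the encoder to start a new GOP at this picture. The lookahead rework
     * above now preserves forced types where possible and warns when it cannot. */
    static void mark_forced_keyframe( x264_picture_t *pic )
    {
        pic->i_type = X264_TYPE_IDR;   /* or X264_TYPE_KEYFRAME to let open-GOP choose I vs IDR */
    }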
View file
x264-snapshot-20141218-2245.tar.bz2/example.c -> x264-snapshot-20150804-2245.tar.bz2/example.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * example.c: libx264 API usage example ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> * @@ -24,26 +24,14 @@ *****************************************************************************/ #ifdef _WIN32 -/* The following two defines must be located before the inclusion of any system header files. */ -#define WINVER 0x0500 -#define _WIN32_WINNT 0x0500 -#include <windows.h> #include <io.h> /* _setmode() */ #include <fcntl.h> /* _O_BINARY */ #endif #include <stdint.h> #include <stdio.h> -#include <signal.h> #include <x264.h> -/* Ctrl-C handler */ -static volatile int b_ctrl_c = 0; -static void sigint_handler( int a ) -{ - b_ctrl_c = 1; -} - #define FAIL_IF_ERROR( cond, ... )\ do\ {\ @@ -72,9 +60,6 @@ _setmode( _fileno( stderr ), _O_BINARY ); #endif - /* Control-C handler */ - signal( SIGINT, sigint_handler ); - FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" ); FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" ); @@ -105,17 +90,17 @@ #undef fail #define fail fail3 + int luma_size = width * height; + int chroma_size = luma_size / 4; /* Encode frames */ - for( ; !b_ctrl_c; i_frame++ ) + for( ;; i_frame++ ) { /* Read input frame */ - int plane_size = width * height; - if( fread( pic.img.plane[0], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[0], 1, luma_size, stdin ) != luma_size ) break; - plane_size = ((width + 1) >> 1) * ((height + 1) >> 1); - if( fread( pic.img.plane[1], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != chroma_size ) break; - if( fread( pic.img.plane[2], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[2], 1, chroma_size, stdin ) != chroma_size ) break; pic.i_pts = i_frame; @@ -129,7 +114,7 @@ } } /* Flush delayed frames */ - while( !b_ctrl_c && x264_encoder_delayed_frames( h ) ) + while( x264_encoder_delayed_frames( h ) ) { i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out ); if( i_frame_size < 0 )
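The example.c rework above reads each I420 frame as one luma plane plus two quarter-size chroma planes with fixed sizes, rather than recomputing the plane size on every iteration, and drops the Ctrl-C handler. As a standalone arithmetic check of those sizes for the 352x288 resolution in the usage string (this is not part of example.c itself):

    #include <stdio.h>

    int main( void )
    {
        int width = 352, height = 288;
        int luma_size   = width * height;   /* 101376 bytes */
        int chroma_size = luma_size / 4;    /*  25344 bytes */
        /* One raw I420 frame on stdin is Y + U + V: */
        printf( "frame size = %d bytes\n", luma_size + 2 * chroma_size );   /* 152064 */
        return 0;
    }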
View file
x264-snapshot-20141218-2245.tar.bz2/extras/avxsynth_c.h -> x264-snapshot-20150804-2245.tar.bz2/extras/avxsynth_c.h
Changed
@@ -33,8 +33,12 @@ #ifndef __AVXSYNTH_C__ #define __AVXSYNTH_C__ -#include "windowsPorts/windows2linux.h" #include <stdarg.h> +#include <stdint.h> + +typedef int64_t INT64; +#define __stdcall +#define __declspec(x) #ifdef __cplusplus # define EXTERN_C extern "C" @@ -64,12 +68,6 @@ # endif #endif -#ifdef __GNUC__ -typedef long long int INT64; -#else -typedef __int64 INT64; -#endif - ///////////////////////////////////////////////////////////////////// //
View file
x264-snapshot-20150804-2245.tar.bz2/extras/intel_dispatcher.h
Added
@@ -0,0 +1,46 @@ +/***************************************************************************** + * intel_dispatcher.h: intel compiler cpu dispatcher override + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_INTEL_DISPATCHER_H +#define X264_INTEL_DISPATCHER_H + +/* Feature flags using _FEATURE_* defines from immintrin.h */ +extern unsigned long long __intel_cpu_feature_indicator; +extern unsigned long long __intel_cpu_feature_indicator_x; + +/* CPU vendor independent version of dispatcher */ +void __intel_cpu_features_init_x( void ); + +static void x264_intel_dispatcher_override( void ) +{ + if( __intel_cpu_feature_indicator & ~1ULL ) + return; + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; +} + +#endif
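The new extras/intel_dispatcher.h is only pulled in when the build defines HAVE_INTEL_DISPATCHER (an Intel compiler build); the matching call site is in the encoder.c hunk earlier in this changeset. For reference, the usage pattern is simply the following — a hedged sketch with the guard macro taken from that hunk, not a new entry point:

    #if HAVE_INTEL_DISPATCHER
    #include "extras/intel_dispatcher.h"
    #endif

    static void init_cpu_dispatch( void )
    {
    #if HAVE_INTEL_DISPATCHER
        /* Replace the Intel compiler's vendor-checking CPU dispatcher with the
         * vendor-independent one before any dispatched code path runs. */
        x264_intel_dispatcher_override();
    #endif
    }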
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.c -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.c: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.h -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.h: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/cache.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/cache.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cache.c: cache video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/crop.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/crop.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * crop.c: crop video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * James Darnley <james.darnley@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/depth.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/depth.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * depth.c: bit-depth conversion video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Oskar Arvidsson <oskar@irock.se> * @@ -50,6 +50,7 @@ csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16 || csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB || @@ -59,7 +60,7 @@ static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : + return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : csp_mask == X264_CSP_BGRA ? 4 : 1; @@ -73,10 +74,10 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \ int width, int height, int16_t *errors ) \ { \ - const int lshift = 16-BIT_DEPTH; \ - const int rshift = 16-BIT_DEPTH+2; \ - const int half = 1 << (16-BIT_DEPTH+1); \ - const int pixel_max = (1 << BIT_DEPTH)-1; \ + const int lshift = 16-X264_BIT_DEPTH; \ + const int rshift = 16-X264_BIT_DEPTH+2; \ + const int half = 1 << (16-X264_BIT_DEPTH+1); \ + const int pixel_max = (1 << X264_BIT_DEPTH)-1; \ memset( errors, 0, (width+1) * sizeof(int16_t) ); \ for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \ { \ @@ -136,7 +137,7 @@ static void scale_image( cli_image_t *output, cli_image_t *img ) { int csp_mask = img->csp & X264_CSP_MASK; - const int shift = BIT_DEPTH - 8; + const int shift = X264_BIT_DEPTH - 8; for( int i = 0; i < img->planes; i++ ) { uint8_t *src = img->plane[i]; @@ -216,7 +217,7 @@ ret = 1; } - FAIL_IF_ERROR( bit_depth != BIT_DEPTH, "this build supports only bit depth %d\n", BIT_DEPTH ) + FAIL_IF_ERROR( bit_depth != X264_BIT_DEPTH, "this build supports only bit depth %d\n", X264_BIT_DEPTH ) FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" ) /* only add the filter to the chain if it's needed */
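The depth.c hunk above renames the internal BIT_DEPTH uses to the exported X264_BIT_DEPTH and teaches the filter about NV21. As a quick check of what the dither constants in the macro evaluate to, assuming a 10-bit build (for an 8-bit build they come out as 8, 10, 512 and 255):

    /* For X264_BIT_DEPTH == 10 the dither constants above evaluate to: */
    enum
    {
        LSHIFT    = 16 - 10,             /*    6 */
        RSHIFT    = 16 - 10 + 2,         /*    8 */
        HALF      = 1 << (16 - 10 + 1),  /*  128 */
        PIXEL_MAX = (1 << 10) - 1        /* 1023 */
    };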
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/fix_vfr_pts.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/fix_vfr_pts.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * fix_vfr_pts.c: vfr pts fixing video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.c: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.h: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/resize.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/resize.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * resize.c: resize video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -156,6 +156,7 @@
         case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA;
         /* the next csp has no equivalent 16bit depth in swscale */
         case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12;
+        case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21;
         /* the next csp is no supported by swscale at all */
         case X264_CSP_NV16:
         default:            return AV_PIX_FMT_NONE;
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * select_every.c: select-every video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/source.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/source.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * source.c: source video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * video.c: video filters
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * video.h: video filters
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/input/avs.c -> x264-snapshot-20150804-2245.tar.bz2/input/avs.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * avs.c: avisynth input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -27,15 +27,15 @@
 #if USE_AVXSYNTH
 #include <dlfcn.h>
 #if SYS_MACOSX
-#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW )
+#define avs_open() dlopen( "libavxsynth.dylib", RTLD_NOW )
 #else
-#define avs_open dlopen( "libavxsynth.so", RTLD_NOW )
+#define avs_open() dlopen( "libavxsynth.so", RTLD_NOW )
 #endif
 #define avs_close dlclose
 #define avs_address dlsym
 #else
 #include <windows.h>
-#define avs_open LoadLibraryW( L"avisynth" )
+#define avs_open() LoadLibraryW( L"avisynth" )
 #define avs_close FreeLibrary
 #define avs_address GetProcAddress
 #endif
@@ -80,7 +80,7 @@
 {
     AVS_Clip *clip;
     AVS_ScriptEnvironment *env;
-    HMODULE library;
+    void *library;
     int num_frames;
     struct
     {
@@ -102,7 +102,7 @@
 /* load the library and functions we require from it */
 static int x264_avs_load_library( avs_hnd_t *h )
 {
-    h->library = avs_open;
+    h->library = avs_open();
     if( !h->library )
         return -1;
     LOAD_AVS_FUNC( avs_clip_get_error, 0 );
@@ -175,8 +175,9 @@
     FILE *fh = x264_fopen( psz_filename, "r" );
     if( !fh )
         return -1;
-    FAIL_IF_ERROR( !x264_is_regular_file( fh ), "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
+    int b_regular = x264_is_regular_file( fh );
     fclose( fh );
+    FAIL_IF_ERROR( !b_regular, "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
 
     avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
     if( !h )
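Making avs_open a function-like macro means it only expands where it is written as a call, avs_open(), and the matching void *library change lets one handle type cover both the dlopen() and LoadLibraryW() results. As a rough sketch of the underlying pattern on the dlfcn side (the library and symbol names below are placeholders, not anything x264 uses):

    #include <dlfcn.h>
    #include <stdio.h>

    /* Illustrative dynamic-loading sketch: open a shared object, resolve one
     * symbol, and hand the opaque handle back to the caller for dlclose(). */
    static void *open_plugin( void (**fn)(void) )
    {
        void *lib = dlopen( "libexample.so", RTLD_NOW );      /* placeholder name */
        if( !lib )
        {
            fprintf( stderr, "dlopen failed: %s\n", dlerror() );
            return NULL;
        }
        *fn = (void (*)(void))dlsym( lib, "example_entry" );  /* placeholder symbol */
        if( !*fn )
        {
            dlclose( lib );
            return NULL;
        }
        return lib;
    }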
View file
x264-snapshot-20141218-2245.tar.bz2/input/ffms.c -> x264-snapshot-20150804-2245.tar.bz2/input/ffms.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * ffms.c: ffmpegsource input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
  *          Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.c -> x264-snapshot-20150804-2245.tar.bz2/input/input.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * input.c: common input functions
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -33,6 +33,7 @@
     [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 },
     [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 },   { 1, 1, 1 }, 1, 1 },
     [X264_CSP_NV12] = { "nv12", 2, { 1, 1 },      { 1, .5 },   2, 2 },
+    [X264_CSP_NV21] = { "nv21", 2, { 1, 1 },      { 1, .5 },   2, 2 },
     [X264_CSP_NV16] = { "nv16", 2, { 1, 1 },      { 1, 1 },    2, 1 },
     [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },       1, 1 },
     [X264_CSP_BGRA] = { "bgra", 1, { 4 },         { 1 },       1, 1 },
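The table above lists, for every supported input colorspace, the plane count plus per-plane width and height factors relative to the frame size; the new nv21 row is identical to nv12 because only the chroma byte order differs. A hedged sketch of how such factors translate into plane sizes; the struct below is illustrative and not the actual x264 CLI type:

    #include <stdint.h>

    /* Illustrative stand-in for a colorspace table entry. */
    typedef struct
    {
        int   planes;
        float width_fix[4];   /* plane width  = frame width  * width_fix  */
        float height_fix[4];  /* plane height = frame height * height_fix */
    } csp_entry_t;

    /* For an NV12/NV21-like entry { 2, { 1, 1 }, { 1, .5 } } this yields a
     * full-size luma plane and a half-height interleaved chroma plane. */
    static uint64_t plane_bytes( const csp_entry_t *c, int p, int w, int h, int bytes_per_sample )
    {
        return (uint64_t)(w * c->width_fix[p]) * (uint64_t)(h * c->height_fix[p]) * bytes_per_sample;
    }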
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.h -> x264-snapshot-20150804-2245.tar.bz2/input/input.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * input.h: file input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/lavf.c -> x264-snapshot-20150804-2245.tar.bz2/input/lavf.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * lavf.c: libavformat input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
  *          Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/raw.c -> x264-snapshot-20150804-2245.tar.bz2/input/raw.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * raw.c: raw input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/thread.c -> x264-snapshot-20150804-2245.tar.bz2/input/thread.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * thread.c: threaded input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/timecode.c -> x264-snapshot-20150804-2245.tar.bz2/input/timecode.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * timecode.c: timecode file input
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/input/y4m.c -> x264-snapshot-20150804-2245.tar.bz2/input/y4m.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * y4m.c: y4m input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * flv.c: flv muxer ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Kieran Kunhya <kieran@kunhya.com> * @@ -75,21 +75,29 @@ static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { - *p_handle = NULL; flv_hnd_t *p_flv = calloc( 1, sizeof(flv_hnd_t) ); - if( !p_flv ) - return -1; - - p_flv->b_dts_compress = opt->use_dts_compress; - - p_flv->c = flv_create_writer( psz_filename ); - if( !p_flv->c ) - return -1; - - CHECK( write_header( p_flv->c ) ); - *p_handle = p_flv; + if( p_flv ) + { + flv_buffer *c = flv_create_writer( psz_filename ); + if( c ) + { + if( !write_header( c ) ) + { + p_flv->c = c; + p_flv->b_dts_compress = opt->use_dts_compress; + *p_handle = p_flv; + return 0; + } + + fclose( c->fp ); + free( c->data ); + free( c ); + } + free( p_flv ); + } - return 0; + *p_handle = NULL; + return -1; } static int set_param( hnd_t handle, x264_param_t *p_param ) @@ -293,15 +301,22 @@ return i_size; } -static void rewrite_amf_double( FILE *fp, uint64_t position, double value ) +static int rewrite_amf_double( FILE *fp, uint64_t position, double value ) { uint64_t x = endian_fix64( flv_dbl2int( value ) ); - fseek( fp, position, SEEK_SET ); - fwrite( &x, 8, 1, fp ); + return !fseek( fp, position, SEEK_SET ) && fwrite( &x, 8, 1, fp ) == 1 ? 0 : -1; } +#undef CHECK +#define CHECK(x)\ +do {\ + if( (x) < 0 )\ + goto error;\ +} while( 0 ) + static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { + int ret = -1; flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; @@ -317,19 +332,22 @@ if( p_flv->i_framerate_pos ) { framerate = (double)p_flv->i_framenum / total_duration; - rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ) ); } - rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ); - rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ); - rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ) ); } + ret = 0; +error: fclose( c->fp ); - free( p_flv ); + free( c->data ); free( c ); + free( p_flv ); - return 0; + return ret; } const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
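The open_file() rewrite above stops leaking the handle and the half-created writer when a later step fails, and close_file() now funnels the rewrite_amf_double() calls through a single error label so the file is still closed and freed on failure. A generic sketch of that cleanup idiom, with placeholder names rather than the flv types (a real context would of course keep the FILE pointer it opened):

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: acquire resources in order, release them in reverse
     * order on any failure, and report success exactly once at the end. */
    static int open_writer( const char *path, void **handle )
    {
        void *ctx = calloc( 1, 64 );                  /* placeholder context */
        if( ctx )
        {
            FILE *fp = fopen( path, "wb" );
            if( fp )
            {
                if( fwrite( "HDR", 1, 3, fp ) == 3 )  /* check the header write too */
                {
                    *handle = ctx;
                    return 0;
                }
                fclose( fp );
            }
            free( ctx );
        }
        *handle = NULL;
        return -1;
    }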
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * flv_bytestream.c: flv muxer utilities
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Kieran Kunhya <kieran@kunhya.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.h -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * flv_bytestream.h: flv muxer utilities
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Kieran Kunhya <kieran@kunhya.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * matroska.c: matroska muxer ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Mike Matsnev <mike@haali.su> * @@ -62,9 +62,14 @@ return 0; } +#define STEREO_COUNT 7 +static const uint8_t stereo_modes[STEREO_COUNT] = {5,9,7,1,3,13,0}; +static const uint8_t stereo_w_div[STEREO_COUNT] = {1,2,1,2,1,1,1}; +static const uint8_t stereo_h_div[STEREO_COUNT] = {1,1,2,1,2,1,1}; + static int set_param( hnd_t handle, x264_param_t *p_param ) { - mkv_hnd_t *p_mkv = handle; + mkv_hnd_t *p_mkv = handle; int64_t dw, dh; if( p_param->i_fps_num > 0 && !p_param->b_vfr_input ) @@ -77,25 +82,27 @@ p_mkv->frame_duration = 0; } - p_mkv->width = p_mkv->d_width = p_param->i_width; - p_mkv->height = p_mkv->d_height = p_param->i_height; + dw = p_mkv->width = p_param->i_width; + dh = p_mkv->height = p_param->i_height; p_mkv->display_size_units = DS_PIXELS; - p_mkv->stereo_mode = p_param->i_frame_packing; - + p_mkv->stereo_mode = -1; + if( p_param->i_frame_packing >= 0 && p_param->i_frame_packing < STEREO_COUNT ) + { + p_mkv->stereo_mode = stereo_modes[p_param->i_frame_packing]; + dw /= stereo_w_div[p_param->i_frame_packing]; + dh /= stereo_h_div[p_param->i_frame_packing]; + } if( p_param->vui.i_sar_width && p_param->vui.i_sar_height && p_param->vui.i_sar_width != p_param->vui.i_sar_height ) { if ( p_param->vui.i_sar_width > p_param->vui.i_sar_height ) { - dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width / p_param->vui.i_sar_height; - dh = p_param->i_height; + dw = dw * p_param->vui.i_sar_width / p_param->vui.i_sar_height; } else { - dw = p_param->i_width; - dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height / p_param->vui.i_sar_width; + dh = dh * p_param->vui.i_sar_height / p_param->vui.i_sar_width; } - - p_mkv->d_width = (int)dw; - p_mkv->d_height = (int)dh; } + p_mkv->d_width = (int)dw; + p_mkv->d_height = (int)dh; p_mkv->i_timebase_num = p_param->i_timebase_num; p_mkv->i_timebase_den = p_param->i_timebase_den; @@ -150,11 +157,11 @@ avcC, avcC_len, p_mkv->frame_duration, 50000, p_mkv->width, p_mkv->height, p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode ); + free( avcC ); + if( ret < 0 ) return ret; - free( avcC ); - // SEI if( !p_mkv->b_writing_frame )
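The three tables introduced here translate x264's --frame-packing values 0-6 into Matroska StereoMode codes and divide the display width or height for the packings that squeeze two views into one frame (column alternation and side-by-side halve the width, row alternation and top-bottom halve the height). A compact sketch of the same lookup, using exactly the values from the diff; the function wrapper itself is illustrative:

    /* Index: x264 --frame-packing value; result: Matroska StereoMode.
     * Values copied from the stereo_modes/stereo_w_div/stereo_h_div tables. */
    static const unsigned char mkv_stereo_mode[7] = { 5, 9, 7, 1, 3, 13, 0 };
    static const unsigned char mkv_width_div[7]   = { 1, 2, 1, 2, 1, 1, 1 };
    static const unsigned char mkv_height_div[7]  = { 1, 1, 2, 1, 2, 1, 1 };

    static int map_frame_packing( int frame_packing, long *dw, long *dh )
    {
        if( frame_packing < 0 || frame_packing >= 7 )
            return -1;                        /* unknown: write no StereoMode element */
        *dw /= mkv_width_div[frame_packing];
        *dh /= mkv_height_div[frame_packing];
        return mkv_stereo_mode[frame_packing];
    }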
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * matroska_ebml.c: matroska muxer utilities
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Mike Matsnev <mike@haali.su>
  *
@@ -317,8 +317,6 @@
     return w;
 }
 
-static const uint8_t mk_stereo_modes[6] = {5,9,7,1,3,13};
-
 int mk_write_header( mk_writer *w, const char *writing_app,
                      const char *codec_id,
                      const void *codec_private, unsigned codec_private_size,
@@ -342,7 +340,7 @@
     CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
     CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
     CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
-    CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion
+    CHECK( mk_write_uint( c, 0x4287, stereo_mode >= 0 ? 3 : 2 ) ); // DocTypeVersion
     CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadversion
     CHECK( mk_close_context( c, 0 ) );
 
@@ -381,8 +379,8 @@
     CHECK( mk_write_uint( v, 0x54b2, display_size_units ) );
     CHECK( mk_write_uint( v, 0x54b0, d_width ) );
     CHECK( mk_write_uint( v, 0x54ba, d_height ) );
-    if( stereo_mode >= 0 && stereo_mode <= 5 )
-        CHECK( mk_write_uint( v, 0x53b8, mk_stereo_modes[stereo_mode] ) );
+    if( stereo_mode >= 0 )
+        CHECK( mk_write_uint( v, 0x53b8, stereo_mode ) );
     CHECK( mk_close_context( v, 0 ) );
 
     CHECK( mk_close_context( ti, 0 ) );
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.h -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * matroska_ebml.h: matroska muxer utilities
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Mike Matsnev <mike@haali.su>
  *
@@ -27,10 +27,10 @@
 #define X264_MATROSKA_EBML_H
 
 /* Matroska display size units from the spec */
-#define DS_PIXELS 0
-#define DS_CM 1
-#define DS_INCHES 2
-#define DS_ASPECT_RATIO 3
+#define DS_PIXELS        0
+#define DS_CM            1
+#define DS_INCHES        2
+#define DS_ASPECT_RATIO  3
 
 typedef struct mk_writer mk_writer;
 
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mp4.c: mp4 muxer
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -169,8 +169,9 @@
     FILE *fh = x264_fopen( psz_filename, "w" );
     if( !fh )
         return -1;
-    FAIL_IF_ERR( !x264_is_regular_file( fh ), "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
+    int b_regular = x264_is_regular_file( fh );
     fclose( fh );
+    FAIL_IF_ERR( !b_regular, "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
 
     mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) );
     if( !p_mp4 )
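As in the avs.c hunk earlier, the point of the reorder is that fclose() now always runs before FAIL_IF_ERR can return. The x264_is_regular_file() helper itself is not shown in this revision; a plausible sketch of such a check on POSIX systems, offered as an assumption rather than the actual implementation, is:

    #include <stdio.h>
    #include <sys/stat.h>

    /* Hypothetical equivalent of x264_is_regular_file(): nonzero when the
     * open stream refers to a regular file rather than a pipe or device. */
    static int is_regular_file( FILE *fh )
    {
        struct stat st;
        return !fstat( fileno( fh ), &st ) && S_ISREG( st.st_mode );
    }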
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mp4_lsmash.c: mp4 muxer using L-SMASH
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/output.h -> x264-snapshot-20150804-2245.tar.bz2/output/output.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * output.h: x264 file output modules
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/raw.c -> x264-snapshot-20150804-2245.tar.bz2/output/raw.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * raw.c: raw muxer
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* checkasm-a.asm: assembly check tool
 ;*****************************************************************************
-;* Copyright (C) 2008-2014 x264 project
+;* Copyright (C) 2008-2015 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Henrik Gramner <henrik@gramner.com>
@@ -33,24 +33,24 @@
 %if ARCH_X86_64
 ; just random numbers to reduce the chance of incidental match
 ALIGN 16
-x6:  ddq 0x79445c159ce790641a1b2550a612b48c
-x7:  ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
-x8:  ddq 0x3f2bf84fc0fcca4eb0856806085e7943
-x9:  ddq 0xd229e1f5b281303facbd382dcf5b8de2
-x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9
-x11: ddq 0x77d410d5c42c882d89b0c0765892729a
-x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5
-x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
-x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
-x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
-n7:  dq 0x21f86d66c8ca00ce
-n8:  dq 0x75b6ba21077c48ad
-n9:  dq 0xed56bb2dcb3c7736
-n10: dq 0x8bda43d3fd1a7e06
-n11: dq 0xb64a9c9e5d318408
-n12: dq 0xdf9a54b303f1d3a3
-n13: dq 0x4a75479abd64e097
-n14: dq 0x249214109d5d1c88
+x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
+x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
+x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
+x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
+x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
+x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
+x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
+x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
+x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
+x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
+n7:  dq 0x21f86d66c8ca00ce
+n8:  dq 0x75b6ba21077c48ad
+n9:  dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
 %endif
 
 SECTION .text
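The constants are unchanged here: each 128-bit ddq literal is just split into two 64-bit dq words with the low half first, which is how the bytes sit in memory on little-endian x86 and avoids a directive that some assemblers reject. A small hedged C check that the first row round-trips to the original value:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main( void )
    {
        /* Low and high halves of the old x6 constant, as in the new dq lines. */
        uint64_t halves[2] = { 0x1a1b2550a612b48cULL, 0x79445c159ce79064ULL };
        uint8_t bytes[16];
        memcpy( bytes, halves, sizeof(bytes) );

        /* On a little-endian machine this prints
         * 79445c159ce790641a1b2550a612b48c, i.e. the original ddq literal. */
        for( int i = 15; i >= 0; i-- )
            printf( "%02x", bytes[i] );
        printf( "\n" );
        return 0;
    }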
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * checkasm.c: assembly check tool ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -97,6 +97,12 @@ asm volatile( "mftb %0" : "=r"(a) :: "memory" ); #elif ARCH_ARM // ARMv7 only asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); +#elif ARCH_AARCH64 + uint64_t b = 0; + asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" ); + a = b; +#elif ARCH_MIPS + asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" ); #endif return a; } @@ -167,12 +173,12 @@ continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX - b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : + b->cpu&X264_CPU_SSE42 ? "sse42" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : @@ -189,6 +195,8 @@ #elif ARCH_AARCH64 b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV8 ? "armv8" : +#elif ARCH_MIPS + b->cpu&X264_CPU_MSA ? "msa" : #endif "c", #if HAVE_MMX @@ -637,7 +645,7 @@ } \ predict_8x8[res_c>>16]( fdec1, edge ); \ int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \ - if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \ + if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ @@ -1409,6 +1417,32 @@ } } + if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap ) + { + set_func_name( "plane_copy_swap" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t src_stride = plane_specs[i].src_stride; + intptr_t dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h ); + call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) { set_func_name( "plane_copy_interleave" ); @@ -1496,7 +1530,7 @@ if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) { set_func_name( "plane_copy_deinterleave_v210" ); - used_asm = 1; + ok = 1; used_asm = 1; for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; @@ -1517,8 +1551,8 @@ break; } } + report( "v210 :" ); } - report( "v210 :" ); if( mc_a.hpel_filter != mc_ref.hpel_filter ) { @@ -2311,12 +2345,16 @@ {\ fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\ ok = 0;\ - for( int k = -1; k < 16; k++ )\ - printf( "%2x ", edge[16+k] );\ - printf( "\n" );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + {\ + for( int k = -1; k < 16; k++ )\ + printf( "%2x ", 
edge[16+k] );\ + printf( "\n" );\ + }\ for( int j = 0; j < h; j++ )\ {\ - printf( "%2x ", edge[14-j] );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( "%2x ", edge[14-j] );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2324,7 +2362,8 @@ printf( "\n" );\ for( int j = 0; j < h; j++ )\ {\ - printf( " " );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( " " );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2428,6 +2467,8 @@ DECL_CABAC(c) #if HAVE_MMX DECL_CABAC(asm) +#elif defined(ARCH_AARCH64) +DECL_CABAC(asm) #else #define run_cabac_decision_asm run_cabac_decision_c #define run_cabac_bypass_asm run_cabac_bypass_c @@ -2646,7 +2687,7 @@ #endif if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); @@ -2664,11 +2705,11 @@ cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - } - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2688,9 +2729,16 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + if( cpu_detect & X264_CPU_SSE42 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" ); if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( cpu_detect & X264_CPU_XOP ) @@ -2700,30 +2748,30 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( cpu_detect & X264_CPU_BMI1 ) + if( cpu_detect & X264_CPU_FMA3 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); + cpu1 &= ~X264_CPU_FMA3; } if( cpu_detect & X264_CPU_AVX2 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } } + if( cpu_detect & X264_CPU_BMI1 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); + cpu1 &= ~X264_CPU_BMI1; + } if( cpu_detect & X264_CPU_BMI2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); } - if( cpu_detect & X264_CPU_FMA3 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { @@ -2742,6 +2790,9 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" ); if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); +#elif ARCH_MIPS + if( cpu_detect & X264_CPU_MSA ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, 
"MSA" ); #endif return ret; } @@ -2752,7 +2803,7 @@ if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) ) { -#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM +#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" ); return 1; #endif
View file
x264-snapshot-20150804-2245.tar.bz2/tools/gas-preprocessor.pl
Added
@@ -0,0 +1,1033 @@ +#!/usr/bin/env perl +# by David Conrad +# This code is licensed under GPLv2 or later; go to gnu.org to read it +# (not that it much matters for an asm preprocessor) +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" +use strict; + +# Apple's gas is ancient and doesn't support modern preprocessing features like +# .rept and has ugly macro syntax, among other things. Thus, this script +# implements the subset of the gas preprocessor used by x264 and ffmpeg +# that isn't supported by Apple's gas. + +my %canonical_arch = ("aarch64" => "aarch64", "arm64" => "aarch64", + "arm" => "arm", + "powerpc" => "powerpc", "ppc" => "powerpc"); + +my %comments = ("aarch64" => '//', + "arm" => '@', + "powerpc" => '#'); + +my @gcc_cmd; +my @preprocess_c_cmd; + +my $comm; +my $arch; +my $as_type = "apple-gas"; + +my $fix_unreq = $^O eq "darwin"; +my $force_thumb = 0; + +my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo"; + +my $usage_str = " +$0\n +Gas-preprocessor.pl converts assembler files using modern GNU as syntax for +Apple's ancient gas version or clang's incompatible integrated assembler. The +conversion is regularly tested for Libav, x264 and vlc. Other projects might +use different features which are not correctly handled. + +Options for this program needs to be separated with ' -- ' from the assembler +command. Following options are currently supported: + + -help - this usage text + -arch - target architecture + -as-type - one value out of {{,apple-}{gas,clang},armasm} + -fix-unreq + -no-fix-unreq + -force-thumb - assemble as thumb regardless of the input source + (note, this is incomplete and only works for sources + it explicitly was tested with) +"; + +sub usage() { + print $usage_str; +} + +while (@ARGV) { + my $opt = shift; + + if ($opt =~ /^-(no-)?fix-unreq$/) { + $fix_unreq = $1 ne "no-"; + } elsif ($opt eq "-force-thumb") { + $force_thumb = 1; + } elsif ($opt eq "-arch") { + $arch = shift; + die "unknown arch: '$arch'\n" if not exists $comments{$arch}; + } elsif ($opt eq "-as-type") { + $as_type = shift; + die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/; + } elsif ($opt eq "-help") { + usage(); + exit 0; + } elsif ($opt eq "--" ) { + @gcc_cmd = @ARGV; + } elsif ($opt =~ /^-/) { + die "option '$opt' is not known. See '$0 -help' for usage information\n"; + } else { + push @gcc_cmd, $opt, @ARGV; + } + last if (@gcc_cmd); +} + +if (grep /\.c$/, @gcc_cmd) { + # C file (inline asm?) - compile + @preprocess_c_cmd = (@gcc_cmd, "-S"); +} elsif (grep /\.[sS]$/, @gcc_cmd) { + # asm file, just do C preprocessor + @preprocess_c_cmd = (@gcc_cmd, "-E"); +} elsif (grep /-(v|h|-version|dumpversion)/, @gcc_cmd) { + # pass -v/--version along, used during probing. Matching '-v' might have + # uninteded results but it doesn't matter much if gas-preprocessor or + # the compiler fails. + exec(@gcc_cmd); +} else { + die "Unrecognized input filetype"; +} +if ($as_type eq "armasm") { + + $preprocess_c_cmd[0] = "cpp"; + push(@preprocess_c_cmd, "-U__ELF__"); + push(@preprocess_c_cmd, "-U__MACH__"); + + @preprocess_c_cmd = grep ! 
/^-nologo$/, @preprocess_c_cmd; + # Remove -ignore XX parameter pairs from preprocess_c_cmd + my $index = 1; + while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-ignore" and $index + 1 < $#preprocess_c_cmd) { + splice(@preprocess_c_cmd, $index, 2); + next; + } + $index++; + } + if (grep /^-MM$/, @preprocess_c_cmd) { + system(@preprocess_c_cmd) == 0 or die "Error running preprocessor"; + exit 0; + } +} + +# if compiling, avoid creating an output file named '-.o' +if ((grep /^-c$/, @gcc_cmd) && !(grep /^-o/, @gcc_cmd)) { + foreach my $i (@gcc_cmd) { + if ($i =~ /\.[csS]$/) { + my $outputfile = $i; + $outputfile =~ s/\.[csS]$/.o/; + push(@gcc_cmd, "-o"); + push(@gcc_cmd, $outputfile); + last; + } + } +} +# replace only the '-o' argument with '-', avoids rewriting the make dependency +# target specified with -MT to '-' +my $index = 1; +while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-o") { + $index++; + $preprocess_c_cmd[$index] = "-"; + } + $index++; +} + +my $tempfile; +if ($as_type ne "armasm") { + @gcc_cmd = map { /\.[csS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; +} else { + @preprocess_c_cmd = grep ! /^-c$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-m/, @preprocess_c_cmd; + + @preprocess_c_cmd = grep ! /^-G/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-W/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-Z/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd; + + @gcc_cmd = grep ! /^-G/, @gcc_cmd; + @gcc_cmd = grep ! /^-W/, @gcc_cmd; + @gcc_cmd = grep ! /^-Z/, @gcc_cmd; + @gcc_cmd = grep ! /^-fp/, @gcc_cmd; + @gcc_cmd = grep ! /^-EHsc$/, @gcc_cmd; + @gcc_cmd = grep ! /^-O/, @gcc_cmd; + + my @outfiles = grep /\.(o|obj)$/, @gcc_cmd; + $tempfile = $outfiles[0].".asm"; + + # Remove most parameters from gcc_cmd, which actually is the armasm command, + # which doesn't support any of the common compiler/preprocessor options. + @gcc_cmd = grep ! /^-D/, @gcc_cmd; + @gcc_cmd = grep ! /^-U/, @gcc_cmd; + @gcc_cmd = grep ! /^-m/, @gcc_cmd; + @gcc_cmd = grep ! /^-M/, @gcc_cmd; + @gcc_cmd = grep ! /^-c$/, @gcc_cmd; + @gcc_cmd = grep ! /^-I/, @gcc_cmd; + @gcc_cmd = map { /\.S$/ ? $tempfile : $_ } @gcc_cmd; +} + +# detect architecture from gcc binary name +if (!$arch) { + if ($gcc_cmd[0] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } else { + # look for -arch flag + foreach my $i (1 .. 
$#gcc_cmd-1) { + if ($gcc_cmd[$i] eq "-arch" and + $gcc_cmd[$i+1] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } + } + } +} + +# assume we're not cross-compiling if no -arch or the binary doesn't have the arch name +$arch = qx/arch/ if (!$arch); + +die "Unknown target architecture '$arch'" if not exists $canonical_arch{$arch}; + +$arch = $canonical_arch{$arch}; +$comm = $comments{$arch}; +my $inputcomm = $comm; +$comm = ";" if $as_type =~ /armasm/; + +my %ppc_spr = (ctr => 9, + vrsave => 256); + +open(INPUT, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; + +if ($ENV{GASPP_DEBUG}) { + open(ASMFILE, ">&STDOUT"); +} else { + if ($as_type ne "armasm") { + open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; + } else { + open(ASMFILE, ">", $tempfile); + } +} + +my $current_macro = ''; +my $macro_level = 0; +my $rept_level = 0; +my %macro_lines; +my %macro_args; +my %macro_args_default; +my $macro_count = 0; +my $altmacro = 0; +my $in_irp = 0; + +my $num_repts; +my @rept_lines; + +my @irp_args; +my $irp_param; + +my @ifstack; + +my %symbols; + +my @sections; + +my %literal_labels; # for ldr <reg>, =<expr> +my $literal_num = 0; +my $literal_expr = ".word"; +$literal_expr = ".quad" if $arch eq "aarch64"; + +my $thumb = 0; + +my %thumb_labels; +my %call_targets; +my %mov32_targets; + +my %neon_alias_reg; +my %neon_alias_type; + +my $temp_label_next = 0; +my %last_temp_labels; +my %next_temp_labels; + +my %labels_seen; + +my %aarch64_req_alias; + +if ($force_thumb) { + parse_line(".thumb\n"); +} + +# pass 1: parse .macro +# note that the handling of arguments is probably overly permissive vs. gas +# but it should be the same for valid cases +while (<INPUT>) { + # remove lines starting with '#', preprocessing is done, '#' at start of + # the line indicates a comment for all supported archs (aarch64, arm, ppc + # and x86). Also strips line number comments but since they are off anyway + # it is no loss. + s/^#.*$//; + # remove all comments (to avoid interfering with evaluating directives) + s/(?<!\\)$inputcomm.*//x; + # Strip out windows linefeeds + s/\r$//; + + foreach my $subline (split(";", $_)) { + # Add newlines at the end of lines that don't already have one + chomp $subline; + $subline .= "\n"; + parse_line($subline); + } +} + +sub eval_expr { + my $expr = $_[0]; + while ($expr =~ /([A-Za-z._][A-Za-z0-9._]*)/g) { + my $sym = $1; + $expr =~ s/$sym/($symbols{$sym})/ if defined $symbols{$sym}; + } + eval $expr; +} + +sub handle_if { + my $line = $_[0]; + # handle .if directives; apple's assembler doesn't support important non-basic ones + # evaluating them is also needed to handle recursive macros + if ($line =~ /\.if(n?)([a-z]*)\s+(.*)/) { + my $result = $1 eq "n"; + my $type = $2; + my $expr = $3; + + if ($type eq "b") { + $expr =~ s/\s//g; + $result ^= $expr eq ""; + } elsif ($type eq "c") { + if ($expr =~ /(.*)\s*,\s*(.*)/) { + $result ^= $1 eq $2; + } else { + die "argument to .ifc not recognized"; + } + } elsif ($type eq "") { + $result ^= eval_expr($expr) != 0; + } elsif ($type eq "eq") { + $result = eval_expr($expr) == 0; + } elsif ($type eq "lt") { + $result = eval_expr($expr) < 0; + } else { + chomp($line); + die "unhandled .if varient. 
\"$line\""; + } + push (@ifstack, $result); + return 1; + } else { + return 0; + } +} + +sub parse_if_line { + my $line = $_[0]; + + # evaluate .if blocks + if (scalar(@ifstack)) { + # Don't evaluate any new if statements if we're within + # a repetition or macro - they will be evaluated once + # the repetition is unrolled or the macro is expanded. + if (scalar(@rept_lines) == 0 and $macro_level == 0) { + if ($line =~ /\.endif/) { + pop(@ifstack); + return 1; + } elsif ($line =~ /\.elseif\s+(.*)/) { + if ($ifstack[-1] == 0) { + $ifstack[-1] = !!eval_expr($1); + } elsif ($ifstack[-1] > 0) { + $ifstack[-1] = -$ifstack[-1]; + } + return 1; + } elsif ($line =~ /\.else/) { + $ifstack[-1] = !$ifstack[-1]; + return 1; + } elsif (handle_if($line)) { + return 1; + } + } + + # discard lines in false .if blocks + foreach my $i (0 .. $#ifstack) { + if ($ifstack[$i] <= 0) { + return 1; + } + } + } + return 0; +} + +sub parse_line { + my $line = $_[0]; + + return if (parse_if_line($line)); + + if (scalar(@rept_lines) == 0) { + if (/\.macro/) { + $macro_level++; + if ($macro_level > 1 && !$current_macro) { + die "nested macros but we don't have master macro"; + } + } elsif (/\.endm/) { + $macro_level--; + if ($macro_level < 0) { + die "unmatched .endm"; + } elsif ($macro_level == 0) { + $current_macro = ''; + return; + } + } + } + + if ($macro_level == 0) { + if ($line =~ /\.(rept|irp)/) { + $rept_level++; + } elsif ($line =~ /.endr/) { + $rept_level--; + } + } + + if ($macro_level > 1) { + push(@{$macro_lines{$current_macro}}, $line); + } elsif (scalar(@rept_lines) and $rept_level >= 1) { + push(@rept_lines, $line); + } elsif ($macro_level == 0) { + expand_macros($line); + } else { + if ($line =~ /\.macro\s+([\d\w\.]+)\s*,?\s*(.*)/) { + $current_macro = $1; + + # commas in the argument list are optional, so only use whitespace as the separator + my $arglist = $2; + $arglist =~ s/,/ /g; + + my @args = split(/\s+/, $arglist); + foreach my $i (0 .. 
$#args) { + my @argpair = split(/=/, $args[$i]); + $macro_args{$current_macro}[$i] = $argpair[0]; + $argpair[0] =~ s/:vararg$//; + $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; + } + # ensure %macro_lines has the macro name added as a key + $macro_lines{$current_macro} = []; + + } elsif ($current_macro) { + push(@{$macro_lines{$current_macro}}, $line); + } else { + die "macro level without a macro name"; + } + } +} + +sub handle_set { + my $line = $_[0]; + if ($line =~ /\.set\s+(.*),\s*(.*)/) { + $symbols{$1} = eval_expr($2); + return 1; + } + return 0; +} + +sub expand_macros { + my $line = $_[0]; + + # handle .if directives; apple's assembler doesn't support important non-basic ones + # evaluating them is also needed to handle recursive macros + if (handle_if($line)) { + return; + } + + if (/\.purgem\s+([\d\w\.]+)/) { + delete $macro_lines{$1}; + delete $macro_args{$1}; + delete $macro_args_default{$1}; + return; + } + + if ($line =~ /\.altmacro/) { + $altmacro = 1; + return; + } + + if ($line =~ /\.noaltmacro/) { + $altmacro = 0; + return; + } + + $line =~ s/\%([^,]*)/eval_expr($1)/eg if $altmacro; + + # Strip out the .set lines from the armasm output + return if (handle_set($line) and $as_type eq "armasm"); + + if ($line =~ /\.rept\s+(.*)/) { + $num_repts = $1; + @rept_lines = ("\n"); + + # handle the possibility of repeating another directive on the same line + # .endr on the same line is not valid, I don't know if a non-directive is + if ($num_repts =~ s/(\.\w+.*)//) { + push(@rept_lines, "$1\n"); + } + $num_repts = eval_expr($num_repts); + } elsif ($line =~ /\.irp\s+([\d\w\.]+)\s*(.*)/) { + $in_irp = 1; + $num_repts = 1; + @rept_lines = ("\n"); + $irp_param = $1; + + # only use whitespace as the separator + my $irp_arglist = $2; + $irp_arglist =~ s/,/ /g; + $irp_arglist =~ s/^\s+//; + @irp_args = split(/\s+/, $irp_arglist); + } elsif ($line =~ /\.irpc\s+([\d\w\.]+)\s*(.*)/) { + $in_irp = 1; + $num_repts = 1; + @rept_lines = ("\n"); + $irp_param = $1; + + my $irp_arglist = $2; + $irp_arglist =~ s/,/ /g; + $irp_arglist =~ s/^\s+//; + @irp_args = split(//, $irp_arglist); + } elsif ($line =~ /\.endr/) { + my @prev_rept_lines = @rept_lines; + my $prev_in_irp = $in_irp; + my @prev_irp_args = @irp_args; + my $prev_irp_param = $irp_param; + my $prev_num_repts = $num_repts; + @rept_lines = (); + $in_irp = 0; + @irp_args = ''; + + if ($prev_in_irp != 0) { + foreach my $i (@prev_irp_args) { + foreach my $origline (@prev_rept_lines) { + my $line = $origline; + $line =~ s/\\$prev_irp_param/$i/g; + $line =~ s/\\\(\)//g; # remove \() + parse_line($line); + } + } + } else { + for (1 .. $prev_num_repts) { + foreach my $origline (@prev_rept_lines) { + my $line = $origline; + parse_line($line); + } + } + } + } elsif ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { + handle_serialized_line($1); + my $macro = $2; + + # commas are optional here too, but are syntactically important because + # parameters can be blank + my @arglist = split(/,/, $3); + my @args; + my @args_seperator; + + my $comma_sep_required = 0; + foreach (@arglist) { + # allow arithmetic/shift operators in macro arguments + $_ =~ s/\s*(\+|-|\*|\/|<<|>>|<|>)\s*/$1/g; + + my @whitespace_split = split(/\s+/, $_); + if (!@whitespace_split) { + push(@args, ''); + push(@args_seperator, ''); + } else { + foreach (@whitespace_split) { + #print ("arglist = \"$_\"\n"); + if (length($_)) { + push(@args, $_); + my $sep = $comma_sep_required ? 
"," : " "; + push(@args_seperator, $sep); + #print ("sep = \"$sep\", arg = \"$_\"\n"); + $comma_sep_required = 0; + } + } + } + + $comma_sep_required = 1; + } + + my %replacements; + if ($macro_args_default{$macro}){ + %replacements = %{$macro_args_default{$macro}}; + } + + # construct hashtable of text to replace + foreach my $i (0 .. $#args) { + my $argname = $macro_args{$macro}[$i]; + my @macro_args = @{ $macro_args{$macro} }; + if ($args[$i] =~ m/=/) { + # arg=val references the argument name + # XXX: I'm not sure what the expected behaviour if a lot of + # these are mixed with unnamed args + my @named_arg = split(/=/, $args[$i]); + $replacements{$named_arg[0]} = $named_arg[1]; + } elsif ($i > $#{$macro_args{$macro}}) { + # more args given than the macro has named args + # XXX: is vararg allowed on arguments before the last? + $argname = $macro_args{$macro}[-1]; + if ($argname =~ s/:vararg$//) { + #print "macro = $macro, args[$i] = $args[$i], args_seperator=@args_seperator, argname = $argname, arglist[$i] = $arglist[$i], arglist = @arglist, args=@args, macro_args=@macro_args\n"; + #$replacements{$argname} .= ", $args[$i]"; + $replacements{$argname} .= "$args_seperator[$i] $args[$i]"; + } else { + die "Too many arguments to macro $macro"; + } + } else { + $argname =~ s/:vararg$//; + $replacements{$argname} = $args[$i]; + } + } + + my $count = $macro_count++; + + # apply replacements as regex + foreach (@{$macro_lines{$macro}}) { + my $macro_line = $_; + # do replacements by longest first, this avoids wrong replacement + # when argument names are subsets of each other + foreach (reverse sort {length $a <=> length $b} keys %replacements) { + $macro_line =~ s/\\$_/$replacements{$_}/g; + } + if ($altmacro) { + foreach (reverse sort {length $a <=> length $b} keys %replacements) { + $macro_line =~ s/\b$_\b/$replacements{$_}/g; + } + } + $macro_line =~ s/\\\@/$count/g; + $macro_line =~ s/\\\(\)//g; # remove \() + parse_line($macro_line); + } + } else { + handle_serialized_line($line); + } +} + +sub is_arm_register { + my $name = $_[0]; + if ($name eq "lr" or + $name eq "ip" or + $name =~ /^[rav]\d+$/) { + return 1; + } + return 0; +} + +sub handle_local_label { + my $line = $_[0]; + my $num = $_[1]; + my $dir = $_[2]; + my $target = "$num$dir"; + if ($dir eq "b") { + $line =~ s/$target/$last_temp_labels{$num}/g; + } else { + my $name = "temp_label_$temp_label_next"; + $temp_label_next++; + push(@{$next_temp_labels{$num}}, $name); + $line =~ s/$target/$name/g; + } + return $line; +} + +sub handle_serialized_line { + my $line = $_[0]; + + # handle .previous (only with regard to .section not .subsection) + if ($line =~ /\.(section|text|const_data)/) { + push(@sections, $line); + } elsif ($line =~ /\.previous/) { + if (!$sections[-2]) { + die ".previous without a previous section"; + } + $line = $sections[-2]; + push(@sections, $line); + } + + $thumb = 1 if $line =~ /\.code\s+16|\.thumb/; + $thumb = 0 if $line =~ /\.code\s+32|\.arm/; + + # handle ldr <reg>, =<expr> + if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/ and $as_type ne "armasm") { + my $label = $literal_labels{$3}; + if (!$label) { + $label = "Literal_$literal_num"; + $literal_num++; + $literal_labels{$3} = $label; + } + $line = "$1 ldr$2, $label\n"; + } elsif ($line =~ /\.ltorg/ and $as_type ne "armasm") { + $line .= ".align 2\n"; + foreach my $literal (keys %literal_labels) { + $line .= "$literal_labels{$literal}:\n $literal_expr $literal\n"; + } + %literal_labels = (); + } + + # handle GNU as pc-relative relocations for adrp/add + 
if ($line =~ /(.*)\s*adrp([\w\s\d]+)\s*,\s*#?:pg_hi21:([^\s]+)/) { + $line = "$1 adrp$2, ${3}\@PAGE\n"; + } elsif ($line =~ /(.*)\s*add([\w\s\d]+)\s*,([\w\s\d]+)\s*,\s*#?:lo12:([^\s]+)/) { + $line = "$1 add$2, $3, ${4}\@PAGEOFF\n"; + } + + # thumb add with large immediate needs explicit add.w + if ($thumb and $line =~ /add\s+.*#([^@]+)/) { + $line =~ s/add/add.w/ if eval_expr($1) > 255; + } + + # mach-o local symbol names start with L (no dot) + $line =~ s/(?<!\w)\.(L\w+)/$1/g; + + # recycle the '.func' directive for '.thumb_func' + if ($thumb and $as_type =~ /^apple-/) { + $line =~ s/\.func/.thumb_func/x; + } + + if ($thumb and $line =~ /^\s*(\w+)\s*:/) { + $thumb_labels{$1}++; + } + + if ($as_type =~ /^apple-/ and + $line =~ /^\s*((\w+\s*:\s*)?bl?x?(..)?(?:\.w)?|\.global)\s+(\w+)/) { + my $cond = $3; + my $label = $4; + # Don't interpret e.g. bic as b<cc> with ic as conditional code + if ($cond =~ /|$arm_cond_codes/) { + if (exists $thumb_labels{$label}) { + print ASMFILE ".thumb_func $label\n"; + } else { + $call_targets{$label}++; + } + } + } + + # @l -> lo16() @ha -> ha16() + $line =~ s/,\s+([^,]+)\@l\b/, lo16($1)/g; + $line =~ s/,\s+([^,]+)\@ha\b/, ha16($1)/g; + + # move to/from SPR + if ($line =~ /(\s+)(m[ft])([a-z]+)\s+(\w+)/ and exists $ppc_spr{$3}) { + if ($2 eq 'mt') { + $line = "$1${2}spr $ppc_spr{$3}, $4\n"; + } else { + $line = "$1${2}spr $4, $ppc_spr{$3}\n"; + } + } + + if ($line =~ /\.unreq\s+(.*)/) { + if (defined $neon_alias_reg{$1}) { + delete $neon_alias_reg{$1}; + delete $neon_alias_type{$1}; + return; + } elsif (defined $aarch64_req_alias{$1}) { + delete $aarch64_req_alias{$1}; + return; + } + } + # old gas versions store upper and lower case names on .req, + # but they remove only one on .unreq + if ($fix_unreq) { + if ($line =~ /\.unreq\s+(.*)/) { + $line = ".unreq " . lc($1) . "\n"; + $line .= ".unreq " . uc($1) . "\n"; + } + } + + if ($line =~ /(\w+)\s+\.(dn|qn)\s+(\w+)(?:\.(\w+))?(\[\d+\])?/) { + $neon_alias_reg{$1} = "$3$5"; + $neon_alias_type{$1} = $4; + return; + } + if (scalar keys %neon_alias_reg > 0 && $line =~ /^\s+v\w+/) { + # This line seems to possibly have a neon instruction + foreach (keys %neon_alias_reg) { + my $alias = $_; + # Require the register alias to match as an invididual word, not as a substring + # of a larger word-token. + if ($line =~ /\b$alias\b/) { + $line =~ s/\b$alias\b/$neon_alias_reg{$alias}/g; + # Add the type suffix. If multiple aliases match on the same line, + # only do this replacement the first time (a vfoo.bar string won't match v\w+). 
+ $line =~ s/^(\s+)(v\w+)(\s+)/$1$2.$neon_alias_type{$alias}$3/; + } + } + } + + if ($arch eq "aarch64" or $as_type eq "armasm") { + # clang's integrated aarch64 assembler in Xcode 5 does not support .req/.unreq + if ($line =~ /\b(\w+)\s+\.req\s+(\w+)\b/) { + $aarch64_req_alias{$1} = $2; + return; + } + foreach (keys %aarch64_req_alias) { + my $alias = $_; + # recursively resolve aliases + my $resolved = $aarch64_req_alias{$alias}; + while (defined $aarch64_req_alias{$resolved}) { + $resolved = $aarch64_req_alias{$resolved}; + } + $line =~ s/\b$alias\b/$resolved/g; + } + } + if ($arch eq "aarch64") { + # fix missing aarch64 instructions in Xcode 5.1 (beta3) + # mov with vector arguments is not supported, use alias orr instead + if ($line =~ /^\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) { + $line = " orr $1, $2, $2\n"; + } + # movi 16, 32 bit shifted variant, shift is optional + if ($line =~ /^\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) { + $line = " movi $1, $2, lsl #0\n"; + } + # Xcode 5 misses the alias uxtl. Replace it with the more general ushll. + # Clang 3.4 misses the alias sxtl too. Replace it with the more general sshll. + if ($line =~ /^\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) { + $line = " $1shll$2 $3, $4, #0\n"; + } + # clang 3.4 does not automatically use shifted immediates in add/sub + if ($as_type eq "clang" and + $line =~ /^(\s*(?:add|sub)s?) ([^#l]+)#([\d\+\-\*\/ <>]+)\s*$/) { + my $imm = eval $3; + if ($imm > 4095 and not ($imm & 4095)) { + $line = "$1 $2#" . ($imm >> 12) . ", lsl #12\n"; + } + } + if ($ENV{GASPP_FIX_XCODE5}) { + if ($line =~ /^\s*bsl\b/) { + $line =~ s/\b(bsl)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; + $line =~ s/\b(v[0-3]?\d)\.$3\b/$1/g; + } + if ($line =~ /^\s*saddl2?\b/) { + $line =~ s/\b(saddl2?)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; + $line =~ s/\b(v[0-3]?\d)\.\w+\b/$1/g; + } + if ($line =~ /^\s*dup\b.*\]$/) { + $line =~ s/\bdup(\s+v[0-3]?\d)\.(\w+)\b/dup.$2$1/g; + $line =~ s/\b(v[0-3]?\d)\.[bhsdBHSD](\[\d\])$/$1$2/g; + } + } + } + + if ($as_type eq "armasm") { + # Also replace variables set by .set + foreach (keys %symbols) { + my $sym = $_; + $line =~ s/\b$sym\b/$symbols{$sym}/g; + } + + # Handle function declarations and keep track of the declared labels + if ($line =~ s/^\s*\.func\s+(\w+)/$1 PROC/) { + $labels_seen{$1} = 1; + } + + if ($line =~ s/^\s*(\d+)://) { + # Convert local labels into unique labels. armasm (at least in + # RVCT) has something similar, but still different enough. + # By converting to unique labels we avoid any possible + # incompatibilities. + + my $num = $1; + foreach (@{$next_temp_labels{$num}}) { + $line = "$_\n" . $line; + } + @next_temp_labels{$num} = (); + my $name = "temp_label_$temp_label_next"; + $temp_label_next++; + # The matching regexp above removes the label from the start of + # the line (which might contain an instruction as well), readd + # it on a separate line above it. + $line = "$name:\n" . $line; + $last_temp_labels{$num} = $name; + } + + if ($line =~ s/^(\w+):/$1/) { + # Skip labels that have already been declared with a PROC, + # labels must not be declared multiple times. + return if (defined $labels_seen{$1}); + $labels_seen{$1} = 1; + } elsif ($line !~ /(\w+) PROC/) { + # If not a label, make sure the line starts with whitespace, + # otherwise ms armasm interprets it incorrectly. 
+ $line =~ s/^[\.\w]/\t$&/; + } + + + # Check branch instructions + if ($line =~ /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?(..)?(\.w)?)\s+(\w+)/) { + my $instr = $2; + my $cond = $3; + my $width = $4; + my $target = $5; + # Don't interpret e.g. bic as b<cc> with ic as conditional code + if ($cond !~ /|$arm_cond_codes/) { + # Not actually a branch + } elsif ($target =~ /(\d+)([bf])/) { + # The target is a local label + $line = handle_local_label($line, $1, $2); + $line =~ s/\b$instr\b/$&.w/ if $width eq ""; + } elsif (!is_arm_register($target)) { + $call_targets{$target}++; + } + } elsif ($line =~ /^\s*.h?word.*\b\d+[bf]\b/) { + while ($line =~ /\b(\d+)([bf])\b/g) { + $line = handle_local_label($line, $1, $2); + } + } + + # ALIGN in armasm syntax is the actual number of bytes + if ($line =~ /\.align\s+(\d+)/) { + my $align = 1 << $1; + $line =~ s/\.align\s(\d+)/ALIGN $align/; + } + # Convert gas style [r0, :128] into armasm [r0@128] alignment specification + $line =~ s/\[([^\[]+),\s*:(\d+)\]/[$1\@$2]/g; + + # armasm treats logical values {TRUE} and {FALSE} separately from + # numeric values - logical operators and values can't be intermixed + # with numerical values. Evaluate !<number> and (a <> b) into numbers, + # let the assembler evaluate the rest of the expressions. This current + # only works for cases when ! and <> are used with actual constant numbers, + # we don't evaluate subexpressions here. + + # Evaluate !<number> + while ($line =~ /!\s*(\d+)/g) { + my $val = ($1 != 0) ? 0 : 1; + $line =~ s/!(\d+)/$val/; + } + # Evaluate (a > b) + while ($line =~ /\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/) { + my $val; + if ($2 eq "<") { + $val = ($1 < $3) ? 1 : 0; + } else { + $val = ($1 > $3) ? 1 : 0; + } + $line =~ s/\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/$val/; + } + + # Change a movw... #:lower16: into a mov32 pseudoinstruction + $line =~ s/^(\s*)movw(\s+\w+\s*,\s*)\#:lower16:(.*)$/$1mov32$2$3/; + # and remove the following, matching movt completely + $line =~ s/^\s*movt\s+\w+\s*,\s*\#:upper16:.*$//; + + if ($line =~ /^\s*mov32\s+\w+,\s*([a-zA-Z]\w*)/) { + $mov32_targets{$1}++; + } + + # Misc bugs/deficiencies: + # armasm seems unable to parse e.g. "vmov s0, s1" without a type + # qualifier, thus add .f32. + $line =~ s/^(\s+(?:vmov|vadd))(\s+s)/$1.f32$2/; + # armasm is unable to parse &0x - add spacing + $line =~ s/&0x/& 0x/g; + } + + if ($force_thumb) { + # Convert register post indexing to a separate add instruction. + # This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]", + # "add r1, r1, r2". + $line =~ s/(ldr|str)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g; + + # Convert "mov pc, lr" into "bx lr", since the former only works + # for switching from arm to thumb (and only in armv7), but not + # from thumb to arm. 
+ s/mov\s*pc\s*,\s*lr/bx lr/g; + + # Convert stmdb/ldmia with only one register into a plain str/ldr with post-increment/decrement + $line =~ s/stmdb\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g; + $line =~ s/ldmia\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g; + + $line =~ s/\.arm/.thumb/x; + } + + # comment out unsupported directives + $line =~ s/\.type/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.func/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.endfunc/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.endfunc/ENDP/x if $as_type =~ /armasm/; + $line =~ s/\.ltorg/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.ltorg/LTORG/x if $as_type eq "armasm"; + $line =~ s/\.size/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.fpu/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.arch/$comm$&/x if $as_type =~ /^(apple-|clang|armasm)/; + $line =~ s/\.object_arch/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/.section\s+.note.GNU-stack.*/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + + $line =~ s/\.syntax/$comm$&/x if $as_type =~ /armasm/; + + $line =~ s/\.hword/.short/x; + + if ($as_type =~ /^apple-/) { + # the syntax for these is a little different + $line =~ s/\.global/.globl/x; + # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const + $line =~ s/(.*)\.rodata/.const_data/x; + $line =~ s/\.int/.long/x; + $line =~ s/\.float/.single/x; + } + if ($as_type eq "armasm") { + $line =~ s/\.global/EXPORT/x; + $line =~ s/\.int/dcd/x; + $line =~ s/\.long/dcd/x; + $line =~ s/\.float/dcfs/x; + $line =~ s/\.word/dcd/x; + $line =~ s/\.short/dcw/x; + $line =~ s/\.byte/dcb/x; + $line =~ s/\.thumb/THUMB/x; + $line =~ s/\.arm/ARM/x; + # The alignment in AREA is the power of two, just as .align in gas + $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=2, CODEALIGN/; + $line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, ALIGN=5/; + + $line =~ s/fmxr/vmsr/; + $line =~ s/fmrx/vmrs/; + $line =~ s/fadds/vadd.f32/; + } + + # catch unknown section names that aren't mach-o style (with a comma) + if ($as_type =~ /apple-/ and $line =~ /.section ([^,]*)$/) { + die ".section $1 unsupported; figure out the mach-o section name and add it"; + } + + print ASMFILE $line; +} + +if ($as_type ne "armasm") { + print ASMFILE ".text\n"; + print ASMFILE ".align 2\n"; + foreach my $literal (keys %literal_labels) { + print ASMFILE "$literal_labels{$literal}:\n $literal_expr $literal\n"; + } + + map print(ASMFILE ".thumb_func $_\n"), + grep exists $thumb_labels{$_}, keys %call_targets; +} else { + map print(ASMFILE "\tIMPORT $_\n"), + grep ! exists $labels_seen{$_}, (keys %call_targets, keys %mov32_targets); + + print ASMFILE "\tEND\n"; +} + +close(INPUT) or exit 1; +close(ASMFILE) or exit 1; +if ($as_type eq "armasm" and ! defined $ENV{GASPP_DEBUG}) { + system(@gcc_cmd) == 0 or die "Error running assembler"; +} + +END { + unlink($tempfile) if defined $tempfile; +} +#exit 1
View file
x264-snapshot-20141218-2245.tar.bz2/x264.c -> x264-snapshot-20150804-2245.tar.bz2/x264.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264: top-level x264cli functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -209,6 +209,13 @@ #endif 0 }; +static const char * const chroma_format_names[] = +{ + [0] = "all", + [X264_CSP_I420] = "i420", + [X264_CSP_I422] = "i422", + [X264_CSP_I444] = "i444" +}; static const char * const range_names[] = { "auto", "tv", "pc", 0 }; @@ -325,7 +332,8 @@ #else printf( "using an unknown compiler\n" ); #endif - printf( "configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, X264_CHROMA_FORMAT ? (output_csp_names[0]+1) : "all" ); + printf( "x264 configuration: --bit-depth=%d --chroma-format=%s\n", X264_BIT_DEPTH, chroma_format_names[X264_CHROMA_FORMAT] ); + printf( "libx264 configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, chroma_format_names[x264_chroma_format] ); printf( "x264 license: " ); #if HAVE_GPL printf( "GPL version 2 or later\n" ); @@ -533,7 +541,7 @@ " Overrides all settings.\n" ); H2( #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 " - baseline:\n" " --no-8x8dct --bframes 0 --no-cabac\n" " --cqm flat --weightp 0\n" @@ -561,7 +569,7 @@ else H0( " - " #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 "baseline,main,high," #endif "high10," @@ -703,7 +711,9 @@ " - 2: row alternation - L and R are interlaced by row\n" " - 3: side by side - L is on the left, R on the right\n" " - 4: top bottom - L is on top, R on bottom\n" - " - 5: frame alternation - one view per frame\n" ); + " - 5: frame alternation - one view per frame\n" + " - 6: mono - 2D frame without any frame packing\n" + " - 7: tile format - L is on top-left, R split across\n" ); H0( "\n" ); H0( "Ratecontrol:\n" ); H0( "\n" ); @@ -726,7 +736,8 @@ H2( " --aq-mode <integer> AQ method [%d]\n" " - 0: Disabled\n" " - 1: Variance AQ (complexity mask)\n" - " - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode ); + " - 2: Auto-variance AQ\n" + " - 3: Auto-variance AQ with bias to dark scenes\n", defaults->rc.i_aq_mode ); H1( " --aq-strength <float> Reduces blocking and blurring in flat and\n" " textured areas. [%.1f]\n", defaults->rc.f_aq_strength ); H1( "\n" ); @@ -1286,11 +1297,11 @@ /* force the output csp to what the user specified (or the default) */ param->i_csp = info->csp; int csp = info->csp & X264_CSP_MASK; - if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) ) + if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp >= X264_CSP_I422) ) param->i_csp = X264_CSP_I420; - else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) ) + else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp >= X264_CSP_I444) ) param->i_csp = X264_CSP_I422; - else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) ) + else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp >= X264_CSP_BGR) ) param->i_csp = X264_CSP_I444; else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) ) param->i_csp = X264_CSP_RGB;
View file
x264-snapshot-20141218-2245.tar.bz2/x264.h -> x264-snapshot-20150804-2245.tar.bz2/x264.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264.h: x264 public header ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -29,7 +29,7 @@ #define X264_X264_H #if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\ - !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) + !defined(_SYS_STDINT_H_) && !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") # else @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 142 +#define X264_BUILD 148 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -129,8 +129,8 @@ #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */ #define X264_CPU_XOP 0x0000800 /* AMD XOP */ #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x0002000 /* AVX2 */ -#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */ +#define X264_CPU_FMA3 0x0002000 /* FMA3 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ #define X264_CPU_BMI1 0x0008000 /* BMI1 */ #define X264_CPU_BMI2 0x0010000 /* BMI2 */ /* x86 modifiers */ @@ -158,6 +158,9 @@ #define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ #define X264_CPU_ARMV8 0x0000008 +/* MIPS */ +#define X264_CPU_MSA 0x0000001 /* MIPS MSA */ + /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ @@ -183,6 +186,7 @@ #define X264_AQ_NONE 0 #define X264_AQ_VARIANCE 1 #define X264_AQ_AUTOVARIANCE 2 +#define X264_AQ_AUTOVARIANCE_BIASED 3 #define X264_B_ADAPT_NONE 0 #define X264_B_ADAPT_FAST 1 #define X264_B_ADAPT_TRELLIS 2 @@ -213,16 +217,17 @@ #define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */ #define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */ #define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */ -#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */ -#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */ -#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000a /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000c /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000d /* end of list */ +#define X264_CSP_NV21 0x0004 /* yuv 4:2:0, with one y plane and one packed v+u */ +#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ +#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ +#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */ +#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */ +#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ +#define X264_CSP_MAX 0x000e /* end of list 
*/ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ @@ -234,7 +239,7 @@ #define X264_TYPE_BREF 0x0004 /* Non-disposable B-frame */ #define X264_TYPE_B 0x0005 #define X264_TYPE_KEYFRAME 0x0006 /* IDR or I depending on b_open_gop option */ -#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR) +#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR || (x)==X264_TYPE_KEYFRAME) #define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF) /* Log level */ @@ -789,8 +794,6 @@ /* In: force picture type (if not auto) * If x264 encoding parameters are violated in the forcing of picture types, * x264 will correct the input picture type and log a warning. - * The quality of frametype decisions may suffer if a great deal of fine-grained - * mixing of auto and forced frametypes is done. * Out: type of the picture encoded */ int i_type; /* In: force quantizer for != X264_QP_AUTO */
View file
x264-snapshot-20141218-2245.tar.bz2/x264cli.h -> x264-snapshot-20150804-2245.tar.bz2/x264cli.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264cli.h: x264cli common ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/x264dll.c -> x264-snapshot-20150804-2245.tar.bz2/x264dll.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264dll: x264 DLLMain for win32 ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> *
View file
x264-snapshot-20141218-2245.tar.bz2/x264res.rc -> x264-snapshot-20150804-2245.tar.bz2/x264res.rc
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264res.rc: windows resource file ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Henrik Gramner <henrik@gramner.com> * @@ -60,7 +60,7 @@ #endif VALUE "FileVersion", X264_POINTVER VALUE "InternalName", "x264" - VALUE "LegalCopyright", "Copyright (C) 2003-2014 x264 project" + VALUE "LegalCopyright", "Copyright (C) 2003-2015 x264 project" #ifdef DLL VALUE "OriginalFilename", "libx264-" xstr(X264_BUILD) ".dll" #else