libx264
Changes of Revision 10
libx264.changes
Changed
@@ -1,4 +1,9 @@
 -------------------------------------------------------------------
+Wed Aug 5 13:04:18 UTC 2015 - idonmez@suse.com
+
+- update to 20150804 snapshot
+
+-------------------------------------------------------------------
 Sun Mar 1 09:33:42 UTC 2015 - i@margueirte.su
 
 - update version 20141218
libx264.spec
Changed
@@ -16,8 +16,8 @@
 #
-%define soname 142
-%define svn 20141218
+%define soname 148
+%define svn 20150804
 Name: libx264
 Version: 0.%{soname}svn%{svn}
 Release: 0
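For reference: with these defines the Version tag expands to 0.148svn20150804, so the RPM version now carries both the new library soversion (148) and the date of the snapshot (20150804) being packaged.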
x264-snapshot-20141218-2245.tar.bz2/extras/gas-preprocessor.pl
Deleted
@@ -1,253 +0,0 @@ -#!/usr/bin/env perl -# by David Conrad -# This code is licensed under GPLv2 or later; go to gnu.org to read it -# (not that it much matters for an asm preprocessor) -# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" -use strict; - -# Apple's gas is ancient and doesn't support modern preprocessing features like -# .rept and has ugly macro syntax, among other things. Thus, this script -# implements the subset of the gas preprocessor used by x264 and ffmpeg -# that isn't supported by Apple's gas. - -my @gcc_cmd = @ARGV; -my @preprocess_c_cmd; - -if (grep /\.c$/, @gcc_cmd) { - # C file (inline asm?) - compile - @preprocess_c_cmd = (@gcc_cmd, "-S"); -} elsif (grep /\.S$/, @gcc_cmd) { - # asm file, just do C preprocessor - @preprocess_c_cmd = (@gcc_cmd, "-E"); -} else { - die "Unrecognized input filetype"; -} -@gcc_cmd = map { /\.[cS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; -@preprocess_c_cmd = map { /\.o$/ ? "-" : $_ } @preprocess_c_cmd; - -open(ASMFILE, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; - -my $current_macro = ''; -my %macro_lines; -my %macro_args; -my %macro_args_default; - -my @pass1_lines; - -# pass 1: parse .macro -# note that the handling of arguments is probably overly permissive vs. gas -# but it should be the same for valid cases -while (<ASMFILE>) { - # comment out unsupported directives - s/\.type/@.type/x; - s/\.func/@.func/x; - s/\.endfunc/@.endfunc/x; - s/\.ltorg/@.ltorg/x; - s/\.size/@.size/x; - s/\.fpu/@.fpu/x; - - # the syntax for these is a little different - s/\.global/.globl/x; - # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const - s/(.*)\.rodata/.const_data/x; - s/\.int/.long/x; - s/\.float/.single/x; - - # catch unknown section names that aren't mach-o style (with a comma) - if (/.section ([^,]*)$/) { - die ".section $1 unsupported; figure out the mach-o section name and add it"; - } - - # macros creating macros is not handled (is that valid?) - if (/\.macro\s+([\d\w\.]+)\s*(.*)/) { - $current_macro = $1; - - # commas in the argument list are optional, so only use whitespace as the separator - my $arglist = $2; - $arglist =~ s/,/ /g; - - my @args = split(/\s+/, $arglist); - foreach my $i (0 .. $#args) { - my @argpair = split(/=/, $args[$i]); - $macro_args{$current_macro}[$i] = $argpair[0]; - $argpair[0] =~ s/:vararg$//; - $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; - } - # ensure %macro_lines has the macro name added as a key - $macro_lines{$current_macro} = []; - } elsif (/\.endm/) { - if (!$current_macro) { - die "ERROR: .endm without .macro"; - } - $current_macro = ''; - } elsif ($current_macro) { - push(@{$macro_lines{$current_macro}}, $_); - } else { - expand_macros($_); - } -} - -sub expand_macros { - my $line = @_[0]; - if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { - push(@pass1_lines, $1); - my $macro = $2; - - # commas are optional here too, but are syntactically important because - # parameters can be blank - my @arglist = split(/,/, $3); - my @args; - foreach (@arglist) { - my @whitespace_split = split(/\s+/, $_); - if (!@whitespace_split) { - push(@args, ''); - } else { - foreach (@whitespace_split) { - if (length($_)) { - push(@args, $_); - } - } - } - } - - my %replacements; - if ($macro_args_default{$macro}){ - %replacements = %{$macro_args_default{$macro}}; - } - - # construct hashtable of text to replace - foreach my $i (0 .. 
$#args) { - my $argname = $macro_args{$macro}[$i]; - - if ($args[$i] =~ m/=/) { - # arg=val references the argument name - # XXX: I'm not sure what the expected behaviour if a lot of - # these are mixed with unnamed args - my @named_arg = split(/=/, $args[$i]); - $replacements{$named_arg[0]} = $named_arg[1]; - } elsif ($i > $#{$macro_args{$macro}}) { - # more args given than the macro has named args - # XXX: is vararg allowed on arguments before the last? - $argname = $macro_args{$macro}[-1]; - if ($argname =~ s/:vararg$//) { - $replacements{$argname} .= ", $args[$i]"; - } else { - die "Too many arguments to macro $macro"; - } - } else { - $argname =~ s/:vararg$//; - $replacements{$argname} = $args[$i]; - } - } - - # apply replacements as regex - foreach (@{$macro_lines{$macro}}) { - my $macro_line = $_; - # do replacements by longest first, this avoids wrong replacement - # when argument names are subsets of each other - foreach (reverse sort {length $a <=> length $b} keys %replacements) { - $macro_line =~ s/\\$_/$replacements{$_}/g; - } - $macro_line =~ s/\\\(\)//g; # remove \() - expand_macros($macro_line); - } - } else { - push(@pass1_lines, $line); - } -} - -close(ASMFILE) or exit 1; -open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; - -my @sections; -my $num_repts; -my $rept_lines; - -my %literal_labels; # for ldr <reg>, =<expr> -my $literal_num = 0; - -# pass 2: parse .rept and .if variants -# NOTE: since we don't implement a proper parser, using .rept with a -# variable assigned from .set is not supported -foreach my $line (@pass1_lines) { - # textual comparison .if - # this assumes nothing else on the same line - if ($line =~ /\.ifnb\s+(.*)/) { - if ($1) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } elsif ($line =~ /\.ifb\s+(.*)/) { - if ($1) { - $line = ".if 0\n"; - } else { - $line = ".if 1\n"; - } - } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) { - if ($1 eq $2) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } - - # handle .previous (only with regard to .section not .subsection) - if ($line =~ /\.(section|text|const_data)/) { - push(@sections, $line); - } elsif ($line =~ /\.previous/) { - if (!$sections[-2]) { - die ".previous without a previous section"; - } - $line = $sections[-2]; - push(@sections, $line); - } - - # handle ldr <reg>, =<expr> - if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/) { - my $label = $literal_labels{$3}; - if (!$label) { - $label = ".Literal_$literal_num"; - $literal_num++; - $literal_labels{$3} = $label; - } - $line = "$1 ldr$2, $label\n"; - } elsif ($line =~ /\.ltorg/) { - foreach my $literal (keys %literal_labels) { - $line .= "$literal_labels{$literal}:\n .word $literal\n"; - } - %literal_labels = (); - } - - # @l -> lo16() @ha -> ha16() - $line =~ s/,\s+([^,]+)\@l(\s)/, lo16($1)$2/g; - $line =~ s/,\s+([^,]+)\@ha(\s)/, ha16($1)$2/g; - - if ($line =~ /\.rept\s+(.*)/) { - $num_repts = $1; - $rept_lines = "\n"; - - # handle the possibility of repeating another directive on the same line - # .endr on the same line is not valid, I don't know if a non-directive is - if ($num_repts =~ s/(\.\w+.*)//) { - $rept_lines .= "$1\n"; - } - $num_repts = eval($num_repts); - } elsif ($line =~ /\.endr/) { - for (1 .. 
$num_repts) { - print ASMFILE $rept_lines; - } - $rept_lines = ''; - } elsif ($rept_lines) { - $rept_lines .= $line; - } else { - print ASMFILE $line; - } -} - -print ASMFILE ".text\n"; -foreach my $literal (keys %literal_labels) { - print ASMFILE "$literal_labels{$literal}:\n .word $literal\n"; -} - -close(ASMFILE) or exit 1;
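In short, the dropped gas-preprocessor.pl emulated the parts of the GNU gas preprocessor that Apple's assembler lacked at the time: it expanded .macro/.endm and .rept/.endr bodies itself, handled the .ifb/.ifnb/.ifc variants, commented out unsupported directives such as .func, .size and .ltorg, rewrote .global to .globl and .rodata sections to Mach-O .const_data, and turned ldr <reg>, =<expr> loads into .Literal_N constant-pool entries.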
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts
Deleted
-(directory)
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Deleted
@@ -1,85 +0,0 @@ -#ifndef __DATA_TYPE_CONVERSIONS_H__ -#define __DATA_TYPE_CONVERSIONS_H__ - -#include <stdint.h> -#include <wchar.h> - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus - -typedef int64_t __int64; -typedef int32_t __int32; -#ifdef __cplusplus -typedef bool BOOL; -#else -typedef uint32_t BOOL; -#endif // __cplusplus -typedef void* HMODULE; -typedef void* LPVOID; -typedef void* PVOID; -typedef PVOID HANDLE; -typedef HANDLE HWND; -typedef HANDLE HINSTANCE; -typedef void* HDC; -typedef void* HBITMAP; -typedef void* HICON; -typedef void* HFONT; -typedef void* HGDIOBJ; -typedef void* HBRUSH; -typedef void* HMMIO; -typedef void* HACMSTREAM; -typedef void* HACMDRIVER; -typedef void* HIC; -typedef void* HACMOBJ; -typedef HACMSTREAM* LPHACMSTREAM; -typedef void* HACMDRIVERID; -typedef void* LPHACMDRIVER; -typedef unsigned char BYTE; -typedef BYTE* LPBYTE; -typedef char TCHAR; -typedef TCHAR* LPTSTR; -typedef const TCHAR* LPCTSTR; -typedef char* LPSTR; -typedef LPSTR LPOLESTR; -typedef const char* LPCSTR; -typedef LPCSTR LPCOLESTR; -typedef wchar_t WCHAR; -typedef unsigned short WORD; -typedef unsigned int UINT; -typedef UINT MMRESULT; -typedef uint32_t DWORD; -typedef DWORD COLORREF; -typedef DWORD FOURCC; -typedef DWORD HRESULT; -typedef DWORD* LPDWORD; -typedef DWORD* DWORD_PTR; -typedef int32_t LONG; -typedef int32_t* LONG_PTR; -typedef LONG_PTR LRESULT; -typedef uint32_t ULONG; -typedef uint32_t* ULONG_PTR; -//typedef __int64_t intptr_t; -typedef uint64_t _fsize_t; - - -// -// Structures -// - -typedef struct _GUID { - DWORD Data1; - WORD Data2; - WORD Data3; - BYTE Data4[8]; -} GUID; - -typedef GUID REFIID; -typedef GUID CLSID; -typedef CLSID* LPCLSID; -typedef GUID IID; - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus -#endif // __DATA_TYPE_CONVERSIONS_H__
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Deleted
@@ -1,77 +0,0 @@ -#ifndef __WINDOWS2LINUX_H__ -#define __WINDOWS2LINUX_H__ - -/* - * LINUX SPECIFIC DEFINITIONS -*/ -// -// Data types conversions -// -#include <stdlib.h> -#include <string.h> -#include "basicDataTypeConversions.h" - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus -// -// purposefully define the following MSFT definitions -// to mean nothing (as they do not mean anything on Linux) -// -#define __stdcall -#define __cdecl -#define noreturn -#define __declspec(x) -#define STDAPI extern "C" HRESULT -#define STDMETHODIMP HRESULT __stdcall -#define STDMETHODIMP_(x) x __stdcall - -#define STDMETHOD(x) virtual HRESULT x -#define STDMETHOD_(a, x) virtual a x - -#ifndef TRUE -#define TRUE true -#endif - -#ifndef FALSE -#define FALSE false -#endif - -#define S_OK (0x00000000) -#define S_FALSE (0x00000001) -#define E_NOINTERFACE (0X80004002) -#define E_POINTER (0x80004003) -#define E_FAIL (0x80004005) -#define E_OUTOFMEMORY (0x8007000E) - -#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) -#define FAILED(hr) ((hr) & 0x80000000) -#define SUCCEEDED(hr) (!FAILED(hr)) - - -// -// Functions -// -#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) -#define MAKEWORD(a,b) ((a << 8) | (b)) - -#define lstrlen strlen -#define lstrcpy strcpy -#define lstrcmpi strcasecmp -#define _stricmp strcasecmp -#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) -#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) -// Windows uses (new, old) ordering but GCC has (old, new) -#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) - -#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) -#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) -#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) - -#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus - -#endif // __WINDOWS2LINUX_H__
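Two details of the removed shim are worth noting: MulDiv performs the Win32-style rounded division in 64-bit arithmetic, e.g. with hypothetical values MulDiv(3, 10, 4) = (3*10 + 4/2)/4 = 8, that is 7.5 rounded to the nearest integer; and InterlockedCompareExchange swaps its comparand/exchange arguments because GCC's __sync_val_compare_and_swap takes (old, new) while the Windows API takes (new, old), as the comment in the header points out.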
x264-snapshot-20141218-2245.tar.bz2/AUTHORS -> x264-snapshot-20150804-2245.tar.bz2/AUTHORS
Changed
@@ -1,8 +1,8 @@
 # Contributors to x264
-# 
+#
 # The format of this file was inspired by the Linux kernel CREDITS file.
 # Authors are listed alphabetically.
-# 
+#
 # The fields are: name (N), email (E), web-address (W), CVS account login (C),
 # PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
x264-snapshot-20141218-2245.tar.bz2/Makefile -> x264-snapshot-20150804-2245.tar.bz2/Makefile
Changed
@@ -87,12 +87,12 @@ endif X86SRC = $(X86SRC0:%=common/x86/%) -ifeq ($(ARCH),X86) +ifeq ($(SYS_ARCH),X86) ARCH_X86 = yes ASMSRC = $(X86SRC) common/x86/pixel-32.asm endif -ifeq ($(ARCH),X86_64) +ifeq ($(SYS_ARCH),X86_64) ARCH_X86 = yes ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm endif @@ -106,7 +106,7 @@ endif # AltiVec optims -ifeq ($(ARCH),PPC) +ifeq ($(SYS_ARCH),PPC) ifneq ($(AS),) SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \ common/ppc/quant.c common/ppc/deblock.c \ @@ -115,7 +115,7 @@ endif # NEON optims -ifeq ($(ARCH),ARM) +ifeq ($(SYS_ARCH),ARM) ifneq ($(AS),) ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \ common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \ @@ -126,20 +126,32 @@ endif # AArch64 NEON optims -ifeq ($(ARCH),AARCH64) +ifeq ($(SYS_ARCH),AARCH64) ifneq ($(AS),) -ASMSRC += common/aarch64/dct-a.S \ +ASMSRC += common/aarch64/bitstream-a.S \ + common/aarch64/cabac-a.S \ + common/aarch64/dct-a.S \ common/aarch64/deblock-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S -SRCS += common/aarch64/mc-c.c \ +SRCS += common/aarch64/asm-offsets.c \ + common/aarch64/mc-c.c \ common/aarch64/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif +# MSA optims +ifeq ($(SYS_ARCH),MIPS) +ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),) +SRCS += common/mips/mc-c.c common/mips/dct-c.c \ + common/mips/deblock-c.c common/mips/pixel-c.c \ + common/mips/predict-c.c common/mips/quant-c.c +endif +endif + ifneq ($(HAVE_GETOPT_LONG),1) SRCCLI += extras/getopt.c endif @@ -264,7 +276,7 @@ rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean - rm -f config.mak x264_config.h config.h config.log x264.pc x264.def + rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir)
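Besides the mechanical ARCH to SYS_ARCH rename applied to every per-architecture conditional, the new MIPS block only adds the MSA-optimized C sources when the findstring test sees "HAVE_MSA 1" in $(CONFIG) (presumably the contents of the configure-generated config.h), so a plain MIPS build without MSA support keeps using the generic C paths.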
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.c
Added
@@ -0,0 +1,42 @@ +/***************************************************************************** + * asm-offsets.c: check asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "asm-offsets.h" + +#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \ +{ \ + int m_##m[2 * (offsetof(s, m) == o) - 1]; \ +} + +X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW); +X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE); +X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE); +X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING); +X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START); +X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P); +X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END); +X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED); +X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
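The X264_CHECK_OFFSET macro is a compile-time assertion: the array dimension evaluates to +1 when offsetof(s, m) matches the expected constant and to -1 otherwise, and a negative array size makes the compiler reject the build. A minimal standalone sketch of the same trick, using a hypothetical struct rather than x264_cabac_t:

#include <stddef.h>

/* Hypothetical two-field coder state; the offsets below assume a 4-byte int. */
typedef struct { int low; int range; } coder_t;

/* Array size is 1 if the offset matches, -1 (a compile error) if it does not. */
#define CHECK_OFFSET(s, m, o) \
    struct check_##s##_##m { int pad[2 * (offsetof(s, m) == (o)) - 1]; }

CHECK_OFFSET(coder_t, low,   0);
CHECK_OFFSET(coder_t, range, 4);  /* writing 8 here would fail to compile */

int main(void) { return 0; }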
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.h
Added
@@ -0,0 +1,39 @@ +/***************************************************************************** + * asm-offsets.h: asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_ASM_OFFSETS_H +#define X264_AARCH64_ASM_OFFSETS_H + +#define CABAC_I_LOW 0x00 +#define CABAC_I_RANGE 0x04 +#define CABAC_I_QUEUE 0x08 +#define CABAC_I_BYTES_OUTSTANDING 0x0c +#define CABAC_P_START 0x10 +#define CABAC_P 0x18 +#define CABAC_P_END 0x20 +#define CABAC_F8_BITS_ENCODED 0x30 +#define CABAC_STATE 0x34 + +#endif
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm.S
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * asm.S: AArch64 utility macros
  *****************************************************************************
- * Copyright (C) 2008-2014 x264 project
+ * Copyright (C) 2008-2015 x264 project
  *
  * Authors: Mans Rullgard <mans@mansr.com>
  *          David Conrad <lessen42@gmail.com>
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/bitstream-a.S
Added
@@ -0,0 +1,82 @@ +/***************************************************************************** + * bitstream-a.S: aarch64 bitstream functions + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" + +function x264_nal_escape_neon, export=1 + movi v0.16b, #0xff + movi v4.16b, #4 + mov w3, #3 + subs x6, x1, x2 + cbz x6, 99f +0: + cmn x6, #15 + b.lt 16f + mov x1, x2 + b 100f +16: + ld1 {v1.16b}, [x1], #16 + ext v2.16b, v0.16b, v1.16b, #14 + ext v3.16b, v0.16b, v1.16b, #15 + cmhi v7.16b, v4.16b, v1.16b + cmeq v5.16b, v2.16b, #0 + cmeq v6.16b, v3.16b, #0 + and v5.16b, v5.16b, v7.16b + and v5.16b, v5.16b, v6.16b + shrn v7.8b, v5.8h, #4 + mov x7, v7.d[0] + cbz x7, 16f + mov x6, #-16 +100: + umov w5, v0.b[14] + umov w4, v0.b[15] + orr w5, w4, w5, lsl #8 +101: + ldrb w4, [x1, x6] + orr w9, w4, w5, lsl #16 + cmp w9, #3 + b.hi 102f + strb w3, [x0], #1 + orr w5, w3, w5, lsl #8 +102: + adds x6, x6, #1 + strb w4, [x0], #1 + orr w5, w4, w5, lsl #8 + b.lt 101b + subs x6, x1, x2 + lsr w9, w5, #8 + mov v0.b[14], w9 + mov v0.b[15], w5 + b.lt 0b + + ret +16: + subs x6, x1, x2 + st1 {v1.16b}, [x0], #16 + mov v0.16b, v1.16b + b.lt 0b +99: + ret +endfunc
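x264_nal_escape_neon vectorizes H.264 start-code emulation prevention: whenever two consecutive zero bytes in the payload would be followed by a byte with value 3 or less, an extra 0x03 byte is inserted. A scalar C sketch of that rule (written here for clarity, not copied from x264's reference implementation):

#include <stdint.h>

/* Copy [src, end) to dst, inserting 0x03 after any pair of zero bytes that
 * would otherwise be followed by a byte <= 3. Returns the new end of dst. */
uint8_t *nal_escape(uint8_t *dst, const uint8_t *src, const uint8_t *end)
{
    if (src < end) *dst++ = *src++;  /* the first two bytes never need an escape */
    if (src < end) *dst++ = *src++;
    while (src < end)
    {
        if (src[0] <= 0x03 && !dst[-1] && !dst[-2])
            *dst++ = 0x03;
        *dst++ = *src++;
    }
    return dst;
}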
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/cabac-a.S
Added
@@ -0,0 +1,122 @@ +/***************************************************************************** + * cabac-a.S: aarch64 cabac + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" +#include "asm-offsets.h" + +// w11 holds x264_cabac_t.i_low +// w12 holds x264_cabac_t.i_range + +function x264_cabac_encode_decision_asm, export=1 + movrel x8, X(x264_cabac_range_lps) + movrel x9, X(x264_cabac_transition) + add w10, w1, #CABAC_STATE + ldrb w3, [x0, x10] // i_state + ldr w12, [x0, #CABAC_I_RANGE] + and x4, x3, #~1 + asr w5, w12, #6 + add x8, x8, x4, lsl #1 + sub w5, w5, #4 + eor w6, w2, w3 // b ^ i_state + ldrb w4, [x8, x5] // i_range_lps + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, w4 + tbz w6, #0, 1f // (b ^ i_state) & 1 + add w11, w11, w12 + mov w12, w4 +1: + orr w4, w2, w3, lsl #1 + ldrb w9, [x9, x4] + strb w9, [x0, x10] // i_state + +cabac_encode_renorm: + clz w5, w12 + ldr w2, [x0, #CABAC_I_QUEUE] + sub w5, w5, #23 + lsl w12, w12, w5 + lsl w11, w11, w5 +2: + adds w2, w2, w5 + str w12, [x0, #CABAC_I_RANGE] + b.lt 0f +cabac_putbyte: + mov w13, #0x400 + add w12, w2, #10 + lsl w13, w13, w2 + asr w4, w11, w12 // out + sub w2, w2, #8 + sub w13, w13, #1 + subs w5, w4, #0xff + and w11, w11, w13 + ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + str w2, [x0, #CABAC_I_QUEUE] + b.ne 1f + + add w6, w6, #1 + str w11, [x0, #CABAC_I_LOW] + str w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + ret + +1: + ldr x7, [x0, #CABAC_P] + asr w5, w4, #8 // carry + ldrb w8, [x7, #-1] + add w8, w8, w5 + sub w5, w5, #1 + strb w8, [x7, #-1] + cbz w6, 3f +2: + subs w6, w6, #1 + strb w5, [x7], #1 + b.gt 2b +3: + strb w4, [x7], #1 + str wzr, [x0, #CABAC_I_BYTES_OUTSTANDING] + str x7, [x0, #CABAC_P] +0: + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function x264_cabac_encode_bypass_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + ldr w2, [x0, #CABAC_I_QUEUE] + and w1, w1, w12 + add w11, w1, w11, lsl #1 + adds w2, w2, #1 + b.ge cabac_putbyte + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function x264_cabac_encode_terminal_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, #2 + b cabac_encode_renorm +endfunc
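These CABAC routines access x264_cabac_t purely through the byte offsets defined in asm-offsets.h (CABAC_I_LOW, CABAC_I_RANGE, CABAC_P and so on), which is why the asm-offsets.c checks above exist: if the C struct layout ever changes, the build breaks loudly instead of the assembly silently reading the wrong fields.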
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** - * dct-a.S: AArch6464 transform and zigzag + * dct-a.S: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,25 @@ .byte 26,27, 28,29, 22,23, 30,31 endconst +const scan4x4_field, align=4 +.byte 0,1, 2,3, 8,9, 4,5 +.byte 6,7, 10,11, 12,13, 14,15 +endconst + +const sub4x4_frame, align=4 +.byte 0, 1, 4, 8 +.byte 5, 2, 3, 6 +.byte 9, 12, 13, 10 +.byte 7, 11, 14, 15 +endconst + +const sub4x4_field, align=4 +.byte 0, 4, 1, 8 +.byte 12, 5, 9, 13 +.byte 2, 6, 10, 14 +.byte 3, 7, 11, 15 +endconst + // sum = a + (b>>shift) sub = (a>>shift) - b .macro SUMSUB_SHR shift sum sub a b t0 t1 sshr \t0, \b, #\shift @@ -602,56 +622,99 @@ ret endfunc +.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 + ld1 {\t0\().8b}, [x1], x3 + ld1 {\t1\().8b}, [x2], x4 + ld1 {\t2\().8b}, [x1], x3 + ld1 {\t3\().8b}, [x2], x4 + usubl \t0\().8h, \t0\().8b, \t1\().8b + ld1 {\t4\().8b}, [x1], x3 + ld1 {\t5\().8b}, [x2], x4 + usubl \t1\().8h, \t2\().8b, \t3\().8b + ld1 {\t6\().8b}, [x1], x3 + ld1 {\t7\().8b}, [x2], x4 + add \dst\().8h, \t0\().8h, \t1\().8h + usubl \t2\().8h, \t4\().8b, \t5\().8b + usubl \t3\().8h, \t6\().8b, \t7\().8b + add \dst\().8h, \dst\().8h, \t2\().8h + add \dst\().8h, \dst\().8h, \t3\().8h +.endm + function x264_sub8x8_dct_dc_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE - ld1 {v16.8b}, [x1], x3 - ld1 {v17.8b}, [x2], x4 - usubl v16.8h, v16.8b, v17.8b - ld1 {v18.8b}, [x1], x3 - ld1 {v19.8b}, [x2], x4 - usubl v17.8h, v18.8b, v19.8b - ld1 {v20.8b}, [x1], x3 - ld1 {v21.8b}, [x2], x4 - usubl v18.8h, v20.8b, v21.8b - ld1 {v22.8b}, [x1], x3 - add v0.8h, v16.8h, v17.8h - ld1 {v23.8b}, [x2], x4 - usubl v19.8h, v22.8b, v23.8b - ld1 {v24.8b}, [x1], x3 - add v0.8h, v0.8h, v18.8h - ld1 {v25.8b}, [x2], x4 - usubl v20.8h, v24.8b, v25.8b - ld1 {v26.8b}, [x1], x3 - add v0.8h, v0.8h, v19.8h - ld1 {v27.8b}, [x2], x4 - usubl v21.8h, v26.8b, v27.8b - ld1 {v28.8b}, [x1], x3 - ld1 {v29.8b}, [x2], x4 - usubl v22.8h, v28.8b, v29.8b - ld1 {v30.8b}, [x1], x3 - add v1.8h, v20.8h, v21.8h - ld1 {v31.8b}, [x2], x4 - usubl v23.8h, v30.8b, v31.8b - add v1.8h, v1.8h, v22.8h - add v1.8h, v1.8h, v23.8h + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v0.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v0.8h - transpose v2.2d, v3.2d, v0.2d, v1.2d + st1 {v0.4h}, [x0] + ret +endfunc + +function x264_sub8x16_dct_dc_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v4.8h, v0.8h, v2.8h + addp v5.8h, v1.8h, v3.8h + + transpose v2.4s, v3.4s, 
v4.4s, v5.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + transpose v2.4s, v3.4s, v0.4s, v1.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + trn1 v2.2d, v0.2d, v1.2d + trn2 v3.2d, v1.2d, v0.2d addp v0.8h, v2.8h, v3.8h - addp v0.8h, v0.8h, v0.8h - st1 {v0.4h}, [x0] + st1 {v0.8h}, [x0] + ret +endfunc + +function x264_zigzag_interleave_8x8_cavlc_neon, export=1 + mov x3, #7 + movi v31.4s, #1 + ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 + ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 + umax v16.8h, v0.8h, v4.8h + umax v17.8h, v1.8h, v5.8h + umax v18.8h, v2.8h, v6.8h + umax v19.8h, v3.8h, v7.8h + st1 {v0.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v17.8h + umaxp v18.8h, v18.8h, v19.8h + st1 {v1.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v18.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + cmhi v16.4s, v16.4s, v31.4s + st1 {v3.8h}, [x0], #16 + and v16.16b, v16.16b, v31.16b + st1 {v7.8h}, [x0], #16 + st1 {v16.b}[0], [x2], #1 + st1 {v16.b}[4], [x2], x3 + st1 {v16.b}[8], [x2], #1 + st1 {v16.b}[12], [x2] ret endfunc @@ -664,3 +727,282 @@ st1 {v2.16b,v3.16b}, [x0] ret endfunc + +.macro zigzag_sub_4x4 f ac +function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1 + mov x9, #FENC_STRIDE + mov x4, #FDEC_STRIDE + movrel x5, sub4x4_\f + mov x6, x2 + ld1 {v0.s}[0], [x1], x9 + ld1 {v0.s}[1], [x1], x9 + ld1 {v0.s}[2], [x1], x9 + ld1 {v0.s}[3], [x1], x9 + ld1 {v16.16b}, [x5] + ld1 {v1.s}[0], [x2], x4 + ld1 {v1.s}[1], [x2], x4 + ld1 {v1.s}[2], [x2], x4 + ld1 {v1.s}[3], [x2], x4 + tbl v2.16b, {v0.16b}, v16.16b + tbl v3.16b, {v1.16b}, v16.16b + st1 {v0.s}[0], [x6], x4 + usubl v4.8h, v2.8b, v3.8b +.ifc \ac, ac + dup h7, v4.h[0] + ins v4.h[0], wzr + fmov w5, s7 + strh w5, [x3] +.endif + usubl2 v5.8h, v2.16b, v3.16b + st1 {v0.s}[1], [x6], x4 + umax v6.8h, v4.8h, v5.8h + umaxv h6, v6.8h + st1 {v0.s}[2], [x6], x4 + fmov w7, s6 + st1 {v0.s}[3], [x6], x4 + cmp w7, #0 + st1 {v4.8h,v5.8h}, [x0] + cset w0, ne + ret +endfunc +.endm + +zigzag_sub_4x4 field +zigzag_sub_4x4 field, ac +zigzag_sub_4x4 frame +zigzag_sub_4x4 frame, ac + +function x264_zigzag_scan_4x4_field_neon, export=1 + movrel x2, scan4x4_field + ld1 {v0.8h,v1.8h}, [x1] + ld1 {v16.16b}, [x2] + tbl v0.16b, {v0.16b}, v16.16b + st1 {v0.8h,v1.8h}, [x0] + ret +endfunc + +function x264_zigzag_scan_8x8_frame_neon, export=1 + movrel x2, scan8x8_frame + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b,v23.16b}, [x2], #32 + tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b + tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b + tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b + tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b + tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b + mov v25.h[6], v4.h[0] + mov v25.h[7], v5.h[0] + mov v26.h[0], v4.h[1] + mov v27.h[4], v7.h[0] + mov v28.h[7], v4.h[4] + mov v29.h[7], v3.h[6] + mov v30.h[0], v2.h[7] + mov v30.h[1], v3.h[7] + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], #32 + st1 {v30.8h,v31.8h}, [x0] + ret +endfunc + +#define Z(z) 2*(z), 2*(z)+1 +#define T(x,y) Z(x*8+y) +const scan8x8_frame, align=5 + .byte T(0,0), 
T(1,0), T(0,1), T(0,2) + .byte T(1,1), T(2,0), T(3,0), T(2,1) + .byte T(1,2), T(0,3), T(0,4), T(1,3) + .byte T(2,2), T(3,1), T(4,0), T(5,0) + .byte T(4,1), T(3,2), T(2,3), T(1,4) + .byte T(0,5), T(0,6), T(1,5), T(2,4) +#undef T +#define T(x,y) Z((x-3)*8+y) + .byte T(3,3), T(4,2), T(5,1), T(6,0) + .byte T(7,0), T(6,1), T(5,2), T(4,3) +#undef T +#define T(x,y) Z((x-0)*8+y) + .byte T(3,4), T(2,5), T(1,6), T(0,7) + .byte T(1,7), T(2,6), T(3,5), T(4,4) +#undef T +#define T(x,y) Z((x-4)*8+y) + .byte T(5,3), T(6,2), T(7,1), T(7,2) + .byte T(6,3), T(5,4), T(4,5), T(3,6) + .byte T(2,7), T(3,7), T(4,6), T(5,5) + .byte T(6,4), T(7,3), T(7,4), T(6,5) + .byte T(5,6), T(4,7), T(5,7), T(6,6) + .byte T(7,5), T(7,6), T(6,7), T(7,7) +endconst + +function x264_zigzag_scan_8x8_field_neon, export=1 + movrel x2, scan8x8_field + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b}, [x2] + ext v31.16b, v7.16b, v7.16b, #4 + tbl v24.16b, {v0.16b,v1.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b + tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b + tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b + tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b + ext v31.16b, v6.16b, v31.16b, #12 + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], #32 + st1 {v30.8h,v31.8h}, [x0] + ret +endfunc + +.macro zigzag_sub8x8 f +function x264_zigzag_sub_8x8_\f\()_neon, export=1 + movrel x4, sub8x8_\f + mov x5, #FENC_STRIDE + mov x6, #FDEC_STRIDE + mov x7, x2 + ld1 {v0.d}[0], [x1], x5 + ld1 {v0.d}[1], [x1], x5 + ld1 {v1.d}[0], [x1], x5 + ld1 {v1.d}[1], [x1], x5 + ld1 {v2.d}[0], [x1], x5 + ld1 {v2.d}[1], [x1], x5 + ld1 {v3.d}[0], [x1], x5 + ld1 {v3.d}[1], [x1] + ld1 {v4.d}[0], [x2], x6 + ld1 {v4.d}[1], [x2], x6 + ld1 {v5.d}[0], [x2], x6 + ld1 {v5.d}[1], [x2], x6 + ld1 {v6.d}[0], [x2], x6 + ld1 {v6.d}[1], [x2], x6 + ld1 {v7.d}[0], [x2], x6 + ld1 {v7.d}[1], [x2] + ld1 {v16.16b,v17.16b}, [x4], #32 + ld1 {v18.16b,v19.16b}, [x4], #32 + tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b + tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b + tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b + tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b + tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b + tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b + tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b + tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b + usubl v4.8h, v24.8b, v28.8b + usubl2 v5.8h, v24.16b, v28.16b + usubl v6.8h, v25.8b, v29.8b + usubl2 v7.8h, v25.16b, v29.16b + usubl v16.8h, v26.8b, v30.8b + usubl2 v17.8h, v26.16b, v30.16b + usubl v18.8h, v27.8b, v31.8b + usubl2 v19.8h, v27.16b, v31.16b + umax v20.8h, v4.8h, v5.8h + umax v21.8h, v6.8h, v7.8h + umax v22.8h, v16.8h, v17.8h + umax v23.8h, v18.8h, v19.8h + umax v20.8h, v20.8h, v21.8h + umax v21.8h, v22.8h, v23.8h + umax v20.8h, v20.8h, v21.8h + umaxv h22, v20.8h + st1 {v0.d}[0], [x7], x6 + st1 {v0.d}[1], [x7], x6 + st1 {v1.d}[0], [x7], x6 + st1 {v1.d}[1], [x7], x6 + st1 {v2.d}[0], [x7], x6 + st1 {v2.d}[1], [x7], x6 + st1 {v3.d}[0], [x7], x6 + st1 {v3.d}[1], [x7] + st1 {v4.8h,v5.8h}, [x0], #32 + st1 {v6.8h,v7.8h}, [x0], #32 + st1 {v16.8h,v17.8h}, [x0], #32 + st1 {v18.8h,v19.8h}, [x0] + fmov w9, s22 + cmp w9, #0 + cset w0, ne + ret 
+endfunc +.endm + +zigzag_sub8x8 field +zigzag_sub8x8 frame + +#undef T +#define T(x,y) Z(x*8+y) +const scan8x8_field, align=5 + .byte T(0,0), T(0,1), T(0,2), T(1,0) + .byte T(1,1), T(0,3), T(0,4), T(1,2) + .byte T(2,0), T(1,3), T(0,5), T(0,6) + .byte T(0,7), T(1,4), T(2,1), T(3,0) +#undef T +#define T(x,y) Z((x-1)*8+y) + .byte T(2,2), T(1,5), T(1,6), T(1,7) + .byte T(2,3), T(3,1), T(4,0), T(3,2) +#undef T +#define T(x,y) Z((x-2)*8+y) + .byte T(2,4), T(2,5), T(2,6), T(2,7) + .byte T(3,3), T(4,1), T(5,0), T(4,2) +#undef T +#define T(x,y) Z((x-3)*8+y) + .byte T(3,4), T(3,5), T(3,6), T(3,7) + .byte T(4,3), T(5,1), T(6,0), T(5,2) +#undef T +#define T(x,y) Z((x-4)*8+y) + .byte T(4,4), T(4,5), T(4,6), T(4,7) + .byte T(5,3), T(6,1), T(6,2), T(5,4) +#undef T +#define T(x,y) Z((x-5)*8+y) + .byte T(5,5), T(5,6), T(5,7), T(6,3) + .byte T(7,0), T(7,1), T(6,4), T(6,5) +endconst + + +#undef T +#define T(y,x) x*8+y +const sub8x8_frame, align=5 + .byte T(0,0), T(1,0), T(0,1), T(0,2) + .byte T(1,1), T(2,0), T(3,0), T(2,1) + .byte T(1,2), T(0,3), T(0,4), T(1,3) + .byte T(2,2), T(3,1), T(4,0), T(5,0) + .byte T(4,1), T(3,2), T(2,3), T(1,4) + .byte T(0,5), T(0,6), T(1,5), T(2,4) + .byte T(3,3), T(4,2), T(5,1), T(6,0) + .byte T(7,0), T(6,1), T(5,2), T(4,3) + .byte T(3,4), T(2,5), T(1,6), T(0,7) + .byte T(1,7), T(2,6), T(3,5), T(4,4) + .byte T(5,3), T(6,2), T(7,1), T(7,2) + .byte T(6,3), T(5,4), T(4,5), T(3,6) + .byte T(2,7), T(3,7), T(4,6), T(5,5) + .byte T(6,4), T(7,3), T(7,4), T(6,5) + .byte T(5,6), T(4,7), T(5,7), T(6,6) + .byte T(7,5), T(7,6), T(6,7), T(7,7) +endconst + +const sub8x8_field, align=5 + .byte T(0,0), T(0,1), T(0,2), T(1,0) + .byte T(1,1), T(0,3), T(0,4), T(1,2) + .byte T(2,0), T(1,3), T(0,5), T(0,6) + .byte T(0,7), T(1,4), T(2,1), T(3,0) + .byte T(2,2), T(1,5), T(1,6), T(1,7) + .byte T(2,3), T(3,1), T(4,0), T(3,2) + .byte T(2,4), T(2,5), T(2,6), T(2,7) + .byte T(3,3), T(4,1), T(5,0), T(4,2) + .byte T(3,4), T(3,5), T(3,6), T(3,7) + .byte T(4,3), T(5,1), T(6,0), T(5,2) + .byte T(4,4), T(4,5), T(4,6), T(4,7) + .byte T(5,3), T(6,1), T(6,2), T(5,4) + .byte T(5,5), T(5,6), T(5,7), T(6,3) + .byte T(7,0), T(7,1), T(6,4), T(6,5) + .byte T(6,6), T(6,7), T(7,2), T(7,3) + .byte T(7,4), T(7,5), T(7,6), T(7,7) +endconst
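The zigzag kernels are table-driven shuffles: each scan constant lists, for every output coefficient, the two byte indices of the matching 16-bit input coefficient (the Z(z) macro emits 2*z, 2*z+1), so a few tbl instructions permute a whole block at once. The scalar equivalent is just a permutation loop; a minimal sketch using the standard 4x4 frame scan order (the same order as the sub4x4_frame table above):

#include <stdint.h>

typedef int16_t dctcoef;

/* 4x4 frame (progressive) zigzag order. */
static const uint8_t scan4x4_frame[16] =
    { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };

void zigzag_scan_4x4_frame(dctcoef level[16], const dctcoef dct[16])
{
    for (int i = 0; i < 16; i++)
        level[i] = dct[scan4x4_frame[i]];
}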
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** - * dct.h: AArch64 transform and zigzag + * dct.h: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,7 @@ void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); @@ -48,5 +50,18 @@ void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] ); + +int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); +int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); + +int x264_zigzag_sub_8x8_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); + +void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: aarch64 deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * Janne Grunau <janne-x264@jannau.net> @@ -180,6 +180,202 @@ ret endfunc +.macro h264_loop_filter_start_intra + orr w4, w2, w3 + cmp w4, #0 + b.ne 1f + ret +1: + dup v30.16b, w2 // alpha + dup v31.16b, w3 // beta +.endm + +.macro h264_loop_filter_luma_intra + uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) + uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) + uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) + cmhi v19.16b, v30.16b, v16.16b // < alpha + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + movi v29.16b, #2 + ushr v30.16b, v30.16b, #2 // alpha >> 2 + add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 + cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + + and v19.16b, v19.16b, v17.16b + and v19.16b, v19.16b, v18.16b + shrn v20.8b, v19.8h, #4 + mov x4, v20.d[0] + cbz x4, 9f + + ushll v20.8h, v6.8b, #1 + ushll v22.8h, v1.8b, #1 + ushll2 v21.8h, v6.16b, #1 + ushll2 v23.8h, v1.16b, #1 + uaddw v20.8h, v20.8h, v7.8b + uaddw v22.8h, v22.8h, v0.8b + uaddw2 v21.8h, v21.8h, v7.16b + uaddw2 v23.8h, v23.8h, v0.16b + uaddw v20.8h, v20.8h, v1.8b + uaddw v22.8h, v22.8h, v6.8b + uaddw2 v21.8h, v21.8h, v1.16b + uaddw2 v23.8h, v23.8h, v6.16b + + rshrn v24.8b, v20.8h, #2 // p0'_1 + rshrn v25.8b, v22.8h, #2 // q0'_1 + rshrn2 v24.16b, v21.8h, #2 // p0'_1 + rshrn2 v25.16b, v23.8h, #2 // q0'_1 + + uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) + uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + and v17.16b, v16.16b, v17.16b // if_2 && if_3 + and v18.16b, v16.16b, v18.16b // if_2 && if_4 + + not v30.16b, v17.16b + not v31.16b, v18.16b + + and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) + and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) + + and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 + and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 + + //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 + uaddl v26.8h, v5.8b, v7.8b + uaddl2 v27.8h, v5.16b, v7.16b + uaddw v26.8h, v26.8h, v0.8b + uaddw2 v27.8h, v27.8h, v0.16b + add v20.8h, v20.8h, v26.8h + add v21.8h, v21.8h, v27.8h + uaddw v20.8h, v20.8h, v0.8b + uaddw2 v21.8h, v21.8h, v0.16b + rshrn v20.8b, v20.8h, #3 // p0'_2 + rshrn2 v20.16b, v21.8h, #3 // p0'_2 + uaddw v26.8h, v26.8h, v6.8b + uaddw2 v27.8h, v27.8h, v6.16b + rshrn v21.8b, v26.8h, #2 // p1'_2 + rshrn2 v21.16b, v27.8h, #2 // p1'_2 + uaddl v28.8h, v4.8b, v5.8b + uaddl2 v29.8h, v4.16b, v5.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v19.8b, v28.8h, #3 // p2'_2 + rshrn2 v19.16b, v29.8h, #3 // p2'_2 + + //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 + uaddl v26.8h, v2.8b, v0.8b + uaddl2 v27.8h, v2.16b, v0.16b + uaddw v26.8h, v26.8h, v7.8b + uaddw2 v27.8h, v27.8h, v7.16b + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + uaddw v22.8h, v22.8h, v7.8b + uaddw2 v23.8h, v23.8h, v7.16b + rshrn v22.8b, v22.8h, #3 // q0'_2 + rshrn2 v22.16b, v23.8h, #3 // q0'_2 + uaddw v26.8h, v26.8h, v1.8b + uaddw2 v27.8h, v27.8h, v1.16b + rshrn v23.8b, v26.8h, #2 // q1'_2 + rshrn2 v23.16b, v27.8h, #2 // q1'_2 + uaddl v28.8h, v2.8b, v3.8b + uaddl2 v29.8h, v2.16b, v3.16b + shl v28.8h, 
v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v26.8b, v28.8h, #3 // q2'_2 + rshrn2 v26.16b, v29.8h, #3 // q2'_2 + + bit v7.16b, v24.16b, v30.16b // p0'_1 + bit v0.16b, v25.16b, v31.16b // q0'_1 + bit v7.16b, v20.16b, v17.16b // p0'_2 + bit v6.16b, v21.16b, v17.16b // p1'_2 + bit v5.16b, v19.16b, v17.16b // p2'_2 + bit v0.16b, v22.16b, v18.16b // q0'_2 + bit v1.16b, v23.16b, v18.16b // q1'_2 + bit v2.16b, v26.16b, v18.16b // q2'_2 +.endm + +function x264_deblock_v_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + ld1 {v0.16b}, [x0], x1 // q0 + ld1 {v1.16b}, [x0], x1 // q1 + ld1 {v2.16b}, [x0], x1 // q2 + ld1 {v3.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #3 + ld1 {v4.16b}, [x0], x1 // p3 + ld1 {v5.16b}, [x0], x1 // p2 + ld1 {v6.16b}, [x0], x1 // p1 + ld1 {v7.16b}, [x0] // p0 + + h264_loop_filter_luma_intra + + sub x0, x0, x1, lsl #1 + st1 {v5.16b}, [x0], x1 // p2 + st1 {v6.16b}, [x0], x1 // p1 + st1 {v7.16b}, [x0], x1 // p0 + st1 {v0.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x0] // q2 +9: + ret +endfunc + +function x264_deblock_h_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, #4 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + h264_loop_filter_luma_intra + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + sub x0, x0, x1, lsl #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 +9: + ret +endfunc + .macro h264_loop_filter_chroma dup v22.16b, w2 // alpha uxtl v24.8h, v24.8b @@ -247,6 +443,7 @@ h264_loop_filter_start sub x0, x0, #4 +deblock_h_chroma: ld1 {v18.d}[0], [x0], x1 ld1 {v16.d}[0], [x0], x1 ld1 {v0.d}[0], [x0], x1 @@ -275,6 +472,229 @@ ret endfunc +function x264_deblock_h_chroma_422_neon, export=1 + add x5, x0, x1 + add x1, x1, x1 + mov x7, x30 + bl X(x264_deblock_h_chroma_neon) + ldr w6, [x4] + mov x30, x7 + sub x0, x5, #4 + mov v24.s[0], w6 + b deblock_h_chroma +endfunc + +.macro h264_loop_filter_chroma8 + dup v22.8b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0) + uxtl v4.8h, v17.8b + uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0) + usubw v4.8h, v4.8h, v16.8b + sli v24.8h, v24.8h, #8 + shl v4.8h, v4.8h, #2 + uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0) + uaddw v4.8h, v4.8h, v18.8b + cmhi v26.8b, v22.8b, v26.8b // < alpha + usubw v4.8h, v4.8h, v19.8b + dup v22.8b, w3 // beta + rshrn v4.8b, v4.8h, #3 + cmhi v28.8b, v22.8b, v28.8b // < beta + cmhi v30.8b, v22.8b, v30.8b // < beta + smin v4.8b, v4.8b, v24.8b + neg v25.8b, v24.8b + and v26.8b, v26.8b, v28.8b + smax v4.8b, v4.8b, v25.8b + and v26.8b, v26.8b, v30.8b + uxtl v22.8h, v17.8b + and v4.8b, v4.8b, v26.8b + uxtl v28.8h, v16.8b + saddw v28.8h, v28.8h, v4.8b + ssubw v22.8h, v22.8h, v4.8b + sqxtun v16.8b, v28.8h + 
sqxtun v17.8b, v22.8h +.endm + +function x264_deblock_h_chroma_mbaff_neon, export=1 + h264_loop_filter_start + + sub x4, x0, #4 + sub x0, x0, #2 + + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4] + + transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31 + + h264_loop_filter_chroma8 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0] + + ret +endfunc + +.macro h264_loop_filter_chroma_intra, width=16 + uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0) + uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0) + uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0) + cmhi v26.16b, v30.16b, v26.16b // < alpha + cmhi v27.16b, v31.16b, v27.16b // < beta + cmhi v28.16b, v31.16b, v28.16b // < beta + and v26.16b, v26.16b, v27.16b + and v26.16b, v26.16b, v28.16b + + ushll v4.8h, v18.8b, #1 + ushll v6.8h, v19.8b, #1 +.ifc \width, 16 + ushll2 v5.8h, v18.16b, #1 + ushll2 v7.8h, v19.16b, #1 + uaddl2 v21.8h, v16.16b, v19.16b + uaddl2 v23.8h, v17.16b, v18.16b +.endif + uaddl v20.8h, v16.8b, v19.8b + uaddl v22.8h, v17.8b, v18.8b + add v20.8h, v20.8h, v4.8h // mlal? + add v22.8h, v22.8h, v6.8h +.ifc \width, 16 + add v21.8h, v21.8h, v5.8h + add v23.8h, v23.8h, v7.8h +.endif + uqrshrn v24.8b, v20.8h, #2 + uqrshrn v25.8b, v22.8h, #2 +.ifc \width, 16 + uqrshrn2 v24.16b, v21.8h, #2 + uqrshrn2 v25.16b, v23.8h, #2 +.endif + bit v16.16b, v24.16b, v26.16b + bit v17.16b, v25.16b, v26.16b +.endm + +function x264_deblock_v_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, x1, lsl #1 + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x0], x1 + ld1 {v19.16b}, [x0] + + h264_loop_filter_chroma_intra + + sub x0, x0, x1, lsl #1 + st1 {v16.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_intra_mbaff_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.8b}, [x4], x1 + ld1 {v16.8b}, [x4], x1 + ld1 {v17.8b}, [x4], x1 + ld1 {v19.8b}, [x4], x1 + + transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra, width=8 + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ret +endfunc + +function x264_deblock_h_chroma_422_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x4, x0, #4 + sub x0, x0, #2 + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 
{v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ld1 {v18.d}[0], [x4], x1 + ld1 {v16.d}[0], [x4], x1 + ld1 {v17.d}[0], [x4], x1 + ld1 {v19.d}[0], [x4], x1 + ld1 {v18.d}[1], [x4], x1 + ld1 {v16.d}[1], [x4], x1 + ld1 {v17.d}[1], [x4], x1 + ld1 {v19.d}[1], [x4], x1 + + transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29 + + h264_loop_filter_chroma_intra + + st2 {v16.h,v17.h}[0], [x0], x1 + st2 {v16.h,v17.h}[1], [x0], x1 + st2 {v16.h,v17.h}[2], [x0], x1 + st2 {v16.h,v17.h}[3], [x0], x1 + st2 {v16.h,v17.h}[4], [x0], x1 + st2 {v16.h,v17.h}[5], [x0], x1 + st2 {v16.h,v17.h}[6], [x0], x1 + st2 {v16.h,v17.h}[7], [x0], x1 + + ret +endfunc //static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], // int8_t ref[2][X264_SCAN8_LUMA_SIZE],
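All of the new deblocking paths share the per-pixel gate that the cmhi comparisons implement: an edge position is filtered only if |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, and the strong intra luma path additionally requires |p0-q0| < (alpha>>2)+2 (plus |p2-p0| and |q2-q0| < beta for the three-tap variants). A scalar sketch of the basic decision for one pixel position:

#include <stdlib.h>

/* Decide whether one position on a block edge gets filtered, and whether the
 * strong intra luma filter may be considered for it. */
int deblock_filter_decision(int p1, int p0, int q0, int q1,
                            int alpha, int beta, int *strong)
{
    int filter = abs(p0 - q0) < alpha &&
                 abs(p1 - p0) < beta  &&
                 abs(q1 - q0) < beta;
    if (strong)
        *strong = filter && abs(p0 - q0) < ((alpha >> 2) + 2);
    return filter;
}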
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -1253,6 +1253,34 @@ ret endfunc +function x264_plane_copy_neon, export=1 + add x8, x4, #15 + and x4, x8, #~15 + sub x1, x1, x4 + sub x3, x3, x4 +1: + mov w8, w4 +16: + tst w8, #16 + b.eq 32f + subs w8, w8, #16 + ldr q0, [x2], #16 + str q0, [x0], #16 + b.eq 0f +32: + subs w8, w8, #32 + ldp q0, q1, [x2], #32 + stp q0, q1, [x0], #32 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + + ret +endfunc + function x264_plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 @@ -1363,3 +1391,279 @@ ret endfunc + +.macro integral4h p1, p2 + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v4.8h, v2.8b, v3.8b + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, v5.8h +.endm + +function integral_init4h_neon, export=1 + sub x3, x0, x2 + ld1 {v6.8b,v7.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v5.8h}, [x3], #16 + integral4h v6, v7 + ld1 {v6.8b}, [x1], #8 + ld1 {v5.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral4h v7, v6 + ld1 {v7.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +.macro integral8h p1, p2, s + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + ext v4.8b, \p1\().8b, \p2\().8b, #4 + ext v5.8b, \p1\().8b, \p2\().8b, #5 + ext v6.8b, \p1\().8b, \p2\().8b, #6 + ext v7.8b, \p1\().8b, \p2\().8b, #7 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v2.8h, v2.8b, v3.8b + uaddl v4.8h, v4.8b, v5.8b + uaddl v6.8h, v6.8b, v7.8b + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, \s\().8h +.endm + +function integral_init8h_neon, export=1 + sub x3, x0, x2 + ld1 {v16.8b,v17.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v18.8h}, [x3], #16 + integral8h v16, v17, v18 + ld1 {v16.8b}, [x1], #8 + ld1 {v18.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral8h v17, v16, v18 + ld1 {v17.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +function integral_init4v_neon, export=1 + mov x3, x0 + add x4, x0, x2, lsl #3 + add x8, x0, x2, lsl #4 + sub x2, x2, #8 + ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 + ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 +1: + subs x2, x2, #16 + ld1 {v24.8h,v25.8h}, [x4], #32 + ext v0.16b, v20.16b, v21.16b, #8 + ext v1.16b, v21.16b, v22.16b, #8 + ext v2.16b, v16.16b, v17.16b, #8 + ext v3.16b, v17.16b, v18.16b, #8 + sub v24.8h, v24.8h, v20.8h + sub v25.8h, v25.8h, v21.8h + add v0.8h, v0.8h, v20.8h + add v1.8h, v1.8h, v21.8h + add v2.8h, v2.8h, v16.8h + add v3.8h, v3.8h, v17.8h + st1 {v24.8h}, [x1], #16 + st1 {v25.8h}, [x1], #16 + mov v20.16b, v22.16b + mov v16.16b, v18.16b + sub v0.8h, v2.8h, v0.8h + sub v1.8h, v3.8h, v1.8h + ld1 {v21.8h,v22.8h}, [x3], #32 + ld1 {v17.8h,v18.8h}, [x8], #32 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function integral_init8v_neon, export=1 + add x2, x0, x1, lsl #4 + sub x1, x1, #8 + ands x3, x1, #16 - 1 + b.eq 1f + subs x1, x1, #8 + ld1 {v0.8h}, [x0] + ld1 {v2.8h}, [x2], #16 + sub v4.8h, v2.8h, v0.8h + st1 {v4.8h}, [x0], #16 + b.le 2f +1: + subs x1, x1, #16 + ld1 {v0.8h,v1.8h}, [x0] + 
ld1 {v2.8h,v3.8h}, [x2], #32 + sub v4.8h, v2.8h, v0.8h + sub v5.8h, v3.8h, v1.8h + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function x264_mbtree_propagate_cost_neon, export=1 + ld1r {v5.4s}, [x5] +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 + ld1 {v2.8h}, [x2], #16 + ld1 {v3.8h}, [x3], #16 + ld1 {v4.8h}, [x4], #16 + bic v3.8h, #0xc0, lsl #8 + umin v3.8h, v2.8h, v3.8h + umull v20.4s, v2.4h, v4.4h // propagate_intra + umull2 v21.4s, v2.8h, v4.8h // propagate_intra + usubl v22.4s, v2.4h, v3.4h // propagate_num + usubl2 v23.4s, v2.8h, v3.8h // propagate_num + uxtl v26.4s, v2.4h // propagate_denom + uxtl2 v27.4s, v2.8h // propagate_denom + uxtl v24.4s, v1.4h + uxtl2 v25.4s, v1.8h + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v26.4s, v26.4s + ucvtf v27.4s, v27.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + frecpe v28.4s, v26.4s + frecpe v29.4s, v27.4s + ucvtf v24.4s, v24.4s + ucvtf v25.4s, v25.4s + frecps v30.4s, v28.4s, v26.4s + frecps v31.4s, v29.4s, v27.4s + fmla v24.4s, v20.4s, v5.4s // propagate_amount + fmla v25.4s, v21.4s, v5.4s // propagate_amount + fmul v28.4s, v28.4s, v30.4s + fmul v29.4s, v29.4s, v31.4s + fmul v16.4s, v24.4s, v22.4s + fmul v17.4s, v25.4s, v23.4s + fmul v18.4s, v16.4s, v28.4s + fmul v19.4s, v17.4s, v29.4s + fcvtns v20.4s, v18.4s + fcvtns v21.4s, v19.4s + sqxtn v0.4h, v20.4s + sqxtn2 v0.8h, v21.4s + st1 {v0.8h}, [x0], #16 + b.ge 8b + ret +endfunc + +const pw_0to15, align=5 + .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +endconst + +function x264_mbtree_propagate_list_internal_neon, export=1 + movrel x11, pw_0to15 + dup v31.8h, w4 // bipred_weight + movi v30.8h, #0xc0, lsl #8 + ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y + movi v28.4s, #4//, lsl #16 + movi v27.8h, #31 + movi v26.8h, #32 + dup v24.8h, w5 // mb_y + zip1 v29.8h, v29.8h, v24.8h +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 // propagate_amount + ld1 {v2.8h}, [x2], #16 // lowres_cost + and v2.16b, v2.16b, v30.16b + cmeq v25.8h, v2.8h, v30.8h + umull v16.4s, v1.4h, v31.4h + umull2 v17.4s, v1.8h, v31.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 ) + // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 + ld1 {v4.8h,v5.8h}, [x0], #32 + sshr v6.8h, v4.8h, #5 + sshr v7.8h, v5.8h, #5 + add v6.8h, v6.8h, v29.8h + add v29.8h, v29.8h, v28.8h + add v7.8h, v7.8h, v29.8h + add v29.8h, v29.8h, v28.8h + st1 {v6.8h,v7.8h}, [x3], #32 + and v4.16b, v4.16b, v27.16b + and v5.16b, v5.16b, v27.16b + uzp1 v6.8h, v4.8h, v5.8h // x & 31 + uzp2 v7.8h, v4.8h, v5.8h // y & 31 + sub v4.8h, v26.8h, v6.8h // 32 - (x & 31) + sub v5.8h, v26.8h, v7.8h // 32 - (y & 31) + mul v19.8h, v6.8h, v7.8h // idx3weight = y*x; + mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x); + mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x; + mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ; + umull v6.4s, v19.4h, v25.4h + umull2 v7.4s, v19.8h, v25.8h + umull v4.4s, v18.4h, v25.4h + umull2 v5.4s, v18.8h, v25.8h + umull v2.4s, v17.4h, v25.4h + umull2 v3.4s, v17.8h, v25.8h + umull v0.4s, v16.4h, v25.4h + umull2 v1.4s, v16.8h, v25.8h + rshrn v19.4h, v6.4s, #10 + rshrn2 v19.8h, v7.4s, #10 + rshrn v18.4h, v4.4s, #10 + rshrn2 v18.8h, v5.4s, #10 + rshrn v17.4h, v2.4s, #10 + rshrn2 v17.8h, v3.4s, #10 + rshrn v16.4h, v0.4s, #10 + rshrn2 v16.8h, v1.4s, #10 + zip1 v0.8h, v16.8h, v17.8h + zip2 v1.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip2 v3.8h, v18.8h, v19.8h + st1 {v0.8h,v1.8h}, [x3], #32 + st1 
{v2.8h,v3.8h}, [x3], #32 + b.ge 8b + ret +endfunc + +function x264_memcpy_aligned_neon, export=1 + tst x2, #16 + b.eq 32f + sub x2, x2, #16 + ldr q0, [x1], #16 + str q0, [x0], #16 +32: + tst x2, #32 + b.eq 640f + sub x2, x2, #32 + ldp q0, q1, [x1], #32 + stp q0, q1, [x0], #32 +640: + cbz x2, 1f +64: + subs x2, x2, #64 + ldp q0, q1, [x1, #32] + ldp q2, q3, [x1], #64 + stp q0, q1, [x0, #32] + stp q2, q3, [x0], #64 + b.gt 64b +1: + ret +endfunc + +function x264_memzero_aligned_neon, export=1 + movi v0.16b, #0 + movi v1.16b, #0 +1: + subs x1, x1, #128 + stp q0, q1, [x0, #96] + stp q0, q1, [x0, #64] + stp q0, q1, [x0, #32] + stp q0, q1, [x0], 128 + b.gt 1b + ret +endfunc
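Note: the hunk above adds aarch64 NEON versions of plane_copy, the integral_init helpers, the mbtree propagation kernels, and aligned memcpy/memzero. As a reading aid, here is a minimal scalar sketch of what x264_plane_copy_neon computes; the width round-up and the implied row padding are inferred from the (w+15) & ~15 prologue and are assumptions about the caller, not something stated in the diff.

    #include <stdint.h>
    #include <string.h>

    /* Each of h rows copies w bytes, with w rounded up to a multiple of 16,
     * so the destination and source rows are assumed to be padded enough to
     * tolerate the over-copy. */
    static void plane_copy_ref( uint8_t *dst, intptr_t i_dst,
                                uint8_t *src, intptr_t i_src, int w, int h )
    {
        w = (w + 15) & ~15;                          /* same rounding as the asm prologue */
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            memcpy( dst, src, w );
    }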
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * mc-c.c: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -48,6 +49,8 @@ void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -89,8 +92,14 @@ void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t ); +void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init8v_neon( uint16_t *, intptr_t ); void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); + #if !HIGH_BIT_DEPTH static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) { @@ -132,9 +141,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -142,13 +148,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -168,13 +174,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -199,6 +205,89 @@ int height, int16_t *buf ); #endif // !HIGH_BIT_DEPTH +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) +#define CLIP_ADD2(s,x)\ +do\ +{\ + CLIP_ADD((s)[0], (x)[0]);\ + CLIP_ADD((s)[1], (x)[1]);\ +} while(0) + +void x264_mbtree_propagate_list_internal_neon( int16_t 
(*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int16_t *output, + int bipred_weight, int mb_y, + int len ); + +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs, + int16_t (*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int bipred_weight, int mb_y, + int len, int list ) +{ + int16_t *current = h->scratch_buffer2; + + x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount, + lowres_costs, current, + bipred_weight, mb_y, len ); + + unsigned stride = h->mb.i_mb_stride; + unsigned width = h->mb.i_mb_width; + unsigned height = h->mb.i_mb_height; + + for( unsigned i = 0; i < len; current += 32 ) + { + int end = X264_MIN( i+8, len ); + for( ; i < end; i++, current += 2 ) + { + if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) ) + continue; + + unsigned mbx = current[0]; + unsigned mby = current[1]; + unsigned idx0 = mbx + mby * stride; + unsigned idx2 = idx0 + stride; + + /* Shortcut for the simple/common case of zero MV */ + if( !M32( mvs[i] ) ) + { + CLIP_ADD( ref_costs[idx0], current[16] ); + continue; + } + + if( mbx < width-1 && mby < height-1 ) + { + CLIP_ADD2( ref_costs+idx0, current+16 ); + CLIP_ADD2( ref_costs+idx2, current+32 ); + } + else + { + /* Note: this takes advantage of unsigned representation to + * catch negative mbx/mby. */ + if( mby < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx0+0], current[16] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx0+1], current[17] ); + } + if( mby+1 < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx2+0], current[32] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx2+1], current[33] ); + } + } + } + } +} + +#undef CLIP_ADD +#undef CLIP_ADD2 + void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) { #if !HIGH_BIT_DEPTH @@ -217,6 +306,7 @@ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + pf->plane_copy = x264_plane_copy_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; @@ -245,5 +335,16 @@ pf->get_ref = get_ref_neon; pf->hpel_filter = x264_hpel_filter_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; + + pf->integral_init4h = integral_init4h_neon; + pf->integral_init8h = integral_init8h_neon; + pf->integral_init4v = integral_init4v_neon; + pf->integral_init8v = integral_init8v_neon; + + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon; + + pf->memcpy_aligned = x264_memcpy_aligned_neon; + pf->memzero_aligned = x264_memzero_aligned_neon; #endif // !HIGH_BIT_DEPTH }
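Note: CLIP_ADD/CLIP_ADD2 above implement the saturating accumulation used when the C wrapper scatters the NEON mbtree results back into ref_costs; keeping the bounds checks in C lets the assembly kernel produce only the per-block weights. A plain-function equivalent of the macro, for illustration only:

    #include <stdint.h>

    /* Mirrors CLIP_ADD: accumulate and clamp to the 15-bit cost range. */
    static inline void clip_add( uint16_t *s, int x )
    {
        int v = *s + x;
        *s = v < 0x7fff ? v : 0x7fff;                /* X264_MIN( (s)+(x), (1<<15)-1 ) */
    }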
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc.h
Changed
@@ -1,7 +1,9 @@ /***************************************************************************** * mc.h: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -114,6 +114,7 @@ SAD_FUNC 4, 4 SAD_FUNC 4, 8 +SAD_FUNC 4, 16 SAD_FUNC 8, 4 SAD_FUNC 8, 8 SAD_FUNC 8, 16 @@ -148,7 +149,7 @@ \first v17.8h, v2.8b, v0.8b ld1 {v3.8b}, [x3], x5 ld1 {v1.8b}, [x1], x5 - \first v18.8h, v3.8b, v0.8b + \first v18.8h, v3.8b, v0.8b uabal v16.8h, v1.8b, v5.8b ld1 {v2.8b}, [x2], x5 ld1 {v3.8b}, [x3], x5 @@ -248,6 +249,56 @@ SAD_X_FUNC 4, 16, 16 +function x264_pixel_vsad_neon, export=1 + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 + uabdl v6.8h, v0.8b, v1.8b + uabdl2 v7.8h, v0.16b, v1.16b + b.le 2f +1: + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + uabal v6.8h, v1.8b, v0.8b + uabal2 v7.8h, v1.16b, v0.16b + ld1 {v1.16b}, [x0], x1 + b.lt 2f + uabal v6.8h, v0.8b, v1.8b + uabal2 v7.8h, v0.16b, v1.16b + b.gt 1b +2: + add v5.8h, v6.8h, v7.8h + uaddlv s0, v5.8h + fmov w0, s0 + ret +endfunc + +function x264_pixel_asd8_neon, export=1 + sub w4, w4, #2 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + usubl v16.8h, v0.8b, v1.8b +1: + subs w4, w4, #2 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + add v16.8h, v16.8h, v17.8h + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + add v16.8h, v16.8h, v18.8h + b.gt 1b + usubl v17.8h, v2.8b, v3.8b + add v16.8h, v16.8h, v17.8h + saddlv s0, v16.8h + abs v0.2s, v0.2s + fmov w0, s0 + ret +endfunc + .macro SSD_START_4 ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 @@ -343,12 +394,84 @@ SSD_FUNC 4, 4 SSD_FUNC 4, 8 +SSD_FUNC 4, 16 SSD_FUNC 8, 4 SSD_FUNC 8, 8 SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 + +function x264_pixel_ssd_nv12_core_neon, export=1 + sxtw x8, w4 + add x8, x8, #8 + and x8, x8, #~15 + movi v6.2d, #0 + movi v7.2d, #0 + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +1: + subs w8, w4, #16 + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smull v20.4s, v16.4h, v16.4h + smull v21.4s, v17.4h, v17.4h + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + + b.lt 4f + b.eq 3f +2: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h + + subs w8, w8, #16 + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smlal v20.4s, v16.4h, v16.4h + smlal v21.4s, v17.4h, v17.4h + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + b.lt 4f + + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + b.gt 2b +3: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h +4: + subs w5, w5, #1 + uaddw v6.2d, v6.2d, v20.2s + uaddw v7.2d, v7.2d, v21.2s + add x0, x0, x1 + add x2, x2, x3 + uaddw2 v6.2d, v6.2d, v20.4s + uaddw2 v7.2d, v7.2d, v21.4s + b.gt 1b + + addp v6.2d, v6.2d, v7.2d + st1 {v6.d}[0], [x6] + st1 {v6.d}[1], [x7] + + ret +endfunc + .macro 
pixel_var_8 h function x264_pixel_var_8x\h\()_neon, export=1 ld1 {v16.8b}, [x0], x1 @@ -800,10 +923,65 @@ b x264_satd_8x4v_8x8h_neon endfunc +function x264_pixel_satd_4x16_neon, export=1 + mov x4, x30 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + usubl v19.8h, v6.8b, v7.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v20.8h, v0.8b, v1.8b + usubl v21.8h, v2.8b, v3.8b + usubl v22.8h, v4.8b, v5.8b + usubl v23.8h, v6.8b, v7.8b + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + bl x264_satd_8x4v_8x8h_neon + + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc function x264_pixel_sa8d_8x8_neon, export=1 mov x4, x30 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon add v0.8h, v0.8h, v1.8h uaddlv s0, v0.8h mov w0, v0.s[0] @@ -814,20 +992,20 @@ function x264_pixel_sa8d_16x16_neon, export=1 mov x4, x30 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uaddlp v30.4s, v0.8h uaddlp v31.4s, v1.8h - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h sub x0, x0, x1, lsl #4 sub x2, x2, x3, lsl #4 add x0, x0, #8 add x2, x2, #8 - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h - bl x264_sa8d_8x8_neon + bl pixel_sa8d_8x8_neon uadalp v30.4s, v0.8h uadalp v31.4s, v1.8h add v0.4s, v30.4s, v31.4s @@ -838,13 +1016,48 @@ ret x4 endfunc -function x264_sa8d_8x8_neon +.macro sa8d_satd_8x8 satd= +function pixel_sa8d_\satd\()8x8_neon load_diff_fly_8x8 SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h +.ifc \satd, satd_ + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h + SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h + SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h + + transpose v4.4s, v6.4s, v24.4s, v26.4s + transpose v5.4s, v7.4s, v25.4s, v27.4s + transpose v24.4s, v26.4s, v0.4s, v2.4s + transpose v25.4s, v27.4s, v1.4s, v3.4s + + abs v0.8h, v4.8h + abs v1.8h, v5.8h + abs v2.8h, v6.8h + abs v3.8h, v7.8h + abs v4.8h, v24.8h + abs v5.8h, v25.8h + abs v6.8h, v26.8h + abs v7.8h, v27.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + add v26.8h, v0.8h, v1.8h + add v27.8h, v2.8h, v3.8h +.endif + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h @@ -855,20 +1068,20 @@ transpose v22.8h, v23.8h, v18.8h, 
v19.8h transpose v6.8h, v7.8h, v2.8h, v3.8h - SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h + SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h - SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h + SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h - transpose v20.4s, v22.4s, v28.4s, v0.4s - transpose v21.4s, v23.4s, v29.4s, v1.4s - transpose v16.4s, v18.4s, v24.4s, v26.4s - transpose v17.4s, v19.4s, v25.4s, v27.4s + transpose v20.4s, v22.4s, v2.4s, v0.4s + transpose v21.4s, v23.4s, v3.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v4.4s + transpose v17.4s, v19.4s, v25.4s, v5.4s SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h - SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h - SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h transpose v16.2d, v20.2d, v0.2d, v4.2d transpose v17.2d, v21.2d, v1.2d, v5.2d @@ -894,7 +1107,47 @@ ret endfunc +.endm +sa8d_satd_8x8 +sa8d_satd_8x8 satd_ + +function x264_pixel_sa8d_satd_16x16_neon, export=1 + mov x4, x30 + bl pixel_sa8d_satd_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + uaddlp v28.4s, v26.8h + uaddlp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #8 + add x2, x2, #8 + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + add v0.4s, v30.4s, v31.4s // sa8d + add v1.4s, v28.4s, v29.4s // satd + addv s0, v0.4s + addv s1, v1.4s + urshr v0.4s, v0.4s, #1 + fmov w0, s0 + fmov w1, s1 + add x0, x0, x1, lsl #32 + ret x4 +endfunc .macro HADAMARD_AC w h function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
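Note: the new x264_pixel_sa8d_satd_16x16_neon computes both metrics in one pass and packs them into a single 64-bit return value; the closing "add x0, x0, x1, lsl #32" puts the rounded sa8d sum in the low half and the satd sum in the high half, matching the uint64_t prototype added to pixel.h below. A hypothetical caller (pix1/pix2, the strides and the helper name are placeholders) would unpack it like this:

    #include <stdint.h>

    /* Prototype comes from common/aarch64/pixel.h. */
    static void sa8d_satd_example( uint8_t *pix1, intptr_t i_stride1,
                                   uint8_t *pix2, intptr_t i_stride2,
                                   uint32_t *sa8d, uint32_t *satd )
    {
        uint64_t v = x264_pixel_sa8d_satd_16x16_neon( pix1, i_stride1, pix2, i_stride2 );
        *sa8d = (uint32_t) v;                        /* low 32 bits, already (sum+1)>>1 rounded */
        *satd = (uint32_t)(v >> 32);                 /* high 32 bits */
    }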
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * pixel.h: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,7 @@ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args;\ @@ -47,8 +49,14 @@ DECL_X1( satd, neon ) DECL_X1( ssd, neon ) + +void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); + +int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); + int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); +uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); @@ -66,4 +74,6 @@ int sums[2][4] ); float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); +int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + #endif
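Note: x264_pixel_vsad_neon, declared above, sums absolute differences between each 16-pixel row and the row directly above it. A scalar sketch assuming the same prototype:

    #include <stdint.h>
    #include <stdlib.h>

    static int vsad_ref( uint8_t *src, intptr_t stride, int height )
    {
        int score = 0;
        for( int y = 1; y < height; y++ )            /* every pair of adjacent rows */
            for( int x = 0; x < 16; x++ )
                score += abs( src[y*stride + x] - src[(y-1)*stride + x] );
        return score;
    }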
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -436,14 +436,25 @@ endfunc function x264_predict_8x8c_dc_left_neon, export=1 - sub x2, x0, #1 + ldrb w2, [x0, #0 * FDEC_STRIDE - 1] + ldrb w3, [x0, #1 * FDEC_STRIDE - 1] + ldrb w4, [x0, #2 * FDEC_STRIDE - 1] + ldrb w5, [x0, #3 * FDEC_STRIDE - 1] mov x1, #FDEC_STRIDE - ldcol.8 v0, x2, x1 - uaddlp v0.4h, v0.8b - addp v0.4h, v0.4h, v0.4h + add w2, w2, w3 + add w3, w4, w5 + ldrb w6, [x0, #4 * FDEC_STRIDE - 1] + ldrb w7, [x0, #5 * FDEC_STRIDE - 1] + ldrb w8, [x0, #6 * FDEC_STRIDE - 1] + ldrb w9, [x0, #7 * FDEC_STRIDE - 1] + add w6, w6, w7 + add w7, w8, w9 + add w2, w2, w3 + add w6, w6, w7 + dup v0.8h, w2 + dup v1.8h, w6 rshrn v0.8b, v0.8h, #2 - dup v1.8b, v0.b[1] - dup v0.8b, v0.b[0] + rshrn v1.8b, v1.8h, #2 b pred8x8c_dc_end endfunc @@ -546,6 +557,223 @@ endfunc +.macro loadsum4 wd, t1, t2, t3, x, idx + ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1] + ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1] + ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1] + ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1] + add \wd, \wd, \t1 + add \t1, \t2, \t3 + add \wd, \wd, \t1 +.endm + +function x264_predict_8x16c_h_neon, export=1 + sub x2, x0, #1 + add x3, x0, #FDEC_STRIDE - 1 + mov x7, #2 * FDEC_STRIDE + add x1, x0, #FDEC_STRIDE +.rept 4 + ld1r {v0.8b}, [x2], x7 + ld1r {v1.8b}, [x3], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x3], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x1], x7 + st1 {v2.8b}, [x0], x7 + st1 {v3.8b}, [x1], x7 +.endr + ret +endfunc + +function x264_predict_8x16c_v_neon, export=1 + sub x1, x0, #FDEC_STRIDE + mov x2, #2 * FDEC_STRIDE + ld1 {v0.8b}, [x1], x2 +.rept 8 + st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, [x1], x2 +.endr + ret +endfunc + +function x264_predict_8x16c_p_neon, export=1 + movrel x4, p16weight + ld1 {v17.8h}, [x4] + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #4 + sub x3, x3, #1 + + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 + ldcol.8 v1, x3, x1 + add x3, x3, x1 + ldcol.8 v3, x3, x1 + ext v4.8b, v2.8b, v2.8b, #3 + ext v5.8b, v3.8b, v3.8b, #7 + rev32 v0.8b, v0.8b + rev64 v1.8b, v1.8b + + uaddl v4.8h, v5.8b, v4.8b // a * 1/16 + + usubl v2.8h, v2.8b, v0.8b + mul v2.8h, v2.8h, v17.8h + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s // H + + usubl v3.8h, v3.8b, v1.8b + mul v3.8h, v3.8h, v17.8h + saddlp v3.4s, v3.8h + addp v3.4s, v3.4s, v3.4s + addp v3.4s, v3.4s, v3.4s // V + + ext v17.16b, v17.16b, v17.16b, #14 + + shl v4.4h, v4.4h, #4 // a + shl v6.2s, v2.2s, #4 // 16 * H + shl v7.2s, v3.2s, #2 // 4 * V + add v2.2s, v2.2s, v6.2s // 17 * H + add v3.2s, v3.2s, v7.2s // 5 * V + rshrn v2.4h, v2.4s, #5 // b + rshrn v3.4h, v3.4s, #6 // c + + mov v17.h[0], wzr + + sub v4.4h, v4.4h, v2.4h // a - b + shl v6.4h, v2.4h, #1 // 2 * b + add v4.4h, v4.4h, v3.4h // a - b + c + shl v7.4h, v3.4h, #3 // 8 * c + sub v4.4h, v4.4h, v6.4h // a - 3b + c + sub v4.4h, v4.4h, v7.4h // a - 3b - 7c + + mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v4.h[0] // i00 + dup v2.8h, v3.h[0] // c + add v1.8h, v1.8h, v0.8h // pix + {0..7}*b + mov x3, #16 +1: + subs x3, x3, #2 + sqrshrun v4.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + sqrshrun v5.8b, v1.8h, #5 + st1 {v4.8b}, [x0], x1 + add v1.8h, v1.8h, v2.8h + st1 {v5.8b}, [x0], x1 + 
b.ne 1b + ret +endfunc + +function x264_predict_8x16c_dc_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v6.8b}, [x3] + loadsum4 w2, w3, w4, w5, x0, 0 + uaddlp v6.4h, v6.8b + dup v22.8h, w2 // s2 + loadsum4 w6, w7, w8, w9, x0, 4 + addp v6.4h, v6.4h, v6.4h // s0, s1 + dup v23.8h, w6 // s3 + loadsum4 w2, w3, w4, w5, x0, 8 + dup v20.8h, v6.h[0] // s0 + dup v24.8h, w2 // s4 + loadsum4 w6, w7, w8, w9, x0, 12 + dup v21.8h, v6.h[1] // s1 + dup v25.8h, w6 // s5 + + ext v16.16b, v20.16b, v21.16b, #8 + ext v17.16b, v22.16b, v21.16b, #8 + ext v1.16b, v23.16b, v21.16b, #8 + ext v2.16b, v24.16b, v21.16b, #8 + ext v3.16b, v25.16b, v21.16b, #8 + + add v0.8h, v16.8h, v17.8h + add v1.8h, v1.8h, v23.8h + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v25.8h + + rshrn v0.8b, v0.8h, #3 + rshrn v1.8b, v1.8h, #3 + rshrn v2.8b, v2.8h, #3 + rshrn v3.8b, v3.8h, #3 +.irp idx, 0, 1, 2, 3 +.rept 4 + st1 {v\idx\().8b}, [x0], x1 +.endr +.endr + ret +endfunc + +function x264_predict_8x16c_dc_left_neon, export=1 + mov x1, #FDEC_STRIDE + ldrb w2, [x0, # 0 * FDEC_STRIDE - 1] + ldrb w3, [x0, # 1 * FDEC_STRIDE - 1] + ldrb w4, [x0, # 2 * FDEC_STRIDE - 1] + ldrb w5, [x0, # 3 * FDEC_STRIDE - 1] + add w2, w2, w3 + + ldrb w6, [x0, # 4 * FDEC_STRIDE - 1] + add w4, w4, w5 + ldrb w7, [x0, # 5 * FDEC_STRIDE - 1] + add w2, w2, w4 + ldrb w8, [x0, # 6 * FDEC_STRIDE - 1] + ldrb w9, [x0, # 7 * FDEC_STRIDE - 1] + dup v0.8h, w2 + add w6, w6, w7 + rshrn v0.8b, v0.8h, #2 + add w8, w8, w9 + + ldrb w10, [x0, # 8 * FDEC_STRIDE - 1] + ldrb w11, [x0, # 9 * FDEC_STRIDE - 1] + add w6, w6, w8 + ldrb w12, [x0, #10 * FDEC_STRIDE - 1] + ldrb w13, [x0, #11 * FDEC_STRIDE - 1] + dup v1.8h, w6 + add w10, w10, w11 + rshrn v1.8b, v1.8h, #2 + add w12, w12, w13 + + ldrb w2, [x0, #12 * FDEC_STRIDE - 1] + ldrb w3, [x0, #13 * FDEC_STRIDE - 1] + add w10, w10, w12 + ldrb w4, [x0, #14 * FDEC_STRIDE - 1] + ldrb w5, [x0, #15 * FDEC_STRIDE - 1] + dup v2.8h, w10 + add w2, w2, w3 + rshrn v2.8b, v2.8h, #2 + add w4, w4, w5 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + add w2, w2, w4 + st1 {v0.8b}, [x0], x1 + dup v3.8h, w2 + st1 {v0.8b}, [x0], x1 + rshrn v3.8b, v3.8h, #2 + +.irp idx, 1, 2, 3 +.rept 4 + st1 {v\idx\().8b}, [x0], x1 +.endr +.endr + ret +endfunc + +function x264_predict_8x16c_dc_top_neon, export=1 + sub x2, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v0.8b}, [x2] + uaddlp v0.4h, v0.8b + addp v0.4h, v0.4h, v0.4h + rshrn v4.8b, v0.8h, #2 + dup v0.8b, v4.b[0] + dup v1.8b, v4.b[1] + ext v0.8b, v0.8b, v1.8b, #4 +.rept 16 + st1 {v0.8b}, [x0], x1 +.endr + ret +endfunc + + function x264_predict_16x16_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE @@ -603,7 +831,7 @@ .rept 16 st1 {v0.16b}, [x0], x7 .endr - ret + ret endfunc function x264_predict_16x16_p_neon, export=1
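Note: the block above adds the 8x16 chroma intra predictors. For the DC_LEFT variant each group of four left-neighbour samples is averaged with rounding and fills the matching four rows of the 8-wide block. A scalar sketch, assuming x264's FDEC_STRIDE of 32 and its decoded-macroblock buffer layout:

    #include <stdint.h>
    #include <string.h>

    #define FDEC_STRIDE 32                           /* x264's decoded-MB stride (assumption restated) */

    static void predict_8x16c_dc_left_ref( uint8_t *src )
    {
        for( int band = 0; band < 4; band++ )
        {
            int dc = 2;                              /* rounding term of (sum + 2) >> 2 */
            for( int y = 0; y < 4; y++ )
                dc += src[(band*4 + y) * FDEC_STRIDE - 1];
            dc >>= 2;
            for( int y = 0; y < 4; y++ )
                memset( &src[(band*4 + y) * FDEC_STRIDE], dc, 8 );
        }
    }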
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.c: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +36,10 @@ void x264_predict_8x8c_dc_left_neon( uint8_t *src ); void x264_predict_8x8c_p_neon( uint8_t *src ); +void x264_predict_8x16c_dc_left_neon( uint8_t *src ); +void x264_predict_8x16c_dc_top_neon( uint8_t *src ); +void x264_predict_8x16c_p_neon( uint8_t *src ); + void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); @@ -80,6 +85,22 @@ #endif // !HIGH_BIT_DEPTH } + +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_neon; + pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_neon; + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon; + pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_neon; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon; + pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon; +#endif // !HIGH_BIT_DEPTH +} + void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { if (!(cpu&X264_CPU_NEON))
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.h: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,9 @@ void x264_predict_8x8c_dc_neon( uint8_t *src ); void x264_predict_8x8c_h_neon( uint8_t *src ); void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x16c_v_neon( uint8_t *src ); +void x264_predict_8x16c_h_neon( uint8_t *src ); +void x264_predict_8x16c_dc_neon( uint8_t *src ); void x264_predict_16x16_v_neon( uint8_t *src ); void x264_predict_16x16_h_neon( uint8_t *src ); void x264_predict_16x16_dc_neon( uint8_t *src ); @@ -47,6 +51,7 @@ void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ); void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ); #endif /* X264_AARCH64_PREDICT_H */
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -300,6 +301,118 @@ ret endfunc +.macro decimate_score_1x size +function x264_decimate_score\size\()_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0] + movrel x5, X(x264_decimate_table4) + movi v3.16b, #0x01 + sqxtn v0.8b, v0.8h + sqxtn2 v0.16b, v1.8h + abs v2.16b, v0.16b + cmeq v1.16b, v0.16b, #0 + cmhi v2.16b, v2.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v2.8b, v2.8h, #4 + fmov x2, d2 + fmov x1, d1 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f +.ifc \size, 15 + lsr x1, x1, #1 +.endif + rbit x1, x1 +1: + clz x3, x1 + lsr x6, x3, #2 + lsl x1, x1, x3 + ldrb w7, [x5, x6] + cbz x1, 2f + lsl x1, x1, #4 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc +.endm + +decimate_score_1x 15 +decimate_score_1x 16 + +const mask64, align=6 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 +endconst + +function x264_decimate_score64_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0], #32 + ld1 {v2.8h,v3.8h}, [x0], #32 + ld1 {v4.8h,v5.8h}, [x0], #32 + ld1 {v6.8h,v7.8h}, [x0] + movrel x6, mask64 + movi v31.16b, #0x01 + sqxtn v16.8b, v1.8h + sqxtn2 v16.16b, v0.8h + sqxtn v17.8b, v3.8h + sqxtn2 v17.16b, v2.8h + sqxtn v18.8b, v5.8h + sqxtn2 v18.16b, v4.8h + sqxtn v19.8b, v7.8h + sqxtn2 v19.16b, v6.8h + abs v4.16b, v16.16b + abs v5.16b, v17.16b + abs v6.16b, v18.16b + abs v7.16b, v19.16b + ld1 {v30.16b}, [x6] + cmeq v0.16b, v16.16b, #0 + cmeq v1.16b, v17.16b, #0 + cmeq v2.16b, v18.16b, #0 + cmeq v3.16b, v19.16b, #0 + umax v4.16b, v4.16b, v5.16b + umax v6.16b, v6.16b, v7.16b + and v0.16b, v0.16b, v30.16b + and v1.16b, v1.16b, v30.16b + and v2.16b, v2.16b, v30.16b + and v3.16b, v3.16b, v30.16b + umax v4.16b, v4.16b, v6.16b + addp v0.16b, v1.16b, v0.16b + addp v2.16b, v3.16b, v2.16b + cmhi v4.16b, v4.16b, v31.16b + addp v0.16b, v2.16b, v0.16b + shrn v4.8b, v4.8h, #4 + addp v0.16b, v0.16b, v0.16b + fmov x2, d4 + fmov x1, d0 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f + movrel x5, X(x264_decimate_table8) +1: + clz x3, x1 + lsl x1, x1, x3 + ldrb w7, [x5, x3] + cbz x1, 2f + lsl x1, x1, #1 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc + // int coeff_last( int16_t *l ) function x264_coeff_last4_aarch64, export=1 ldr x2, [x0] @@ -384,3 +497,105 @@ sub w0, w3, w2 ret endfunc + +.macro coeff_level_run_start size + add x6, x1, #23 // runlevel->mask + mov w7, #0 + mov w8, #0 + mov w9, #1 + and x6, x6, #~15 + mov w4, #\size - 1 +.endm + +.macro coeff_level_run shift + clz x3, x2 + subs w4, w4, w3, lsr #\shift + str w4, [x1], #4 +1: + ldrh w5, [x0, x4, lsl #1] + strh w5, [x6], #2 + add w7, w7, #1 + lsl w10, w9, w4 + orr w8, w8, w10 + b.le 2f + add w3, w3, #1 << \shift + sub w4, w4, #1 + and x3, x3, #~((1 << \shift) - 1) + lsl x2, x2, x3 + clz x3, x2 + subs w4, w4, w3, lsr #\shift + b.ge 1b +2: + str w8, [x1] + mov w0, w7 +.endm + +function x264_coeff_level_run4_aarch64, export=1 + ldr x2, [x0] + + coeff_level_run_start 4 + + coeff_level_run 4 + + 
ret +endfunc + +.macro X264_COEFF_LEVEL_RUN size +function x264_coeff_level_run\size\()_neon, export=1 +.if \size == 15 + sub x0, x0, #2 +.endif +.if \size < 15 + .equ shiftw, 3 + ld1 {v0.8h}, [x0] + uqxtn v0.8b, v0.8h + cmtst v0.8b, v0.8b, v0.8b +.else + .equ shiftw, 2 + ld1 {v0.8h,v1.8h}, [x0] + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 +.endif + fmov x2, d0 +.if \size == 15 + add x0, x0, #2 +.endif + + coeff_level_run_start \size + + coeff_level_run shiftw + + ret +endfunc +.endm + +X264_COEFF_LEVEL_RUN 8 +X264_COEFF_LEVEL_RUN 15 +X264_COEFF_LEVEL_RUN 16 + +function x264_denoise_dct_neon, export=1 +1: subs w3, w3, #16 + ld1 {v0.8h,v1.8h}, [x0] + ld1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1] + abs v16.8h, v0.8h + abs v17.8h, v1.8h + ld1 {v2.8h,v3.8h}, [x2], #32 + cmlt v18.8h, v0.8h, #0 + cmlt v19.8h, v1.8h, #0 + uaddw v4.4s, v4.4s, v16.4h + uaddw2 v5.4s, v5.4s, v16.8h + uqsub v20.8h, v16.8h, v2.8h + uqsub v21.8h, v17.8h, v3.8h + uaddw v6.4s, v6.4s, v17.4h + uaddw2 v7.4s, v7.4s, v17.8h + neg v22.8h, v20.8h + neg v23.8h, v21.8h + bsl v18.16b, v22.16b, v20.16b + bsl v19.16b, v23.16b, v21.16b + st1 {v4.4s,v5.4s,v6.4s,v7.4s}, [x1], #64 + st1 {v18.8h,v19.8h}, [x0], #32 + b.gt 1b + ret +endfunc
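Note: the decimate_score kernels above classify each coefficient as zero, +/-1 or larger and then walk the zero-run lengths through the decimate table. A simplified scalar sketch of the semantics (run_table stands for x264_decimate_table4 or x264_decimate_table8):

    #include <stdint.h>

    static int decimate_score_ref( const int16_t *dct, int n, const uint8_t *run_table )
    {
        int idx = n - 1, score = 0;
        while( idx >= 0 && dct[idx] == 0 )
            idx--;                                   /* skip trailing zeros */
        while( idx >= 0 )
        {
            if( dct[idx] < -1 || dct[idx] > 1 )
                return 9;                            /* any |coef| > 1 forces the maximum score */
            idx--;
            int run = 0;
            while( idx >= 0 && dct[idx] == 0 )
            {
                idx--;
                run++;
            }
            score += run_table[run];                 /* cost of the zero-run before this coefficient */
        }
        return score;
    }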
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,10 +39,21 @@ void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +int x264_decimate_score15_neon( int16_t * ); +int x264_decimate_score16_neon( int16_t * ); +int x264_decimate_score64_neon( int16_t * ); + int x264_coeff_last4_aarch64( int16_t * ); int x264_coeff_last8_aarch64( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * ); +int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * ); + +void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); + #endif
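Note: x264_denoise_dct_neon, declared above, accumulates each coefficient's magnitude into sum[], subtracts offset[] with saturation at zero, and restores the original sign (the uqsub/bsl sequence in the assembly). A scalar sketch, assuming the 8-bit-depth typedefs where dctcoef is int16_t and udctcoef is uint16_t:

    #include <stdint.h>
    #include <stdlib.h>

    static void denoise_dct_ref( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
    {
        for( int i = 0; i < size; i++ )
        {
            int v = abs( dct[i] );
            sum[i] += v;                             /* running magnitude statistics */
            v -= offset[i];
            if( v < 0 )
                v = 0;                               /* saturating subtract, like uqsub */
            dct[i] = dct[i] < 0 ? -v : v;            /* reapply the original sign */
        }
    }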
x264-snapshot-20141218-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/asm.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: arm utility macros ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * David Conrad <lessen42@gmail.com>
x264-snapshot-20141218-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu-a.S: arm cpu detection ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * dct-a.S: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: arm deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -1566,6 +1566,30 @@ pop {r4-r7, pc} endfunc +function x264_plane_copy_swap_neon + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + add lr, r4, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr, lsl #1 +1: + vld1.8 {q0, q1}, [r2]! + subs lr, lr, #16 + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vst1.8 {q0, q1}, [r0]! + bgt 1b + + subs r5, r5, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r4 + bgt 1b + + pop {r4-r5, pc} +endfunc + function x264_store_interleave_chroma_neon push {lr} ldr lr, [sp, #4]
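Note: x264_plane_copy_swap_neon above copies an interleaved chroma plane while exchanging the two bytes of every pair (the vrev16.8 step), which is what converting between U,V and V,U ordering needs. A scalar sketch; here w counts byte pairs and is rounded up to a multiple of 16 by the NEON code, so padded rows are assumed as with the other plane_copy variants:

    #include <stdint.h>

    static void plane_copy_swap_ref( uint8_t *dst, intptr_t i_dst,
                                     uint8_t *src, intptr_t i_src, int w, int h )
    {
        for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
            for( int x = 0; x < 2*w; x += 2 )
            {
                dst[x]   = src[x+1];                 /* swap the two bytes of each pair */
                dst[x+1] = src[x];
            }
    }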
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * @@ -57,6 +57,8 @@ void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -136,9 +138,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -146,13 +145,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -172,13 +171,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -243,6 +242,7 @@ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + pf->plane_copy_swap = x264_plane_copy_swap_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com>
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.c: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com> @@ -54,6 +54,8 @@ void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); + /**************************************************************************** * x264_nal_encode: ****************************************************************************/ @@ -142,4 +144,8 @@ } #endif #endif +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + pf->nal_escape = x264_nal_escape_neon; +#endif }
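Note: the NEON nal_escape hooked up above applies the usual H.264 emulation-prevention rule inside the payload. A simplified scalar sketch of that rule (x264's own C fallback is structured differently but produces the same output):

    #include <stdint.h>

    static uint8_t *nal_escape_ref( uint8_t *dst, uint8_t *src, uint8_t *end )
    {
        while( src < end )
        {
            if( src + 2 < end && !src[0] && !src[1] && src[2] <= 3 )
            {
                *dst++ = 0;
                *dst++ = 0;
                *dst++ = 3;                          /* emulation-prevention byte */
                src += 2;
            }
            else
                *dst++ = *src++;
        }
        return dst;                                  /* new end of the escaped payload */
    }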
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
x264-snapshot-20141218-2245.tar.bz2/common/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
x264-snapshot-20141218-2245.tar.bz2/common/cabac.h -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.h: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -72,6 +72,10 @@ #define x264_cabac_encode_decision x264_cabac_encode_decision_asm #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm +#elif defined(ARCH_AARCH64) +#define x264_cabac_encode_decision x264_cabac_encode_decision_asm +#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm +#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm #else #define x264_cabac_encode_decision x264_cabac_encode_decision_c #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
x264-snapshot-20141218-2245.tar.bz2/common/common.c -> x264-snapshot-20150804-2245.tar.bz2/common/common.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.c: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -579,6 +579,7 @@ { char *name_buf = NULL; int b_error = 0; + int errortype = X264_PARAM_BAD_VALUE; int name_was_bool; int value_was_null = !value; int i; @@ -595,6 +596,8 @@ { char *c; name_buf = strdup(name); + if( !name_buf ) + return X264_PARAM_BAD_NAME; while( (c = strchr( name_buf, '_' )) ) *c = '-'; name = name_buf; @@ -617,20 +620,23 @@ !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0; if( b_error ) { - char *buf = strdup(value); - char *tok, UNUSED *saveptr=NULL, *init; - b_error = 0; - p->cpu = 0; - for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + char *buf = strdup( value ); + if( buf ) { - for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); - p->cpu |= x264_cpu_names[i].flags; - if( !x264_cpu_names[i].flags ) - b_error = 1; + char *tok, UNUSED *saveptr=NULL, *init; + b_error = 0; + p->cpu = 0; + for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + { + for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); + p->cpu |= x264_cpu_names[i].flags; + if( !x264_cpu_names[i].flags ) + b_error = 1; + } + free( buf ); + if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) + p->cpu |= X264_CPU_SSE2_IS_FAST; } - free( buf ); - if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) - p->cpu |= X264_CPU_SSE2_IS_FAST; } } OPT("threads") @@ -1049,7 +1055,10 @@ OPT("opencl-device") p->i_opencl_device = atoi( value ); else - return X264_PARAM_BAD_NAME; + { + b_error = 1; + errortype = X264_PARAM_BAD_NAME; + } #undef OPT #undef OPT2 #undef atobool @@ -1060,7 +1069,7 @@ free( name_buf ); b_error |= value_was_null && !name_was_bool; - return b_error ? X264_PARAM_BAD_VALUE : 0; + return b_error ? errortype : 0; } /**************************************************************************** @@ -1133,6 +1142,7 @@ [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_NV21] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, @@ -1265,29 +1275,36 @@ char *x264_slurp_file( const char *filename ) { int b_error = 0; - size_t i_size; + int64_t i_size; char *buf; FILE *fh = x264_fopen( filename, "rb" ); if( !fh ) return NULL; + b_error |= fseek( fh, 0, SEEK_END ) < 0; b_error |= ( i_size = ftell( fh ) ) <= 0; + if( WORD_SIZE == 4 ) + b_error |= i_size > INT32_MAX; b_error |= fseek( fh, 0, SEEK_SET ) < 0; if( b_error ) goto error; + buf = x264_malloc( i_size+2 ); if( !buf ) goto error; + b_error |= fread( buf, 1, i_size, fh ) != i_size; - if( buf[i_size-1] != '\n' ) - buf[i_size++] = '\n'; - buf[i_size] = 0; fclose( fh ); if( b_error ) { x264_free( buf ); return NULL; } + + if( buf[i_size-1] != '\n' ) + buf[i_size++] = '\n'; + buf[i_size] = '\0'; + return buf; error: fclose( fh );
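Note: the error-path rework above routes unknown option names through the shared cleanup (so the translated name buffer is always freed) while preserving the distinction between the two public return codes. A hypothetical caller (the set_option helper and its arguments are placeholders) can report the failure mode accordingly:

    #include <stdio.h>
    #include <x264.h>

    static int set_option( x264_param_t *param, const char *name, const char *value )
    {
        int ret = x264_param_parse( param, name, value );
        if( ret == X264_PARAM_BAD_NAME )
            fprintf( stderr, "unknown option: %s\n", name );
        else if( ret == X264_PARAM_BAD_VALUE )
            fprintf( stderr, "bad value for %s: %s\n", name, value );
        return ret;
    }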
x264-snapshot-20141218-2245.tar.bz2/common/common.h -> x264-snapshot-20150804-2245.tar.bz2/common/common.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.h: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
x264-snapshot-20141218-2245.tar.bz2/common/cpu.c -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.c: cpu detection ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -67,8 +67,8 @@ {"AVX", AVX}, {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, - {"AVX2", AVX|X264_CPU_AVX2}, {"FMA3", AVX|X264_CPU_FMA3}, + {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, #undef AVX #undef SSE2 #undef MMX2 @@ -92,6 +92,8 @@ #elif ARCH_AARCH64 {"ARMv8", X264_CPU_ARMV8}, {"NEON", X264_CPU_NEON}, +#elif ARCH_MIPS + {"MSA", X264_CPU_MSA}, #endif {"", 0}, }; @@ -419,6 +421,17 @@ return X264_CPU_ARMV8 | X264_CPU_NEON; } +#elif ARCH_MIPS + +uint32_t x264_cpu_detect( void ) +{ + uint32_t flags = 0; +#if HAVE_MSA + flags |= X264_CPU_MSA; +#endif + return flags; +} + #else uint32_t x264_cpu_detect( void )
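Note: reordering the cpu-name table above makes the "avx2" name imply FMA3 as well, so string-based CPU selection picks FMA3 up automatically. A hypothetical check of that behaviour (the helper name is a placeholder; it relies on the name-list parsing path shown in the common.c hunk):

    #include <assert.h>
    #include <x264.h>

    static void check_avx2_implies_fma3( void )
    {
        x264_param_t p;
        x264_param_default( &p );
        x264_param_parse( &p, "cpu", "avx2" );
        assert( p.cpu & X264_CPU_FMA3 );             /* now implied by the AVX2 table entry */
    }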
x264-snapshot-20141218-2245.tar.bz2/common/cpu.h -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.h: cpu detection ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -45,7 +45,6 @@ #define x264_emms() #endif #define x264_sfence x264_cpu_sfence -void x264_safe_intel_cpu_indicator_init( void ); /* kludge: * gcc can't give variables any greater alignment than the stack frame has.
x264-snapshot-20141218-2245.tar.bz2/common/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 # include "aarch64/dct.h" #endif +#if ARCH_MIPS +# include "mips/dct.h" +#endif /* the inverse of the scaling factors introduced by 8x8 fdct */ /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ @@ -747,8 +750,32 @@ dctf->add8x8_idct8 = x264_add8x8_idct8_neon; dctf->add16x16_idct8= x264_add16x16_idct8_neon; +#if ARCH_AARCH64 + dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; +#endif + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_msa; + dctf->sub8x8_dct = x264_sub8x8_dct_msa; + dctf->sub16x16_dct = x264_sub16x16_dct_msa; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; + dctf->dct4x4dc = x264_dct4x4dc_msa; + dctf->idct4x4dc = x264_idct4x4dc_msa; + dctf->add4x4_idct = x264_add4x4_idct_msa; + dctf->add8x8_idct = x264_add8x8_idct_msa; + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; + dctf->add16x16_idct = x264_add16x16_idct_msa; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; + dctf->add8x8_idct8 = x264_add8x8_idct8_msa; + dctf->add16x16_idct8 = x264_add16x16_idct8_msa; } #endif + #endif // HIGH_BIT_DEPTH } @@ -1004,7 +1031,20 @@ #endif #if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) - pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; +#if ARCH_AARCH64 + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; + pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; + pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; + pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; + pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; + pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; + pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; +#endif // ARCH_AARCH64 + } #endif // HAVE_ARMV6 || ARCH_AARCH64 #endif // HIGH_BIT_DEPTH @@ -1047,4 +1087,21 @@ } #endif // HIGH_BIT_DEPTH #endif +#if !HIGH_BIT_DEPTH +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; + } +#endif // ARCH_AARCH64 +#endif // !HIGH_BIT_DEPTH +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; + } +#endif +#endif }
View file
x264-snapshot-20141218-2245.tar.bz2/common/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: transform and zigzag ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/deblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: deblocking ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -612,8 +612,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( intra_cur ) { - memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( &bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, @@ -737,6 +739,32 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); +#if ARCH_AARCH64 +void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#endif +#endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA +void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, + int bframe ); +#endif #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -835,18 +863,43 @@ { pf->deblock_luma[1] = x264_deblock_v_luma_altivec; pf->deblock_luma[0] = x264_deblock_h_luma_altivec; - } + } #endif // HAVE_ALTIVEC #if HAVE_ARMV6 || ARCH_AARCH64 - if( cpu&X264_CPU_NEON ) - { + if( cpu&X264_CPU_NEON ) + { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; +#if ARCH_AARCH64 + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon; + 
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; +#endif pf->deblock_strength = x264_deblock_strength_neon; - } + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_msa; + pf->deblock_luma[0] = x264_deblock_h_luma_msa; + pf->deblock_chroma[1] = x264_deblock_v_chroma_msa; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa; + pf->deblock_strength = x264_deblock_strength_msa; + } #endif #endif // !HIGH_BIT_DEPTH
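The memset-to-M32/M64 change in the hunk above relies on bs[i] being a contiguous uint8_t[8][4] array: a 4-byte store of 0x03030303 at row 1 plus an 8-byte store of 0x0303030303030303 at rows 2 and 3 writes exactly the same twelve bytes as memset( &bs[i][1], 3, 3*4 ). In x264, M32/M64 are fixed-width store macros; the sketch below uses memcpy-based stand-ins (store32/store64 are illustrative names) to check the equivalence.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Stand-ins for x264's M32()/M64() store macros (illustrative only). */
static void store32( uint8_t *p, uint32_t v ) { memcpy( p, &v, 4 ); }
static void store64( uint8_t *p, uint64_t v ) { memcpy( p, &v, 8 ); }

int main( void )
{
    uint8_t a[2][8][4] = {{{0}}}, b[2][8][4] = {{{0}}};

    /* old code path */
    memset( &a[0][1], 3, 3*4*sizeof(uint8_t) );
    memset( &a[1][1], 3, 3*4*sizeof(uint8_t) );

    /* new code path: one 4-byte and one 8-byte store per direction */
    store32( b[0][1], 0x03030303u );
    store64( b[0][2], 0x0303030303030303ULL );
    store32( b[1][1], 0x03030303u );
    store64( b[1][2], 0x0303030303030303ULL );

    printf( "equal: %d\n", !memcmp( a, b, sizeof(a) ) );  /* prints 1 */
    return 0;
}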
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.c -> x264-snapshot-20150804-2245.tar.bz2/common/frame.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.c: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -47,6 +47,7 @@ switch( external_csp & X264_CSP_MASK ) { case X264_CSP_NV12: + case X264_CSP_NV21: case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; @@ -77,7 +78,7 @@ #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 ) align = 64; - else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; #endif #if ARCH_PPC @@ -387,7 +388,15 @@ return -1; } - dst->i_type = src->i_type; + if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME ) + { + x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input ); + dst->i_forced_type = X264_TYPE_AUTO; + } + else + dst->i_forced_type = src->i_type; + + dst->i_type = dst->i_forced_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = dst->i_reordered_pts = src->i_pts; dst->param = src->param; @@ -435,6 +444,12 @@ h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } + else if( i_csp == X264_CSP_NV21 ) + { + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); + h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], + stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift ); + } else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
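NV21 uses the same layout as NV12 except that the interleaved chroma plane stores V,U pairs instead of U,V, so the new frame.c path only has to byte-swap each pair while copying; that is what the plane_copy_swap hook added in the mc.c hunk further down does. The call passes i_width>>1 because the width argument counts chroma sample pairs, not bytes. A scalar sketch of the swap, mirroring x264_plane_copy_swap_c with hypothetical buffer names:

#include <stdint.h>
#include <stdio.h>

/* Swap interleaved V,U (NV21) chroma into U,V (NV12) order while copying.
 * 'w' counts sample pairs, matching the i_width>>1 in the call above. */
static void plane_copy_swap( uint8_t *dst, int i_dst,
                             const uint8_t *src, int i_src, int w, int h )
{
    for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
        for( int x = 0; x < 2*w; x += 2 )
        {
            dst[x]   = src[x+1];
            dst[x+1] = src[x];
        }
}

int main( void )
{
    const uint8_t nv21[4] = { 'V', 'U', 'v', 'u' };  /* one row, two pairs */
    uint8_t nv12[4];
    plane_copy_swap( nv12, 4, nv21, 4, 2, 1 );
    printf( "%c%c%c%c\n", nv12[0], nv12[1], nv12[2], nv12[3] );  /* UVuv */
    return 0;
}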
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.h -> x264-snapshot-20150804-2245.tar.bz2/common/frame.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.h: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -39,6 +39,7 @@ int i_poc; int i_delta_poc[2]; int i_type; + int i_forced_type; int i_qpplus1; int64_t i_pts; int64_t i_dts;
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -1158,7 +1158,7 @@ { // Looking at the bottom field so always take the bottom macroblock of the pair. h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; - h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; + h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]]; h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]]; CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] ); @@ -1436,8 +1436,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( IS_INTRA( h->mb.i_type ) ) { - memset( bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; return; } @@ -1450,7 +1452,9 @@ M32( bs[0][0] ) = 0x02020202; M32( bs[0][2] ) = 0x02020202; M32( bs[0][4] ) = 0x02020202; - memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][2] ) = 0x0202020202020202ULL; + M32( bs[1][4] ) = 0x02020202; return; } }
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock common functions ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 #include "aarch64/mc.h" #endif +#if ARCH_MIPS +#include "mips/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -189,8 +192,8 @@ } } -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; +const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2}; static void mc_luma( pixel *dst, intptr_t i_dst_stride, pixel *src[4], intptr_t i_src_stride, @@ -199,11 +202,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -222,11 +225,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, *i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -299,6 +302,17 @@ } } +void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) +{ + for( int y=0; y<h; y++, dst+=i_dst, src+=i_src ) + for( int x=0; x<2*w; x+=2 ) + { + dst[x] = src[x+1]; + dst[x+1] = src[x]; + } +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ) @@ -612,6 +626,7 @@ pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; @@ -647,6 +662,10 @@ #if ARCH_AARCH64 x264_mc_init_aarch64( cpu, pf ); #endif +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + x264_mc_init_mips( cpu, pf ); +#endif if( cpu_independent ) {
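The hpel_ref0/hpel_ref1 tables made global above drive quarter-pel interpolation in mc_luma: qpel_idx = ((mvy&3)<<2) + (mvx&3) selects which pre-filtered half-pel planes to read (plane 0 is commonly described as full-pel and 1/2/3 as the three half-pel planes, an interpretation not stated in this hunk), and whenever qpel_idx & 5 is non-zero the two selected planes are averaged. A small sketch that just enumerates the table lookups:

#include <stdint.h>
#include <stdio.h>

/* Copied from the hunk above. */
static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
static const uint8_t hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2};

int main( void )
{
    for( int mvy = 0; mvy < 4; mvy++ )
        for( int mvx = 0; mvx < 4; mvx++ )
        {
            int qpel_idx = ((mvy&3)<<2) + (mvx&3);
            if( qpel_idx & 5 )  /* same test as mc_luma() */
                printf( "mv=(%d/4,%d/4): avg planes %d and %d\n",
                        mvx, mvy, hpel_ref0[qpel_idx], hpel_ref1[qpel_idx] );
            else
                printf( "mv=(%d/4,%d/4): plane %d only\n",
                        mvx, mvy, hpel_ref0[qpel_idx] );
        }
    return 0;
}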
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: motion compensation ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -41,6 +41,8 @@ } ALIGNED_16( x264_weight_t ); extern const x264_weight_t x264_weight_none[3]; +extern const uint8_t x264_hpel_ref0[16]; +extern const uint8_t x264_hpel_ref1[16]; #define SET_WEIGHT( w, b, s, d, o )\ {\ @@ -86,6 +88,7 @@ void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); /* may write up to 15 pixels off the end of each plane */
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips
Added
+(directory)
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct-c.c
Added
@@ -0,0 +1,525 @@ +/***************************************************************************** + * dct-c.c: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = in0 + in2; \ + tmp1_m = in0 - in2; \ + tmp2_m = in1 >> 1; \ + tmp2_m = tmp2_m - in3; \ + tmp3_m = in3 >> 1; \ + tmp3_m = in1 + tmp3_m; \ + \ + BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \ +} + +static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, + int32_t i_src_stride ) +{ + v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; + v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; + v4i32 hor_res0, hor_res1, hor_res2, hor_res3; + v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; + + LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); + UNPCK_R_SH_SW( src0, src0_r ); + UNPCK_R_SH_SW( src1, src1_r ); + UNPCK_R_SH_SW( src2, src2_r ); + UNPCK_R_SH_SW( src3, src3_r ); + BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + hor_res0, hor_res3, hor_res2, hor_res1 ); + TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, + hor_res0, hor_res1, hor_res2, hor_res3 ); + BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); + SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); + PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, + ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, + ver_res0, ver_res1, ver_res2, ver_res3 ); + PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); + ST_SH2( ver_res0, ver_res2, p_dst, 8 ); +} + +static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_dst_stride, + int16_t *p_dst ) +{ + uint32_t i_src0, i_src1, i_src2, i_src3; + uint32_t i_ref0, i_ref1, i_ref2, i_ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 inp0, inp1; + v8i16 diff0, diff1, diff2, diff3; + v8i16 temp0, temp1, temp2, temp3; + + LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); + LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); + + INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); + INSERT_W4_SB( 
i_ref0, i_ref1, i_ref2, i_ref3, ref ); + + ILVRL_B2_UB( src, ref, inp0, inp1 ); + + HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); + + diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); + diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); + + BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); + + diff0 = temp0 + temp1; + diff1 = ( temp3 << 1 ) + temp2; + diff2 = temp0 - temp1; + diff3 = temp3 - ( temp2 << 1 ); + + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + temp0, temp1, temp2, temp3 ); + BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); + + temp0 = diff0 + diff1; + temp1 = ( diff3 << 1 ) + diff2; + temp2 = diff0 - diff1; + temp3 = diff3 - ( diff2 << 1 ); + + ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); + ST_UB2( inp0, inp1, p_dst, 8 ); +} + +static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16], + int16_t pi_level[16] ) +{ + v8i16 src0, src1; + v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 }; + v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 }; + + LD_SH2( pi_dct, 8, src0, src1 ); + VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 ); + ST_SH2( mask0, mask1, pi_level, 8 ); +} + +static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3; + v8i16 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v8i16 zeros = { 0 }; + + LD4x4_SH( p_src, src0, src1, src2, src3 ); + AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 ); + TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3, + hres0, hres1, hres2, hres3 ); + AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 ); + SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 ); + ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride ); + ST_SH2( zeros, zeros, p_src, 8 ); +} + +static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + int16_t i_dc; + uint32_t i_src0, i_src1, i_src2, i_src3; + v16u8 pred = { 0 }; + v16i8 out; + v8i16 input_dc, pred_r, pred_l; + + i_dc = ( p_src[0] + 32 ) >> 6; + input_dc = __msa_fill_h( i_dc ); + p_src[ 0 ] = 0; + + LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 ); + INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred ); + UNPCK_UB_SH( pred, pred_r, pred_l ); + + pred_r += input_dc; + pred_l += input_dc; + + CLIP_SH2_0_255( pred_r, pred_l ); + out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r ); + ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride ); +} + +static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 vec0, vec1, vec2, vec3; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; + v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; + v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; + v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; + v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 zeros = { 0 }; + + p_src[ 0 ] += 32; + + LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); + + vec0 = src0 + src4; + vec1 = src0 - src4; + vec2 = src2 >> 1; + vec2 = vec2 - src6; + vec3 = src6 >> 1; + vec3 = src2 + vec3; + + BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); 
+ + vec0 = src7 >> 1; + vec0 = src5 - vec0 - src3 - src7; + vec1 = src3 >> 1; + vec1 = src1 - vec1 + src7 - src3; + vec2 = src5 >> 1; + vec2 = vec2 - src1 + src7 + src5; + vec3 = src1 >> 1; + vec3 = vec3 + src3 + src5 + src1; + tmp4 = vec3 >> 2; + tmp4 += vec0; + tmp5 = vec2 >> 2; + tmp5 += vec1; + tmp6 = vec1 >> 2; + tmp6 -= vec2; + tmp7 = vec0 >> 2; + tmp7 = vec3 - tmp7; + + BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, + res0, res1, res2, res3, res4, res5, res6, res7 ); + TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, + res0, res1, res2, res3, res4, res5, res6, res7 ); + UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); + UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); + UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); + UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); + UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); + UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); + UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); + UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); + BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, + vec0_r, vec0_l, vec1_l, vec1_r ); + + vec2_r = tmp2_r >> 1; + vec2_l = tmp2_l >> 1; + vec2_r -= tmp6_r; + vec2_l -= tmp6_l; + vec3_r = tmp6_r >> 1; + vec3_l = tmp6_l >> 1; + vec3_r += tmp2_r; + vec3_l += tmp2_l; + + BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, + tmp0_r, tmp2_r, tmp4_r, tmp6_r ); + BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, + tmp0_l, tmp2_l, tmp4_l, tmp6_l ); + + vec0_r = tmp7_r >> 1; + vec0_l = tmp7_l >> 1; + vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; + vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; + vec1_r = tmp3_r >> 1; + vec1_l = tmp3_l >> 1; + vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; + vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; + vec2_r = tmp5_r >> 1; + vec2_l = tmp5_l >> 1; + vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; + vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; + vec3_r = tmp1_r >> 1; + vec3_l = tmp1_l >> 1; + vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; + vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; + tmp1_r = vec3_r >> 2; + tmp1_l = vec3_l >> 2; + tmp1_r += vec0_r; + tmp1_l += vec0_l; + tmp3_r = vec2_r >> 2; + tmp3_l = vec2_l >> 2; + tmp3_r += vec1_r; + tmp3_l += vec1_l; + tmp5_r = vec1_r >> 2; + tmp5_l = vec1_l >> 2; + tmp5_r -= vec2_r; + tmp5_l -= vec2_l; + tmp7_r = vec0_r >> 2; + tmp7_l = vec0_l >> 2; + tmp7_r = vec3_r - tmp7_r; + tmp7_l = vec3_l - tmp7_l; + + BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, + res0_r, res0_l, res7_l, res7_r ); + BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, + res1_r, res1_l, res6_l, res6_r ); + BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, + res2_r, res2_l, res5_l, res5_r ); + BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, + res3_r, res3_l, res4_l, res4_r ); + SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); + SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); + SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); + SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); + PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, + res0, res1, res2, res3 ); + PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, + res4, res5, res6, res7 ); + LD_SB8( p_dst, i_dst_stride, + dst0, dst1, dst2, dst3, + dst4, dst5, dst6, dst7 ); + ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, + tmp0, tmp1, tmp2, tmp3 ); + ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, + tmp4, tmp5, tmp6, tmp7 ); + ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, + res0, res1, res2, res3 ); + ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, + res4, res5, res6, res7 ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + CLIP_SH4_0_255( res4, res5, res6, res7 ); + 
PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, + dst0, dst1, dst2, dst3 ); + ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); +} + +static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride, + int16_t *p_dst, int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3; + v4i32 src0_r, src1_r, src2_r, src3_r; + v4i32 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v2i64 res0, res1; + + LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); + UNPCK_R_SH_SW( src0, src0_r ); + UNPCK_R_SH_SW( src1, src1_r ); + UNPCK_R_SH_SW( src2, src2_r ); + UNPCK_R_SH_SW( src3, src3_r ); + BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 ); + BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 ); + TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3, + hres0, hres1, hres2, hres3 ); + BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 ); + BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 ); + PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, + vres0, vres1, vres2, vres3 ); + PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 ); + ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 ); +} + +static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *pred_ptr, int32_t i_pred_stride ) +{ + int16_t i_sum; + uint32_t i_src0, i_src1, i_src2, i_src3; + uint32_t i_pred0, i_pred1, i_pred2, i_pred3; + v16i8 src = { 0 }; + v16i8 pred = { 0 }; + v16u8 src_l0, src_l1; + v8i16 diff0, diff1; + + LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); + LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 ); + INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); + INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred ); + ILVRL_B2_UB( src, pred, src_l0, src_l1 ); + HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 ); + i_sum = HADD_UH_U32( diff0 + diff1 ); + + return i_sum; +} + +void x264_dct4x4dc_msa( int16_t d[16] ) +{ + avc_dct4x4dc_msa( d, d, 4 ); +} + +void x264_idct4x4dc_msa( int16_t d[16] ) +{ + avc_idct4x4dc_msa( d, 4, d, 4 ); +} + +void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ) +{ + avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); +} + +void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ) +{ + avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0], + &pi_dct[2][0], FDEC_STRIDE ); + avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4], + &pi_dct[3][0], FDEC_STRIDE ); +} + +void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ) +{ + x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] ); + x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] ); + x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] ); + x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] ); +} + +void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ) +{ + avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE ); +} + +void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ) +{ + avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0], + &pi_dct[2][0], FDEC_STRIDE ); + avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8], + &pi_dct[3][0], FDEC_STRIDE ); +} + +void 
x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ) +{ + avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0], + &pi_dct[2], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4], + &pi_dct[3], FDEC_STRIDE ); +} + +void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ) +{ + for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE ) + { + avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE ); + avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE ); + } +} + +void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, + uint8_t *p_ref ) +{ + avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst ); +} + +void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, + uint8_t *p_ref ) +{ + avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE, + &p_ref[0], FDEC_STRIDE, p_dst[0] ); + avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4], + FDEC_STRIDE, p_dst[1] ); + avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0], + FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0], + FDEC_STRIDE, p_dst[2] ); + avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], + FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4], + FDEC_STRIDE, p_dst[3] ); +} + +void x264_sub16x16_dct_msa( int16_t p_dst[16][16], + uint8_t *p_src, + uint8_t *p_ref ) +{ + x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] ); + x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] ); + x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0], + &p_ref[8*FDEC_STRIDE+0] ); + x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8], + &p_ref[8*FDEC_STRIDE+8] ); +} + +void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], + uint8_t *p_pix1, uint8_t *p_pix2 ) +{ + int32_t d0, d1, d2, d3; + + pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE, + &p_pix2[0], FDEC_STRIDE ); + pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE, + &p_pix2[4], FDEC_STRIDE ); + pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[4 * FDEC_STRIDE + 0], + FDEC_STRIDE ); + pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[4 * FDEC_STRIDE + 4], + FDEC_STRIDE ); + + BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 ); + BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] ); +} + +void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], + uint8_t *p_pix1, uint8_t *p_pix2 ) +{ + int32_t a0, a1, a2, a3, a4, a5, a6, a7; + int32_t b0, b1, b2, b3, b4, b5, b6, b7; + + a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE, + &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE ); + a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE, + 
&p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE ); + a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE, + &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE ); + + BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1, + b0, b1, b2, b3, b7, b6, b5, b4 ); + BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1, + a0, a1, a2, a3, a7, a6, a5, a4 ); + BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1, + pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7], + pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] ); +} + +void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] ) +{ + avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level ); +} +#endif
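In avc_zigzag_scan_4x4_frame_msa above, the two VSHF_H masks {0,4,1,2,5,8,12,9} and {6,3,7,10,13,14,11,15} are simply a 16-element gather order applied as a vector shuffle; it should correspond to x264's 4x4 frame zigzag. The scalar equivalent below (illustrative only, with the mask values copied from the code) shows what the shuffle computes.

#include <stdint.h>
#include <stdio.h>

/* Concatenation of the two 8-element VSHF masks used above. */
static const uint8_t scan_order[16] =
    { 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 };

static void zigzag_scan_4x4_frame_scalar( int16_t level[16], const int16_t dct[16] )
{
    for( int i = 0; i < 16; i++ )
        level[i] = dct[scan_order[i]];
}

int main( void )
{
    int16_t dct[16], level[16];
    for( int i = 0; i < 16; i++ )
        dct[i] = i;                 /* easy-to-check input */
    zigzag_scan_4x4_frame_scalar( level, dct );
    for( int i = 0; i < 16; i++ )
        printf( "%d ", level[i] );  /* prints the scan order itself */
    printf( "\n" );
    return 0;
}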
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct.h
Added
@@ -0,0 +1,49 @@ +/***************************************************************************** + * dct.h: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_DCT_H +#define X264_MIPS_DCT_H + +void x264_dct4x4dc_msa( int16_t d[16] ); +void x264_idct4x4dc_msa( int16_t d[16] ); +void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ); +void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ); +void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ); +void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ); +void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ); +void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref ); +void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/deblock-c.c
Added
@@ -0,0 +1,2010 @@ +/***************************************************************************** + * deblock-c.c: msa deblocking + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \ + q3_or_p3_org_in, p1_or_q1_org_in, \ + p2_or_q2_org_in, q1_or_p1_org_in, \ + p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \ +{ \ + v8i16 threshold; \ + v8i16 const3 = __msa_ldi_h( 3 ); \ + \ + threshold = p0_or_q0_org_in + q3_or_p3_org_in; \ + threshold += p1_or_q1_org_in; \ + \ + p0_or_q0_out = threshold << 1; \ + p0_or_q0_out += p2_or_q2_org_in; \ + p0_or_q0_out += q1_or_p1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \ + \ + p1_or_q1_out = p2_or_q2_org_in + threshold; \ + p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \ + \ + p2_or_q2_out = p2_or_q2_org_in * const3; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += threshold; \ + p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \ +} + +/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */ +#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \ + p1_or_q1_org_in, p0_or_q0_out ) \ +{ \ + p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \ +} + +#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \ + p1_or_q1_org_in, p2_or_q2_org_in, \ + negate_tc_in, tc_in, p1_or_q1_out ) \ +{ \ + v8i16 clip3, temp; \ + \ + clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \ + ( v8u16 ) q0_or_p0_org_in ); \ + temp = p1_or_q1_org_in << 1; \ + clip3 -= temp; \ + clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \ + clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \ + p1_or_q1_out = p1_or_q1_org_in + clip3; \ +} + +#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \ + p1_or_q1_org_in, q1_or_p1_org_in, \ + negate_threshold_in, threshold_in, \ + p0_or_q0_out, q0_or_p0_out ) \ +{ \ + v8i16 q0_sub_p0, p1_sub_q1, delta; \ + \ + q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ + p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ + q0_sub_p0 <<= 2; \ + p1_sub_q1 += 4; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta >>= 3; \ + \ + delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \ + \ + p0_or_q0_out = p0_or_q0_org_in + delta; \ + q0_or_p0_out = q0_or_p0_org_in 
- delta; \ + \ + CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \ +} + +static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0; + v16u8 alpha, beta; + v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta; + v16u8 p2, p1, p0, q0, q1, q2; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16u8 tmp_flag; + v16i8 zero = { 0 }; + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_data - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + q2_org = LD_UB( p_data + ( 2 * u_img_width ) ); + p3_org = LD_UB( p_data - ( u_img_width << 2 ) ); + p2_org = LD_UB( p_data - ( 3 * u_img_width ) ); + + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = ( p0_asub_q0 < tmp_flag ); + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + v8i16 p3_org_r; + + ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, + q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r ); + } + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + v8i16 p3_org_l; + + ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, + q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l ); + } + } + /* combine and store */ + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); + + ST_UB( p1_org, p_data - ( 2 * u_img_width ) ); + ST_UB( p2_org, p_data - ( 3 * 
u_img_width ) ); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + + negate_is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + } + + negate_is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); + p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); + } + + ST_UB( p0_org, p_data - u_img_width ); + + q3_org = LD_UB( p_data + ( 3 * u_img_width ) ); + q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( q2_asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + v8i16 q3_org_r; + + ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, + p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r ); + } + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + v8i16 q3_org_l; + + ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, + p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); + + ST_UB( q1_org, p_data + u_img_width ); + ST_UB( q2_org, p_data + 2 * u_img_width ); + } + { + v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l; + negate_is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + negate_is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); + q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); + } + + ST_UB( q0_org, p_data ); + } +} + +static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + uint8_t *p_src; + v16u8 alpha, beta, p0_asub_q0; + v16u8 is_less_than_alpha, is_less_than; + v16u8 is_less_than_beta, negate_is_less_than_beta; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, 
q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16i8 zero = { 0 }; + v16u8 tmp_flag; + + p_src = p_data - 4; + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + LD_UB8( p_src, u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + LD_UB8( p_src + ( 8 * u_img_width ), u_img_width, + row8, row9, row10, row11, row12, row13, row14, row15 ); + + TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, + row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org ); + } + + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + { + v16u8 p1_asub_p0, q1_asub_q0; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = ( p0_asub_q0 < tmp_flag ); + + { + v16u8 p2_asub_p0; + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + } + is_less_than_beta = tmp_flag & is_less_than_beta; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + v8i16 p3_org_r; + + ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, + q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r ); + } + } + + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + v8i16 p3_org_l; + + ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, + q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + v16u8 p0, p2, p1; + + PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 ); + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta ); + } + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + + if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + } + } + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) 
negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + } + } + + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + v16u8 p0; + + p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r ); + p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta ); + } + + { + v16u8 q2_asub_q0; + + q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( q2_asub_q0 < beta ); + } + + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + + { + v16u8 is_less_than_beta_r; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + v8i16 q3_org_r; + + ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r, + p0_org_r, q1_org_r, + q2_r, p1_org_r, q0_r, q1_r, q2_r ); + } + } + { + v16u8 is_less_than_beta_l; + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + v8i16 q3_org_l; + + ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l, + p0_org_l, q1_org_l, + q2_l, p1_org_l, q0_l, q1_l, q2_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + v16u8 q0, q1, q2; + + PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta ); + } + + { + v16u8 negate_is_less_than_beta_r; + + negate_is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_r ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + } + { + v16u8 negate_is_less_than_beta_l; + + negate_is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) negate_is_less_than_beta, 8 ); + if( !__msa_test_bz_v( negate_is_less_than_beta_l ) ) + { + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + } + if( !__msa_test_bz_v( negate_is_less_than_beta ) ) + { + v16u8 q0; + + q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r ); + q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta ); + } + } + { + v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 ); + ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 ); + ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); + + ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 ); + ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 ); + + p_src = p_data - 3; + ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp2, 0, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp2, 4, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + + ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp5, 0, p_src + 4, u_img_width ); + p_src += 4 * u_img_width; + ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width ); + ST2x4_UB( tmp5, 4, p_src + 4, u_img_width ); + } +} + +static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta, 
is_less_than; + v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_less_than_beta; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 zero = { 0 }; + v16u8 is_less_than_r, is_less_than_l; + + is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_r ) ) + { + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_r, p0_org_r, q0_org_r, + q1_org_r ); + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( is_less_than_l ) ) + { + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_l, p0_org_l, q0_org_l, + q1_org_l ); + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, ( p_chroma - u_img_width ) ); + ST_UB( q0_org, p_chroma ); + } +} + +static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 is_less_than; + v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org; + v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; + v16i8 tmp0, tmp1, tmp2, tmp3; + v4i32 vec0, vec1; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + + LD_UB8( ( p_chroma - 4 ), u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + + TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7, + p1_u_org, p1_v_org, p0_u_org, p0_v_org, + q0_u_org, q0_v_org, q1_u_org, q1_v_org ); + + ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, + q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( 
is_less_than ) ) + { + v16u8 is_less_than_r, is_less_than_l; + v16i8 zero = { 0 }; + + is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_r ) ) + { + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + + ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r ); + AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r ); + AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r ); + } + + is_less_than_l = ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( is_less_than_l ) ) + { + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org, + zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l ); + AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l ); + AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); + ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); + ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); + ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); + + ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); + } +} + +static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + uint8_t *p_src; + v16u8 beta, tmp_vec, bs = { 0 }; + v16u8 tc = { 0 }; + v16u8 is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p0_r, q0_r, p1_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 p0_l, q0_l, p1_l = { 0 }; + v8i16 q1_l = { 0 }; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v8i16 tc_r, tc_l; + v16i8 zero = { 0 }; + v16u8 is_bs_greater_than0; + + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec ); + + if( !__msa_test_bz_v( bs ) ) + { + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); + tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); + + is_bs_greater_than0 = ( zero < bs ); + + { + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16u8 row8, row9, row10, row11, row12, row13, row14, row15; + + p_src = p_data; + p_src -= 4; + + LD_UB8( p_src, u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + p_src += ( 8 * u_img_width ); + LD_UB8( p_src, u_img_width, + row8, row9, row10, row11, row12, row13, row14, row15 ); + + TRANSPOSE16x8_UB_UB( row0, row1, row2, 
row3, row4, row5, row6, row7, + row8, row9, row10, row11, + row12, row13, row14, row15, + p3_org, p2_org, p1_org, p0_org, + q0_org, q1_org, q2_org, q3_org ); + } + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha; + v16u8 is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 negate_tc, sign_negate_tc; + v8i16 negate_tc_r, i16_negatetc_l; + + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + { + v16u8 p2_asub_p0; + v16u8 is_less_than_beta_r, is_less_than_beta_l; + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r ); + } + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + is_less_than_beta; + } + + { + v16u8 u8_q2asub_q0; + v16u8 is_less_than_beta_l, is_less_than_beta_r; + + u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( u8_q2asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_r = + ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_r ) ) + { + q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org ); + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r ); + } + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_l = + ( v16u8 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( is_less_than_beta_l ) ) + { + q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l ); + } + } + + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + + is_less_than_beta = __msa_andi_b( 
is_less_than_beta, 1 ); + tc = tc + is_less_than_beta; + } + + { + v8i16 threshold_r, negate_thresh_r; + v8i16 threshold_l, negate_thresh_l; + v16i8 negate_thresh, sign_negate_thresh; + + negate_thresh = zero - ( v16i8 ) tc; + sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); + + ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r ); + + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r ); + + threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc ); + negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, + negate_thresh ); + + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + } + { + v16i8 tp0, tp1, tp2, tp3; + v8i16 tmp2, tmp5; + v4i32 tmp3, tmp4, tmp6, tmp7; + uint32_t u_out0, u_out2; + uint16_t u_out1, u_out3; + + p_src = p_data - 3; + + ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 ); + ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 ); + ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 ); + + ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 ); + ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 ); + + u_out0 = __msa_copy_u_w( tmp3, 0 ); + u_out1 = __msa_copy_u_h( tmp2, 0 ); + u_out2 = __msa_copy_u_w( tmp3, 1 ); + u_out3 = __msa_copy_u_h( tmp2, 1 ); + + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp3, 2 ); + u_out1 = __msa_copy_u_h( tmp2, 2 ); + u_out2 = __msa_copy_u_w( tmp3, 3 ); + u_out3 = __msa_copy_u_h( tmp2, 3 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp4, 0 ); + u_out1 = __msa_copy_u_h( tmp2, 4 ); + u_out2 = __msa_copy_u_w( tmp4, 1 ); + u_out3 = __msa_copy_u_h( tmp2, 5 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp4, 2 ); + u_out1 = __msa_copy_u_h( tmp2, 6 ); + u_out2 = __msa_copy_u_w( tmp4, 3 ); + u_out3 = __msa_copy_u_h( tmp2, 7 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp6, 0 ); + u_out1 = __msa_copy_u_h( tmp5, 0 ); + u_out2 = __msa_copy_u_w( tmp6, 1 ); + u_out3 = __msa_copy_u_h( tmp5, 1 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp6, 2 ); + u_out1 = __msa_copy_u_h( tmp5, 2 ); + u_out2 = __msa_copy_u_w( tmp6, 3 ); + u_out3 = __msa_copy_u_h( tmp5, 3 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp7, 0 ); + u_out1 = __msa_copy_u_h( tmp5, 4 ); + u_out2 = __msa_copy_u_w( tmp7, 1 ); + u_out3 = __msa_copy_u_h( tmp5, 5 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + + u_out0 = __msa_copy_u_w( tmp7, 2 ); + u_out1 = __msa_copy_u_h( tmp5, 6 ); + u_out2 = __msa_copy_u_w( tmp7, 3 ); + 
u_out3 = __msa_copy_u_h( tmp5, 7 ); + + p_src += u_img_width; + SW( u_out0, p_src ); + SH( u_out1, ( p_src + 4 ) ); + p_src += u_img_width; + SW( u_out2, p_src ); + SH( u_out3, ( p_src + 4 ) ); + } + } +} + +static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_image_width ) +{ + v16u8 p2_asub_p0, u8_q2asub_q0; + v16u8 alpha, beta, is_less_than, is_less_than_beta; + v16u8 p1, p0, q0, q1; + v8i16 p1_r = { 0 }; + v8i16 p0_r, q0_r, q1_r = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l, q0_l, q1_l = { 0 }; + v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org; + v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r; + v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l; + v16i8 zero = { 0 }; + v16u8 tmp_vec; + v16u8 bs = { 0 }; + v16i8 tc = { 0 }; + + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 ); + bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec ); + + if( !__msa_test_bz_v( bs ) ) + { + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec ); + tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 ); + tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB5( p_data - ( 3 * u_image_width ), u_image_width, + p2_org, p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than_alpha, is_bs_greater_than0; + + is_bs_greater_than0 = ( ( v16u8 ) zero < bs ); + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_less_than & is_bs_greater_than0; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + v16i8 sign_negate_tc, negate_tc; + v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r; + + q2_org = LD_UB( p_data + ( 2 * u_image_width ) ); + negate_tc = zero - tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, + negate_tc_r, i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + if( 
!__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r, + negate_tc_r, tc_r, p1_r ); + } + + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l, + i16_negatetc_l, tc_l, p1_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r ); + p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta ); + ST_UB( p1_org, p_data - ( 2 * u_image_width ) ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + ( v16i8 ) is_less_than_beta; + } + + u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org ); + is_less_than_beta = ( u8_q2asub_q0 < beta ); + is_less_than_beta = is_less_than_beta & is_less_than; + + { + v8u16 is_less_than_beta_r, is_less_than_beta_l; + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, + zero, 8 ); + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r, + negate_tc_r, tc_r, q1_r ); + } + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, + ( v16i8 ) is_less_than_beta, 8 ); + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org ); + + AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l, + i16_negatetc_l, tc_l, q1_l ); + } + } + if( !__msa_test_bz_v( is_less_than_beta ) ) + { + q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r ); + q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta ); + ST_UB( q1_org, p_data + u_image_width ); + + is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 ); + tc = tc + ( v16i8 ) is_less_than_beta; + } + { + v16i8 negate_thresh, sign_negate_thresh; + v8i16 threshold_r, threshold_l; + v8i16 negate_thresh_l, negate_thresh_r; + + negate_thresh = zero - tc; + sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 ); + + ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh, + threshold_r, negate_thresh_r ); + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_thresh_r, threshold_r, p0_r, q0_r ); + + threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc ); + negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh, + negate_thresh ); + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + negate_thresh_l, threshold_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, ( p_data - u_image_width ) ); + ST_UB( q0_org, p_data ); + } + } +} + +static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta; + v4i32 tmp_vec, bs = { 0 }; + v4i32 tc = { 0 }; + v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than; + v8i16 is_less_than_r, is_less_than_l; + v16u8 is_less_than_beta, 
is_less_than_alpha, is_bs_greater_than0; + v16u8 p0, q0; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_org, p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v16i8 negate_tc, sign_negate_tc; + v8i16 negate_tc_r, i16_negatetc_l; + v8i16 tc_r, tc_l; + v16i8 zero = { 0 }; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 ); + bs = __msa_insve_w( bs, 0, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 ); + bs = __msa_insve_w( bs, 1, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 ); + bs = __msa_insve_w( bs, 2, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 ); + bs = __msa_insve_w( bs, 3, tmp_vec ); + + if( !__msa_test_bz_v( ( v16u8 ) bs ) ) + { + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 ); + tc = __msa_insve_w( tc, 0, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 ); + tc = __msa_insve_w( tc, 1, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 ); + tc = __msa_insve_w( tc, 2, tmp_vec ); + tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 ); + tc = __msa_insve_w( tc, 3, tmp_vec ); + + is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + + is_less_than = is_less_than & is_bs_greater_than0; + + if( !__msa_test_bz_v( is_less_than ) ) + { + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + is_less_than_r = + ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) + { + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_tc_r, tc_r, p0_r, q0_r ); + } + + is_less_than_l = + ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) + { + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + i16_negatetc_l, tc_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than ); + + ST_UB( p0_org, p_chroma - u_img_width ); + ST_UB( q0_org, p_chroma ); + } + } +} + +static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma, + uint8_t u_bs0, + uint8_t u_bs1, + uint8_t u_bs2, + uint8_t u_bs3, + uint8_t u_tc0, + uint8_t u_tc1, + uint8_t u_tc2, + uint8_t u_tc3, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 alpha, beta; + v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0; + v16u8 is_less_than, is_less_than1; + v8i16 is_less_than_r, is_less_than_l; + v16u8 is_less_than_beta, is_less_than_alpha; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v16u8 p1_org, 
p0_org, q0_org, q1_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v16u8 is_bs_less_than4, is_bs_greater_than0; + v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l; + v16u8 const4; + v16i8 zero = { 0 }; + v8i16 tmp_vec, bs = { 0 }; + v8i16 tc = { 0 }; + v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org; + v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org; + v16i8 tmp0, tmp1, tmp2, tmp3; + v4i32 vec0, vec1; + v16u8 row0, row1, row2, row3, row4, row5, row6, row7; + v16i8 negate_tc, sign_negate_tc; + + const4 = ( v16u8 ) __msa_ldi_b( 4 ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 ); + bs = __msa_insve_h( bs, 0, tmp_vec ); + bs = __msa_insve_h( bs, 4, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 ); + bs = __msa_insve_h( bs, 1, tmp_vec ); + bs = __msa_insve_h( bs, 5, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 ); + bs = __msa_insve_h( bs, 2, tmp_vec ); + bs = __msa_insve_h( bs, 6, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 ); + bs = __msa_insve_h( bs, 3, tmp_vec ); + bs = __msa_insve_h( bs, 7, tmp_vec ); + + if( !__msa_test_bz_v( ( v16u8 ) bs ) ) + { + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 ); + tc = __msa_insve_h( tc, 0, tmp_vec ); + tc = __msa_insve_h( tc, 4, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 ); + tc = __msa_insve_h( tc, 1, tmp_vec ); + tc = __msa_insve_h( tc, 5, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 ); + tc = __msa_insve_h( tc, 2, tmp_vec ); + tc = __msa_insve_h( tc, 6, tmp_vec ); + + tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 ); + tc = __msa_insve_h( tc, 3, tmp_vec ); + tc = __msa_insve_h( tc, 7, tmp_vec ); + + is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs ); + + LD_UB8( ( p_chroma - 4 ), u_img_width, + row0, row1, row2, row3, row4, row5, row6, row7 ); + + TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, + row4, row5, row6, row7, + p1_u_org, p1_v_org, p0_u_org, p0_v_org, + q0_u_org, q0_v_org, q1_u_org, q1_v_org ); + + ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org, + q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org ); + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + is_less_than = is_bs_greater_than0 & is_less_than; + + if( !__msa_test_bz_v( is_less_than ) ) + { + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l ); + + is_bs_less_than4 = ( ( v16u8 ) bs < const4 ); + + is_less_than1 = is_less_than & is_bs_less_than4; + if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) ) + { + negate_tc = zero - ( v16i8 ) tc; + sign_negate_tc = __msa_clti_s_b( negate_tc, 0 ); + + ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r, + i16_negatetc_l ); + + UNPCK_UB_SH( tc, tc_r, tc_l ); + + is_less_than_r = + ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) ) + { + AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r, + negate_tc_r, tc_r, p0_r, q0_r ); + } + + is_less_than_l = + ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 ); + if( 
!__msa_test_bz_v( ( v16u8 ) is_less_than_l ) ) + { + AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l, + i16_negatetc_l, tc_l, p0_l, q0_l ); + } + + PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 ); + + p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 ); + q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 ); + } + + SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 ); + ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 ); + ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 ); + ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 ); + ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width ); + } + } +} + +static void avc_deblock_strength_msa( uint8_t *nnz, + int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE], + int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], + uint8_t pu_bs[2][8][4], + int32_t i_mvy_limit ) +{ + uint32_t u_tmp; + v16u8 nnz0, nnz1, nnz2, nnz3, nnz4; + v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 }; + v16i8 ref0, ref1, ref2, ref3, ref4; + v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5; + v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b; + v8u16 four, mvy_limit_vec, sub0, sub1; + + nnz0 = LD_UB( nnz + 4 ); + nnz2 = LD_UB( nnz + 20 ); + nnz4 = LD_UB( nnz + 36 ); + + ref0 = LD_SB( pi_ref[0] + 4 ); + ref2 = LD_SB( pi_ref[0] + 20 ); + ref4 = LD_SB( pi_ref[0] + 36 ); + + mv0 = LD_SH( ( pi_mv[0] + 4 )[0] ); + mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); + mv2 = LD_SH( ( pi_mv[0] + 20 )[0] ); + mv3 = LD_SH( ( pi_mv[0] + 28 )[0] ); + mv4 = LD_SH( ( pi_mv[0] + 36 )[0] ); + + mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); + four = ( v8u16 ) __msa_fill_h( 4 ); + mask = ( v16u8 ) __msa_ldi_b( 0 ); + one = ( v16u8 ) __msa_ldi_b( 1 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv0, mv0 ); + mv6 = __msa_pckod_h( mv1, mv1 ); + mv_a = __msa_pckev_h( mv0, mv0 ); + mv_b = __msa_pckev_h( mv1, mv1 ); + nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 ); + ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 ); + nnz_mask = nnz0 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][0] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv1, mv1 ); + mv6 = __msa_pckod_h( mv2, mv2 ); + mv_a = __msa_pckev_h( mv1, mv1 ); + mv_b = __msa_pckev_h( mv2, mv2 ); + + nnz_mask = nnz2 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( 
two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][1] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv2, mv2 ); + mv6 = __msa_pckod_h( mv3, mv3 ); + mv_a = __msa_pckev_h( mv2, mv2 ); + mv_b = __msa_pckev_h( mv3, mv3 ); + + nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 ); + ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 ); + + nnz_mask = nnz3 | nnz2; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][2] ); + + dst = ( v16u8 ) __msa_ldi_b( 0 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + + mv5 = __msa_pckod_h( mv3, mv3 ); + mv6 = __msa_pckod_h( mv4, mv4 ); + mv_a = __msa_pckev_h( mv3, mv3 ); + mv_b = __msa_pckev_h( mv4, mv4 ); + + nnz_mask = nnz4 | nnz3; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[1][3] ); + + nnz0 = LD_UB( nnz + 8 ); + nnz2 = LD_UB( nnz + 24 ); + + ref0 = LD_SB( pi_ref[0] + 8 ); + ref2 = LD_SB( pi_ref[0] + 24 ); + + mv0 = LD_SH( ( pi_mv[0] + 8 )[0] ); + mv1 = LD_SH( ( pi_mv[0] + 12 )[0] ); + mv2 = LD_SH( ( pi_mv[0] + 16 )[0] ); + mv3 = LD_SH( ( pi_mv[0] + 20 )[0] ); + mv4 = LD_SH( ( pi_mv[0] + 24 )[0] ); + mv7 = LD_SH( ( pi_mv[0] + 28 )[0] ); + mv8 = LD_SH( ( pi_mv[0] + 32 )[0] ); + mv9 = LD_SH( ( pi_mv[0] + 36 )[0] ); + + nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 ); + nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 ); + + ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 ); + + ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 ); + + nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); + nnz1 = ( v16u8 ) temp_vec4; + nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 ); + nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 ); + nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 ); + + ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 ); + ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 ); + + ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 ); + + ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 ); + + ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 ); + + ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 ); + ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 ); + ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 ); + 
+ TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 ); + TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 ); + + mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit ); + four = ( v8u16 ) __msa_fill_h( 4 ); + mask = ( v16u8 ) __msa_ldi_b( 0 ); + one = ( v16u8 ) __msa_ldi_b( 1 ); + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); + mv_a = mv0; + mv_b = mv1; + + nnz_mask = nnz0 | nnz1; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][0] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); + mv_a = mv1; + mv_b = mv2; + + nnz_mask = nnz1 | nnz2; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][1] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); + mv_a = mv2; + mv_b = mv3; + + nnz_mask = nnz2 | nnz3; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][2] ); + + two = ( v16u8 ) __msa_ldi_b( 2 ); + dst = ( v16u8 ) __msa_ldi_b( 0 ); + + mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 ); + mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 ); + mv_a = mv3; + mv_b = mv4; + + nnz_mask = nnz3 | nnz4; + nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask ); + two = __msa_bmnz_v( two, mask, nnz_mask ); + + ref_mask = ( v16u8 ) 
__msa_ceq_b( ref3, ref4 ); + ref_mask = ref_mask ^ 255; + + sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a ); + sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 ); + sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 ); + sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 ); + + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 ); + ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 ); + + dst = __msa_bmnz_v( dst, one, ref_mask ); + dst = __msa_bmnz_v( two, dst, nnz_mask ); + + u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 ); + SW( u_tmp, pu_bs[0][3] ); +} + +void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta ) +{ + avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha, + ( uint8_t ) i_beta, i_stride ); +} + +void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_loopfilter_luma_inter_edge_ver_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_loopfilter_luma_inter_edge_hor_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride, + int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 ) +{ + uint8_t u_bs0 = 1; + uint8_t u_bs1 = 1; + uint8_t u_bs2 = 1; + uint8_t u_bs3 = 1; + + if( p_tc0[0] < 0 ) u_bs0 = 0; + if( p_tc0[1] < 0 ) u_bs1 = 0; + if( p_tc0[2] < 0 ) u_bs2 = 0; + if( p_tc0[3] < 0 ) u_bs3 = 0; + + avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix, + u_bs0, u_bs1, u_bs2, u_bs3, + p_tc0[0], p_tc0[1], p_tc0[2], + p_tc0[3], i_alpha, i_beta, + i_stride ); +} + +void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE], + int8_t 
pi_ref[2][X264_SCAN8_LUMA_SIZE], + int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2], + uint8_t pu_bs[2][8][4], int32_t i_mvy_limit, + int32_t i_bframe ) +{ + if( i_bframe ) + { + for( int32_t i_dir = 0; i_dir < 2; i_dir++ ) + { + int32_t s1 = i_dir ? 1 : 8; + int32_t s2 = i_dir ? 8 : 1; + + for( int32_t i_edge = 0; i_edge < 4; i_edge++ ) + { + for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4; + i++, loc += s1 ) + { + int32_t locn = loc - s2; + if( u_nnz[loc] || u_nnz[locn] ) + { + pu_bs[i_dir][i_edge][i] = 2; + } + else if( pi_ref[0][loc] != pi_ref[0][locn] || + abs( pi_mv[0][loc][0] - + pi_mv[0][locn][0] ) >= 4 || + abs( pi_mv[0][loc][1] - + pi_mv[0][locn][1] ) >= i_mvy_limit || + ( i_bframe && + ( pi_ref[1][loc] != pi_ref[1][locn] || + abs( pi_mv[1][loc][0] - + pi_mv[1][locn][0] ) >= 4 || + abs( pi_mv[1][loc][1] - + pi_mv[1][locn][1] ) >= i_mvy_limit ) ) + ) + { + pu_bs[i_dir][i_edge][i] = 1; + } + else + { + pu_bs[i_dir][i_edge][i] = 0; + } + } + } + } + } + else + { + avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit ); + } +} +#endif
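For readers not fluent in MSA intrinsics, here is a minimal scalar sketch of the per-pixel p0/q0 update that the vectorized AVC_LPF_P0Q0 path in the functions above performs on bS < 4 edges. This is only an illustration of the standard H.264 clipping step; the function and variable names are hypothetical and do not appear in the snapshot.

#include <stdint.h>

/* Hypothetical scalar illustration of the bS < 4 luma edge update:
 * the delta is clamped to +/-tc, then applied symmetrically to p0 and q0. */
static inline uint8_t clip_u8( int v )
{
    return ( uint8_t )( v < 0 ? 0 : ( v > 255 ? 255 : v ) );
}

static void lpf_p0q0_scalar( uint8_t *p0, uint8_t *q0,
                             uint8_t p1, uint8_t q1, int tc )
{
    /* delta = ( 4*(q0 - p0) + (p1 - q1) + 4 ) >> 3, clipped to [-tc, tc] */
    int delta = ( ( ( *q0 - *p0 ) * 4 ) + ( p1 - q1 ) + 4 ) >> 3;

    if( delta < -tc )     delta = -tc;
    else if( delta > tc ) delta = tc;

    uint8_t new_p0 = clip_u8( *p0 + delta );
    uint8_t new_q0 = clip_u8( *q0 - delta );
    *p0 = new_p0;
    *q0 = new_q0;
}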
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/macros.h
Added
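The new header below collects the generic MSA load/store and vector-manipulation macros used by the MIPS code paths above. As a hedged usage sketch only (hypothetical helper name and include path, assuming a MIPS toolchain with MSA support), a 16x8 byte block copy built from the LD_UB8/ST_UB8 macros defined further down could look like this:

#include <stdint.h>
#include "common/mips/macros.h"   /* illustrative include path; pulls in <msa.h> */

/* Hypothetical example: load eight 16-byte rows and store them back out. */
static void copy_16x8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride )
{
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8( p_src, i_src_stride, row0, row1, row2, row3,
            row4, row5, row6, row7 );
    ST_UB8( row0, row1, row2, row3, row4, row5, row6, row7,
            p_dst, i_dst_stride );
}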
@@ -0,0 +1,1952 @@ +/***************************************************************************** + * macros.h: msa macros + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MACROS_H +#define X264_MIPS_MACROS_H + +#include <stdint.h> +#include <msa.h> + +#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ ) +#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ ) + +#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ ) + +#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ ) + +#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ ) +#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ ) + +#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ ) +#define ST_SH( ... 
) ST_H( v8i16, __VA_ARGS__ ) + +#if ( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "lh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "lw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "ld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } ) + #else // !( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val0_m, u_val1_m; \ + uint64_t u_val_d_m = 0; \ + \ + u_val0_m = LW( p_src_m ); \ + u_val1_m = LW( p_src_m + 4 ); \ + \ + u_val_d_m = ( uint64_t ) ( u_val1_m ); \ + u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ + 0xFFFFFFFF00000000 ); \ + u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ + \ + u_val_d_m; \ + } ) + #endif // ( __mips == 64 ) + + #define SH( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint16_t u_val_h_m = ( u_val ); \ + \ + asm volatile ( \ + "sh %[u_val_h_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_h_m] "r" ( u_val_h_m ) \ + ); \ + } + + #define SW( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val_w_m = ( u_val ); \ + \ + asm volatile ( \ + "sw %[u_val_w_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_w_m] "r" ( u_val_w_m ) \ + ); \ + } + + #define SD( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint64_t u_val_d_m = ( u_val ); \ + \ + asm volatile ( \ + "sd %[u_val_d_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_d_m] "r" ( u_val_d_m ) \ + ); \ + } + +#else // !( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "ulh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "ulw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "uld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } ) + #else // !( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \ + uint32_t u_val0_m, u_val1_m; \ + uint64_t u_val_d_m = 0; \ + \ + u_val0_m = LW( psrc_m1 ); \ + u_val1_m = LW( psrc_m1 + 4 ); \ + \ + u_val_d_m = ( uint64_t ) ( u_val1_m ); \ + u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ + 0xFFFFFFFF00000000 ); \ + u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ + \ + u_val_d_m; \ + } ) + 
#endif // ( __mips == 64 ) + + #define SH( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint16_t u_val_h_m = ( u_val ); \ + \ + asm volatile ( \ + "ush %[u_val_h_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_h_m] "r" ( u_val_h_m ) \ + ); \ + } + + #define SW( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val_w_m = ( u_val ); \ + \ + asm volatile ( \ + "usw %[u_val_w_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_w_m] "r" ( u_val_w_m ) \ + ); \ + } + + #define SD( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val0_m, u_val1_m; \ + \ + u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \ + u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \ + \ + SW( u_val0_m, p_dst_m1 ); \ + SW( u_val1_m, p_dst_m1 + 4 ); \ + } + +#endif // ( __mips_isa_rev >= 6 ) + +/* Description : Load 4 words with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1, out2, out3 + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) +*/ +#define LW4( p_src, stride, out0, out1, out2, out3 ) \ +{ \ + out0 = LW( ( p_src ) ); \ + out1 = LW( ( p_src ) + stride ); \ + out2 = LW( ( p_src ) + 2 * stride ); \ + out3 = LW( ( p_src ) + 3 * stride ); \ +} + +/* Description : Store 4 words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) +*/ +#define SW4( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + SW( in0, ( p_dst ) ) \ + SW( in1, ( p_dst ) + stride ); \ + SW( in2, ( p_dst ) + 2 * stride ); \ + SW( in3, ( p_dst ) + 3 * stride ); \ +} + +/* Description : Store 4 double words with stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) +*/ +#define SD4( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + SD( in0, ( p_dst ) ) \ + SD( in1, ( p_dst ) + stride ); \ + SD( in2, ( p_dst ) + 2 * stride ); \ + SD( in3, ( p_dst ) + 3 * stride ); \ +} + +/* Description : Load vectors with 16 byte elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2( RTYPE, p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_B( RTYPE, ( p_src ) ); \ + out1 = LD_B( RTYPE, ( p_src ) + stride ); \ +} +#define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ ) +#define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ ) + +#define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \ +{ \ + LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ + out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \ +} +#define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ ) +#define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ ) + +#define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ +{ \ + LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \ + LD_B2( RTYPE, ( p_src ) + 2 * stride , stride, out2, out3 ); \ +} +#define LD_UB4( ... 
) LD_B4( v16u8, __VA_ARGS__ ) +#define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ ) + +#define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \ +{ \ + LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \ +} +#define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ ) +#define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ ) + +#define LD_B8( RTYPE, p_src, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ +} +#define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ ) +#define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ ) + +/* Description : Load vectors with 8 halfword elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) +*/ +#define LD_H2( RTYPE, p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_H( RTYPE, ( p_src ) ); \ + out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \ +} +#define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ ) + +#define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \ +{ \ + LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \ + LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \ +} +#define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ ) + +#define LD_H8( RTYPE, p_src, stride, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \ + LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \ +} +#define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ ) + +/* Description : Load 4x4 block of signed halfword elements from 1D source + data into 4 vectors (Each vector with 4 signed halfwords) + Arguments : Inputs - psrc + Outputs - out0, out1, out2, out3 +*/ +#define LD4x4_SH( p_src, out0, out1, out2, out3 ) \ +{ \ + out0 = LD_SH( p_src ); \ + out2 = LD_SH( p_src + 8 ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \ +} + +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2( p_src, stride, out0, out1 ) \ +{ \ + out0 = LD_SW( ( p_src ) ); \ + out1 = LD_SW( ( p_src ) + stride ); \ +} + +/* Description : Store vectors of 16 byte elements with stride + Arguments : Inputs - in0, in1, stride + - pdst (destination pointer to store to) + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2( RTYPE, in0, in1, p_dst, stride ) \ +{ \ + ST_B( RTYPE, in0, ( p_dst ) ); \ + ST_B( RTYPE, in1, ( p_dst ) + stride ); \ +} +#define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ ) + +#define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ +{ \ + ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \ + ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ +} +#define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ ) +#define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ ) + +#define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + p_dst, stride ) \ +{ \ + ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \ + ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ +} +#define ST_UB8( ... 
) ST_B8( v16u8, __VA_ARGS__ ) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, stride + - pdst (destination pointer to store to) + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2( RTYPE, in0, in1, p_dst, stride ) \ +{ \ + ST_H( RTYPE, in0, ( p_dst ) ); \ + ST_H( RTYPE, in1, ( p_dst ) + stride ); \ +} +#define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ ) + +#define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \ +{ \ + ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \ + ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \ +} +#define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ ) + +#define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \ +{ \ + ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \ + ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \ +} +#define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ ) + +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST2x4_UB( in, stidx, p_dst, stride ) \ +{ \ + uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) ); \ + u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \ + u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \ + u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \ + \ + SH( u_out0_m, pblk_2x4_m ); \ + SH( u_out1_m, pblk_2x4_m + stride ); \ + SH( u_out2_m, pblk_2x4_m + 2 * stride ); \ + SH( u_out3_m, pblk_2x4_m + 3 * stride ); \ +} + +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride ) \ +{ \ + uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 ); \ + u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 ); \ + u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 ); \ + u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 ); \ + \ + SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \ +} + +#define ST4x8_UB( in0, in1, p_dst, stride ) \ +{ \ + uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst ); \ + \ + ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride ); \ + ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \ +} + +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double 
word element from 'in' vector is copied to + GP register and stored to (pdst) +*/ +#define ST8x1_UB( in, p_dst ) \ +{ \ + uint64_t u_out0_m; \ + u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 ); \ + SD( u_out0_m, p_dst ); \ +} + +/* Description : Store 8x4 byte block to destination memory from input + vectors + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to + GP register and stored to (pdst + 3 * stride) +*/ +#define ST8x4_UB( in0, in1, p_dst, stride ) \ +{ \ + uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \ + uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \ + \ + u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \ + u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \ + u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \ + u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \ + \ + SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \ +} + +/* Description : average with rounding (in0 + in1 + 1) / 2. + Arguments : Inputs - in0, in1, in2, in3, + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. + Average with rounding is calculated and written to 'out0' +*/ +#define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \ + out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \ +} +#define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ ) + +#define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ + AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 ) \ +} +#define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ ) + +/* Description : Immediate number of elements to slide with zero + Arguments : Inputs - in0, in1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'zero_m' vector are slide into 'in0' by + value specified in 'slide_val' +*/ +#define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \ +{ \ + v16i8 zero_m = { 0 }; \ + out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ + ( v16i8 ) in0, slide_val ); \ + out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \ + ( v16i8 ) in1, slide_val ); \ +} +#define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ ) + +/* Description : Immediate number of elements to slide + Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by + value specified in 'slide_val' +*/ +#define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \ +{ \ + out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \ + slide_val ); \ + out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \ + slide_val ); \ +} +#define SLDI_B2_UB( ... 
) SLDI_B2( v16u8, __VA_ARGS__ ) + +/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from 'in0' & 'in1' are copied to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \ + ( v16i8 ) in1, ( v16i8 ) in0 ); \ + out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \ + ( v16i8 ) in3, ( v16i8 ) in2 ); \ +} +#define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ ) +#define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ ) + +/* Description : Shuffle halfword vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Selective byte elements from 'in0' & 'in1' are copied to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0, \ + ( v8i16 ) in1, ( v8i16 ) in0 ); \ + out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1, \ + ( v8i16 ) in3, ( v8i16 ) in2 ); \ +} +#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ ) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result + twice the size of input i.e. unsigned halfword. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \ +} +#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ ) + +#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, \ + out0, out1, out2, out3 ) \ +{ \ + DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ + DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ +} +#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ ) + +/* Description : Dot product of byte vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result + twice the size of input i.e. signed halfword. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0, \ + ( v16i8 ) mult0, ( v16i8 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1, \ + ( v16i8 ) mult1, ( v16i8 ) cnst1 ); \ +} +#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ ) + +#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3, \ + cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \ +{ \ + DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \ + DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \ +} +#define DPADD_SB4_SH( ... 
) DPADD_SB4( v8i16, __VA_ARGS__ ) + +/* Description : Dot product of halfword vector elements + Arguments : Inputs - mult0, mult1 + cnst0, cnst1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result + twice the size of input i.e. signed word. + Multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector +*/ +#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0, \ + ( v8i16 ) mult0, ( v8i16 ) cnst0 ); \ + out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1, \ + ( v8i16 ) mult1, ( v8i16 ) cnst1 ); \ +} +#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ ) + +/* Description : Clips all halfword elements of input vector between min & max + out = (in < min) ? min : ((in > max) ? max : in) + Arguments : Inputs - in, min, max + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH( in, min, max ) \ +( { \ + v8i16 out_m; \ + \ + out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in ); \ + out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \ + out_m; \ +} ) + +/* Description : Clips all signed halfword elements of input vector + between 0 & 255 + Arguments : Input - in + Output - out_m + Return Type - signed halfword +*/ +#define CLIP_SH_0_255( in ) \ +( { \ + v8i16 max_m = __msa_ldi_h( 255 ); \ + v8i16 out_m; \ + \ + out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 ); \ + out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \ + out_m; \ +} ) +#define CLIP_SH2_0_255( in0, in1 ) \ +{ \ + in0 = CLIP_SH_0_255( in0 ); \ + in1 = CLIP_SH_0_255( in1 ); \ +} +#define CLIP_SH4_0_255( in0, in1, in2, in3 ) \ +{ \ + CLIP_SH2_0_255( in0, in1 ); \ + CLIP_SH2_0_255( in2, in3 ); \ +} + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_SW_S32( in ) \ +( { \ + v2i64 res0_m, res1_m; \ + int32_t i_sum_m; \ + \ + res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \ + res1_m = __msa_splati_d( res0_m, 1 ); \ + res0_m = res0_m + res1_m; \ + i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 ); \ + i_sum_m; \ +} ) + +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are added together and + the resulting integer sum is returned +*/ +#define HADD_UH_U32( in ) \ +( { \ + v4u32 res_m; \ + v2u64 res0_m, res1_m; \ + uint32_t u_sum_m; \ + \ + res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in ); \ + res0_m = __msa_hadd_u_d( res_m, res_m ); \ + res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \ + res0_m = res0_m + res1_m; \ + u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 ); \ + u_sum_m; \ +} ) + +/* Description : Horizontal addition of signed byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each signed odd byte element from 'in0' is added to + even signed byte element from 'in0' (pairwise) and the + halfword result is written in 'out0' +*/ +#define HADD_SB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) 
in0 ); \ + out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \ +} +#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HADD_SB2( RTYPE, in0, in1, out0, out1 ); \ + HADD_SB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ ) + +/* Description : Horizontal addition of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is added to + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HADD_UB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ + out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ +} +#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ ) + +#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HADD_UB2( RTYPE, in0, in1, out0, out1 ); \ + HADD_UB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ ) + +/* Description : Horizontal subtraction of unsigned byte vector elements + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Each unsigned odd byte element from 'in0' is subtracted from + even unsigned byte element from 'in0' (pairwise) and the + halfword result is written to 'out0' +*/ +#define HSUB_UB2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \ + out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \ +} +#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ ) + +#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + HSUB_UB2( RTYPE, in0, in1, out0, out1 ); \ + HSUB_UB2( RTYPE, in2, in3, out2, out3 ); \ +} +#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ ) + +/* Description : SAD (Sum of Absolute Difference) + Arguments : Inputs - in0, in1, ref0, ref1 + Outputs - sad_m (halfword vector) + Return Type - unsigned halfword + Details : Absolute difference of all the byte elements from 'in0' with + 'ref0' is calculated and preserved in 'diff0'. Then even-odd + pairs are added together to generate 8 halfword results. +*/ +#define SAD_UB2_UH( in0, in1, ref0, ref1 ) \ +( { \ + v16u8 diff0_m, diff1_m; \ + v8u16 sad_m = { 0 }; \ + \ + diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 ); \ + diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 ); \ + \ + sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \ + sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \ + \ + sad_m; \ +} ) + +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) + Output - out (output vector) + Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' +*/ +#define INSERT_W2( RTYPE, in0, in1, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ +} +#define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ ) + +#define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \ + out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \ +} +#define INSERT_W4_UB( ... 
) INSERT_W4( v16u8, __VA_ARGS__ ) +#define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ ) + +#define INSERT_D2( RTYPE, in0, in1, out ) \ +{ \ + out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \ + out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \ +} +#define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave even halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \ + out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \ +} +#define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave even double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' +*/ +#define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \ + out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \ +} +#define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ ) + +/* Description : Interleave left half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. +*/ +#define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ ) +#define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ ) + +#define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ ) +#define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ ) +#define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ ) +#define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ ) + +/* Description : Interleave left half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ ) +#define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ ) + +#define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ ) + +/* Description : Interleave left half of word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. 
+*/ +#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ +} +#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of byte elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. +*/ +#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ ) +#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ ) +#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ ) + +#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ ) +#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ ) +#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ ) +#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of halfword elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ ) +#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ ) + +#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ ) +#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ ) + +#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \ +} +#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ ) + +/* Description : Interleave right half of double word elements from vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. +*/ +#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) ); \ + out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) ); \ +} +#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ ) +#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ ) +#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ ) + +#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ILVR_D4_UB( ... 
) ILVR_D4( v16u8, __VA_ARGS__ ) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ +} +#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ ) +#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ ) +#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ ) +#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ ) +#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ ) + +#define ILVRL_H2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ +} +#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ ) +#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ ) + +#define ILVRL_W2( RTYPE, in0, in1, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ + out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \ +} +#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ ) +#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ ) + +/* Description : Maximum values between signed elements of vector and + 5-bit signed immediate value are copied to the output vector + Arguments : Inputs - in0, in1, in2, in3, max_val + Outputs - in place operation + Return Type - unsigned halfword + Details : Maximum of signed halfword element values from 'in0' and + 'max_val' are written in place +*/ +#define MAXI_SH2( RTYPE, in0, in1, max_val ) \ +{ \ + in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \ + in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \ +} +#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ ) +#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ ) + +#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \ +{ \ + MAXI_SH2( RTYPE, in0, in1, max_val ); \ + MAXI_SH2( RTYPE, in2, in3, max_val ); \ +} +#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ ) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val + 1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range. + The results are written in place +*/ +#define SAT_UH2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \ +} +#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ ) + +#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \ +{ \ + SAT_UH2( RTYPE, in0, in1, sat_val ); \ + SAT_UH2( RTYPE, in2, in3, sat_val ) \ +} +#define SAT_UH4_UH( ... 
) SAT_UH4( v8u16, __VA_ARGS__ ) + +/* Description : Saturate the halfword element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned halfword element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + The results are written in place +*/ +#define SAT_SH2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \ +} +#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ ) + +#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \ +{ \ + SAT_SH2( RTYPE, in0, in1, sat_val ); \ + SAT_SH2( RTYPE, in2, in3, sat_val ); \ +} +#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ ) + +/* Description : Saturate the word element values to the max + unsigned value of (sat_val+1 bits) + The element data width remains unchanged + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned word element from 'in0' is saturated to the + value generated with (sat_val+1) bit range + The results are written in place +*/ +#define SAT_SW2( RTYPE, in0, in1, sat_val ) \ +{ \ + in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \ + in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \ +} +#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ ) + +/* Description : Pack even byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ ) +#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ ) +#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ ) +#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ ) + +#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \ +{ \ + PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 ); \ +} +#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ ) + +#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ ) +#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ ) + +/* Description : Pack even halfword elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. +*/ +#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define PCKEV_H2_SH( ... 
) PCKEV_H2( v8i16, __VA_ARGS__ ) + +#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ ) + +/* Description : Pack even double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ +} +#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ ) + +#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ ) + +/* Description : Pack odd byte elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Odd byte elements of 'in0' are copied to the left half of + 'out0' & odd byte elements of 'in1' are copied to the right + half of 'out0'. +*/ +#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \ +} +#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ ) + +#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ ) + +/* Description : Pack odd double word elements of vector pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Odd double word elements of 'in0' are copied to the left half + of 'out0' & odd double word elements of 'in1' are copied to + the right half of 'out0'. +*/ +#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \ + out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \ +} +#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ ) +#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ ) + +/* Description : Each byte element is logically xor'ed with immediate 128 + Arguments : Inputs - in0, in1 + Outputs - in place operation + Return Type - as per RTYPE + Details : Each unsigned byte element from input vector 'in0' is + logically xor'ed with 128 and the result is stored in-place. +*/ +#define XORI_B2_128( RTYPE, in0, in1 ) \ +{ \ + in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \ + in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \ +} +#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ ) +#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ ) + +#define XORI_B3_128( RTYPE, in0, in1, in2 ) \ +{ \ + XORI_B2_128( RTYPE, in0, in1 ); \ + in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \ +} +#define XORI_B3_128_SB( ... 
) XORI_B3_128( v16i8, __VA_ARGS__ ) + +#define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \ +{ \ + XORI_B2_128( RTYPE, in0, in1 ); \ + XORI_B2_128( RTYPE, in2, in3 ); \ +} +#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ ) +#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ ) + +#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \ +{ \ + XORI_B3_128( RTYPE, in0, in1, in2 ); \ + XORI_B2_128( RTYPE, in3, in4 ); \ +} +#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ ) + +/* Description : Addition of signed halfword elements and signed saturation + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Signed halfword elements from 'in0' are added to signed + halfword elements of 'in1'. The result is then signed saturated + between halfword data type range +*/ +#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \ + out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \ +} +#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ ) + +#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ); \ + ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 ); \ +} +#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ ) + +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is written in-place. +*/ +#define SLLI_4V( in0, in1, in2, in3, shift ) \ +{ \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + +/* Description : Arithmetic shift right all elements of vector + (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is right shifted by 'shift' and + the result is written in-place. 'shift' is a GP variable. +*/ +#define SRA_4V( in0, in1, in2, in3, shift ) \ +{ \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + in2 = in2 >> shift; \ + in3 = in3 >> shift; \ +} + +/* Description : Shift right arithmetic rounded halfwords + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetic by + number of bits respective element holds in vector 'shift'. + The last discarded bit is added to shifted value for rounding + and the result is written in-place. + 'shift' is a vector. +*/ +#define SRAR_H2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ + in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ +} +#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ ) + +#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRAR_H2( RTYPE, in0, in1, shift ) \ + SRAR_H2( RTYPE, in2, in3, shift ) \ +} +#define SRAR_H4_SH( ... 
) SRAR_H4( v8i16, __VA_ARGS__ ) + +/* Description : Shift right logical all halfword elements of vector + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right logical by + number of bits respective element holds in vector 'shift' and + the result is stored in-place.'shift' is a vector. +*/ +#define SRL_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \ + in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \ + in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \ + in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \ +} +#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ ) + +/* Description : Shift right arithmetic rounded (immediate) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation + Return Type - as per RTYPE + Details : Each element of vector 'in0' is shifted right arithmetic by + value in 'shift'. The last discarded bit is added to shifted + value for rounding and the result is written in-place. + 'shift' is an immediate value. +*/ +#define SRARI_H2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \ + in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \ +} +#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ ) +#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ ) + +#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRARI_H2( RTYPE, in0, in1, shift ); \ + SRARI_H2( RTYPE, in2, in3, shift ); \ +} +#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ ) +#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ ) + +#define SRARI_W2( RTYPE, in0, in1, shift ) \ +{ \ + in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \ + in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \ +} +#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ ) + +#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \ +{ \ + SRARI_W2( RTYPE, in0, in1, shift ); \ + SRARI_W2( RTYPE, in2, in3, shift ); \ +} +#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ ) + +/* Description : Multiplication of pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element from 'in0' is multiplied with elements from 'in1' + and the result is written to 'out0' +*/ +#define MUL2( in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = in0 * in1; \ + out1 = in2 * in3; \ +} +#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + MUL2( in0, in1, in2, in3, out0, out1 ); \ + MUL2( in4, in5, in6, in7, out2, out3 ); \ +} + +/* Description : Addition of 2 pairs of vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. 
+*/ +#define ADD2( in0, in1, in2, in3, out0, out1 ) \ +{ \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + ADD2( in0, in1, in2, in3, out0, out1 ); \ + ADD2( in4, in5, in6, in7, out2, out3 ); \ +} + +#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3 ) \ +{ \ + out0 = in0 - in1; \ + out1 = in2 - in3; \ + out2 = in4 - in5; \ + out3 = in6 - in7; \ +} + +/* Description : Sign extend halfword elements from right half of the vector + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved with same vector 'in0' to generate + 4 word elements keeping sign intact +*/ +#define UNPCK_R_SH_SW( in, out ) \ +{ \ + v8i16 sign_m; \ + \ + sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ + out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in ); \ +} + +/* Description : Zero extend unsigned byte elements to halfword elements + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) + Return Type - signed halfword + Details : Zero extended right half of vector is returned in 'out0' + Zero extended left half of vector is returned in 'out1' +*/ +#define UNPCK_UB_SH( in, out0, out1 ) \ +{ \ + v16i8 zero_m = { 0 }; \ + \ + ILVRL_B2_SH( zero_m, in, out0, out1 ); \ +} + +/* Description : Sign extend halfword elements from input vector and return + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) + Return Type - signed word + Details : Sign bit of halfword elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 4 signed word elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 4 signed word elements in 'out1' +*/ +#define UNPCK_SH_SW( in, out0, out1 ) \ +{ \ + v8i16 tmp_m; \ + \ + tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \ + ILVRL_H2_SW( tmp_m, in, out0, out1 ); \ +} + +/* Description : Butterfly of 4 input vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Details : Butterfly operation +*/ +#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Butterfly of 8 input vectors + Arguments : Inputs - in0 ... in7 + Outputs - out0 .. 
out7 + Details : Butterfly operation +*/ +#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + out0 = in0 + in7; \ + out1 = in1 + in6; \ + out2 = in2 + in5; \ + out3 = in3 + in4; \ + \ + out4 = in3 - in4; \ + out5 = in2 - in5; \ + out6 = in1 - in6; \ + out7 = in0 - in7; \ +} + +/* Description : Transpose input 8x8 byte block + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5, \ + tmp0_m, tmp1_m, tmp2_m, tmp3_m ); \ + ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m ); \ + ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m ); \ + ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 ); \ + ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 ); \ + SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 ); \ + SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 ); \ +} +#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ ) + +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, + in8, in9, in10, in11, in12, in13, in14, in15 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - unsigned byte +*/ +#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 ); \ + ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 ); \ + ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 ); \ + ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 ); \ + \ + tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ + tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \ + tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ + tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \ + out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ + tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \ + out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ + tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \ + \ + ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m ); \ + out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ + tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \ + out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m ); \ + out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + \ + tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \ +
tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \ + out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ + out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \ +} + +/* Description : Transpose 4x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed halfword +*/ +#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 s0_m, s1_m; \ + \ + ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m ); \ + ILVRL_W2_SH( s1_m, s0_m, out0, out2 ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 ); \ +} + +/* Description : Transpose 4x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \ + v8i16 zero_m = { 0 }; \ + \ + ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6, \ + tmp0_n, tmp1_n, tmp2_n, tmp3_n ); \ + ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m ); \ + ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m ); \ + \ + out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ + out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \ + out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ + out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \ + \ + out4 = zero_m; \ + out5 = zero_m; \ + out6 = zero_m; \ + out7 = zero_m; \ +} + +/* Description : Transpose 8x4 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - signed halfword +*/ +#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ + ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m ); \ + ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 ); \ + ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 ); \ +} + +/* Description : Transpose 8x8 block with half word elements in vectors + Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + Return Type - as per RTYPE +*/ +#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7 ) \ +{ \ + v8i16 s0_m, s1_m; \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \ + \ + ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m ); \ + ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m ); \ + ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m ); \ + ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \ + ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m ); \ + PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \ + tmp3_m, tmp7_m, out0, out2, out4, out6 ); \ + out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \ + out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \ + out5 = ( RTYPE ) __msa_pckod_d( 
( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \ + out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \ +} +#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ ) + +/* Description : Transpose 4x4 block with word elements in vectors + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1, out2, out3 + Return Type - signed word +*/ +#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + \ + ILVRL_W2_SW( in1, in0, s0_m, s1_m ); \ + ILVRL_W2_SW( in3, in2, s2_m, s3_m ); \ + \ + out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ + out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \ + out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ + out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \ +} + +/* Description : Add block 4x4 + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Least significant 4 bytes from each input vector are added to + the destination bytes, clipped between 0-255 and stored. +*/ +#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + v16i8 zero_m = { 0 }; \ + \ + ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m ) \ + LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m ); \ + INSERT_W2_SB( src0_m, src1_m, dst0_m ); \ + INSERT_W2_SB( src2_m, src3_m, dst1_m ); \ + ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m ); \ + ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m ); \ + CLIP_SH2_0_255( res0_m, res1_m ); \ + PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m ); \ + \ + out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 ); \ + out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 ); \ + out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 ); \ + out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 ); \ + SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ +} + +/* Description : Dot product and addition of 3 signed halfword input vectors + Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2 + Output - out0_m + Return Type - signed halfword + Details : Dot product of 'in0' with 'coeff0' + Dot product of 'in1' with 'coeff1' + Dot product of 'in2' with 'coeff2' + Addition of all the 3 vector results + out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2) +*/ +#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 ) \ +( { \ + v8i16 tmp1_m; \ + v8i16 out0_m; \ + \ + out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 ); \ + out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 ); \ + tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 ); \ + out0_m = __msa_adds_s_h( out0_m, tmp1_m ); \ + \ + out0_m; \ +} ) + +/* Description : Pack even elements of input vectors & xor with 128 + Arguments : Inputs - in0, in1 + Output - out_m + Return Type - unsigned byte + Details : Signed byte even elements from 'in0' and 'in1' are packed + together in one vector and the resulting vector is xor'ed with + 128 to shift the range from signed to unsigned byte +*/ +#define PCKEV_XORI128_UB( in0, in1 ) \ +( { \ + v16u8 out_m; \ + out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ + out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 ); \ + out_m; \ +} ) + +/* Description : Pack even byte elements, extract 0 & 2 index words from pair + of results and store 4 words in destination memory as per 
+ stride + Arguments : Inputs - in0, in1, in2, in3, pdst, stride +*/ +#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \ +{ \ + uint32_t out0_m, out1_m, out2_m, out3_m; \ + v16i8 tmp0_m, tmp1_m; \ + \ + PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m ); \ + \ + out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 ); \ + out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 ); \ + out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 ); \ + out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 ); \ + \ + SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \ +} + +/* Description : Pack even byte elements and store byte vector in destination + memory + Arguments : Inputs - in0, in1, pdst +*/ +#define PCKEV_ST_SB( in0, in1, p_dst ) \ +{ \ + v16i8 tmp_m; \ + tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \ + ST_SB( tmp_m, ( p_dst ) ); \ +} + +#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 ) \ +( { \ + v4i32 tmp0_m, tmp1_m; \ + v8i16 out0_m, out1_m, out2_m, out3_m; \ + v8i16 minus5h_m = __msa_ldi_h( -5 ); \ + v8i16 plus20h_m = __msa_ldi_h( 20 ); \ + \ + ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m ); \ + \ + tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m ); \ + tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m ); \ + \ + ILVRL_H2_SH( in1, in4, out0_m, out1_m ); \ + DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m ); \ + ILVRL_H2_SH( in2, in3, out2_m, out3_m ); \ + DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m ); \ + \ + SRARI_W2_SW( tmp0_m, tmp1_m, 10 ); \ + SAT_SW2_SW( tmp0_m, tmp1_m, 7 ); \ + out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \ + \ + out0_m; \ +} ) + +#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 ) \ +( { \ + v8i16 out0_m, out1_m; \ + v16i8 tmp0_m, tmp1_m; \ + v16i8 minus5b = __msa_ldi_b( -5 ); \ + v16i8 plus20b = __msa_ldi_b( 20 ); \ + \ + tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in ); \ + out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m ); \ + \ + tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in ); \ + out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m ); \ + \ + tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in ); \ + out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m ); \ + \ + out1_m; \ +} ) + +#endif /* X264_MIPS_MACROS_H */
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc-c.c
Added
@@ -0,0 +1,3807 @@ +/***************************************************************************** + * mc-c.c: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "mc.h" + +#if !HIGH_BIT_DEPTH +static const uint8_t pu_luma_mask_arr[16 * 8] = +{ + /* 8 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, + 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + /* 4 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, + 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, + 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26 +}; + +static const uint8_t pu_chroma_mask_arr[16 * 5] = +{ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 +}; + +void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ); +void x264_memzero_aligned_msa( void *p_dst, size_t n ); + +void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + 
int32_t i_weight ); +void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); + +void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); + +weight_fn_t x264_mc_weight_wtab_msa[6] = +{ + x264_mc_weight_w4_msa, + x264_mc_weight_w4_msa, + x264_mc_weight_w8_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w20_msa, +}; + +void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ); +void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, + uint8_t *p_dstc, uint8_t *p_src, + intptr_t i_stride, int32_t i_width, + int32_t i_height, int16_t *p_buf ); + +void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, intptr_t i_src_stride0, + uint8_t *p_src1, intptr_t i_src_stride1, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, + uint8_t *p_dst1, intptr_t i_dst_stride1, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, + intptr_t i_dst_stride0, + uint8_t *p_dst1, + intptr_t i_dst_stride1, + uint8_t *p_dst2, + intptr_t i_dst_stride2, + uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_src_width, int32_t i_width, + int32_t i_height ); +void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, uint8_t *p_src1, + int32_t i_height ); +void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, + uint8_t *p_dst1, uint8_t 
*p_dst2, + uint8_t *p_dst3, intptr_t i_src_stride, + intptr_t i_dst_stride, int32_t i_width, + int32_t i_height ); + +static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b( -5 ); + v16i8 plus20b = __msa_ldi_b( 20 ); + + u_h4w = i_height % 4; + LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB2( p_src, 8, src0, src1 ); + p_src += i_src_stride; + LD_SB2( p_src, 8, src2, src3 ); + p_src += i_src_stride; + + XORI_B4_128_SB( src0, src1, src2, src3 ); + VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 ); + VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 ); + VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); + VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 ); + HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 ); + DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res0, res1, res2, res3 ); + DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res0, res1, res2, res3 ); + + LD_SB2( p_src, 8, src4, src5 ); + p_src += i_src_stride; + LD_SB2( p_src, 8, src6, src7 ); + p_src += i_src_stride; + + XORI_B4_128_SB( src4, src5, src6, src7 ); + VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 ); + VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 ); + VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 ); + VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 ); + HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 ); + DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b, + minus5b, res4, res5, res6, res7 ); + DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b, + plus20b, res4, res5, res6, res7 ); + SRARI_H4_SH( res0, res1, res2, res3, 5 ); + SRARI_H4_SH( res4, res5, res6, res7, 5 ); + SAT_SH4_SH( res0, res1, res2, res3, 7 ); + SAT_SH4_SH( res4, res5, res6, res7, 7 ); + PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, + vec0, vec1, vec2, vec3 ); + XORI_B4_128_SB( vec0, vec1, vec2, vec3 ); + + ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + LD_SB2( p_src, 8, src0, src1 ); + p_src += i_src_stride; + + XORI_B2_128_SB( src0, src1 ); + VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 ); + VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 ); + VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 ); + res0 = __msa_hadd_s_h( vec0, vec0 ); + DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 ); + res1 = __msa_hadd_s_h( vec3, vec3 ); + DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 ); + SRARI_H2_SH( res0, res1, 5 ); + SAT_SH2_SH( res0, res1, 7 ); + dst0 = PCKEV_XORI128_UB( res0, res1 ); + ST_UB( dst0, p_dst ); + p_dst += i_dst_stride; + } +} + +static void 
avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + const int16_t i_filt_const0 = 0xfb01; + const int16_t i_filt_const1 = 0x1414; + const int16_t i_filt_const2 = 0x1fb; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r; + v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l; + v16i8 src65_l, src87_l; + v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l; + v16u8 res0, res1, res2, res3; + v16i8 filt0, filt1, filt2; + + u_h4w = i_height % 4; + filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 ); + filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 ); + filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 ); + + LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + p_src += ( 5 * i_src_stride ); + + XORI_B5_128_SB( src0, src1, src2, src3, src4 ); + ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, + src10_r, src21_r, src32_r, src43_r ); + ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3, + src10_l, src21_l, src32_l, src43_l ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 ); + p_src += ( 4 * i_src_stride ); + + XORI_B4_128_SB( src5, src6, src7, src8 ); + ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, + src54_r, src65_r, src76_r, src87_r ); + ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7, + src54_l, src65_l, src76_l, src87_l ); + out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, + filt0, filt1, filt2 ); + out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r, + filt0, filt1, filt2 ); + out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r, + filt0, filt1, filt2 ); + out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r, + filt0, filt1, filt2 ); + out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, + filt0, filt1, filt2 ); + out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l, + filt0, filt1, filt2 ); + out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l, + filt0, filt1, filt2 ); + out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l, + filt0, filt1, filt2 ); + SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 ); + SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 ); + SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 ); + SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 ); + PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, + out3_r, res0, res1, res2, res3 ); + XORI_B4_128_UB( res0, res1, res2, res3 ); + + ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + src10_r = src54_r; + src32_r = src76_r; + src21_r = src65_r; + src43_r = src87_r; + src10_l = src54_l; + src32_l = src76_l; + src21_l = src65_l; + src43_l = src87_l; + src4 = src8; + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + src5 = LD_SB( p_src ); + p_src += ( i_src_stride ); + src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 ); + ILVRL_B2_SB( src5, src4, src54_r, src54_l ); + out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r, + filt0, filt1, filt2 ); + out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l, + filt0, filt1, filt2 ); + SRARI_H2_SH( out0_r, out0_l, 5 ); + SAT_SH2_SH( out0_r, out0_l, 7 ); + out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r ); + res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 ); + ST_UB( res0, p_dst ); + p_dst += i_dst_stride; + + src10_r = src21_r; + src21_r = src32_r; + src32_r = src43_r; + src43_r = src54_r; + + src10_l = src21_l; + src21_l = src32_l; 
+ src32_l = src43_l; + src43_l = src54_l; + + src4 = src5; + } +} + +static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + uint64_t u_out0; + v16i8 tmp0; + v16i8 src0, src1, src2, src3, src4; + v16i8 mask0, mask1, mask2; + v8i16 hz_out0, hz_out1, hz_out2, hz_out3; + v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8; + v8i16 dst0, dst1, dst2, dst3; + v16u8 out0, out1; + + u_h4w = i_height % 4; + LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); + + LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + XORI_B5_128_SB( src0, src1, src2, src3, src4 ); + p_src += ( 5 * i_src_stride ); + + hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); + hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); + hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); + hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); + XORI_B4_128_SB( src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 ); + hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 ); + hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 ); + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2, + hz_out3, hz_out4, hz_out5 ); + dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3, + hz_out4, hz_out5, hz_out6 ); + dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4, + hz_out5, hz_out6, hz_out7 ); + dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5, + hz_out6, hz_out7, hz_out8 ); + out0 = PCKEV_XORI128_UB( dst0, dst1 ); + out1 = PCKEV_XORI128_UB( dst2, dst3 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + + p_dst += ( 4 * i_dst_stride ); + hz_out3 = hz_out7; + hz_out1 = hz_out5; + hz_out5 = hz_out4; + hz_out4 = hz_out8; + hz_out2 = hz_out6; + hz_out0 = hz_out5; + } + + for( u_loop_cnt = u_h4w; u_loop_cnt--; ) + { + src0 = LD_SB( p_src ); + p_src += i_src_stride; + + src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 ); + hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 ); + + dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, + hz_out2, hz_out3, + hz_out4, hz_out5 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) ); + tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 ); + u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( u_out0, p_dst ); + p_dst += i_dst_stride; + + hz_out0 = hz_out1; + hz_out1 = hz_out2; + hz_out2 = hz_out3; + hz_out3 = hz_out4; + hz_out4 = hz_out5; + } +} + +static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_multiple8_cnt; + + for( u_multiple8_cnt = 2; u_multiple8_cnt--; ) + { + avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height ); + p_src += 8; + p_dst += 8; + } +} + +static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint16_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, 
res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + v8i16 res0, res1; + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src3, src4 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); + SRARI_H2_UH( res_vt0, res_vt2, 6 ); + SAT_UH2_UH( res_vt0, res_vt2, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out1, p_dst_u ); + + SH( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint16_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v8i16 res0, res1; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 ); + + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src2, src3, src3, src4, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); + DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, + res_hz1, res_hz2, res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out1, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out2, p_dst_u ); + p_dst_u += i_dst_stride; + SH( u_out3, p_dst_u ); + + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, 
+ res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + u_out0 = __msa_copy_u_h( res0, 0 ); + u_out1 = __msa_copy_u_h( res0, 2 ); + u_out2 = __msa_copy_u_h( res1, 0 ); + u_out3 = __msa_copy_u_h( res1, 2 ); + + SH( u_out0, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out1, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SH( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + if( 2 == i_height ) + { + avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } + else if( 4 == i_height ) + { + avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } +} + +static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1 ) +{ + uint32_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + v4i32 res0, res1; + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src3, src4 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 ); + SRARI_H2_UH( res_vt0, res_vt2, 6 ); + SAT_UH2_UH( res_vt0, res_vt2, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 ); + + u_out0 = __msa_copy_u_w( res0, 0 ); + u_out1 = __msa_copy_u_w( res0, 1 ); + u_out2 = __msa_copy_u_w( res1, 0 ); + u_out3 = __msa_copy_u_w( res1, 1 ); + SW( u_out0, p_dst_u ); + p_dst_u += i_dst_stride; + SW( u_out1, p_dst_u ); + SW( u_out2, p_dst_v ); + p_dst_v += i_dst_stride; + SW( u_out3, p_dst_v ); +} + +static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + uint32_t u_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3; 
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask; + v4i32 res0, res1; + v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + mask = LD_SB( &pu_chroma_mask_arr[16] ); + + src0 = LD_UB( p_src ); + p_src += i_src_stride; + + for( u_row = ( i_height >> 2 ); u_row--; ) + { + LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); + p_src += ( 4 * i_src_stride ); + + VSHF_B2_UB( src0, src1, src1, src2, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src2, src3, src3, src4, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 ); + VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 ); + DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride ); + p_dst_u += ( 4 * i_dst_stride ); + + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, + res_hz3 ); + MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, + coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 ); + SRARI_H2_UH( res_vt0, res_vt1, 6 ); + SAT_UH2_UH( res_vt0, res_vt1, 7 ); + PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 ); + + ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride ); + p_dst_v += ( 4 * i_dst_stride ); + src0 = src4; + } +} + +static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + if( 2 == i_height ) + { + avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1 ); + } + else + { + avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + u_coef_hor0, u_coef_hor1, + u_coef_ver0, u_coef_ver1, + i_height ); + } +} + +static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst_u, + uint8_t *p_dst_v, + int32_t i_dst_stride, + uint32_t u_coef_hor0, + uint32_t u_coef_hor1, + uint32_t u_coef_ver0, + uint32_t u_coef_ver1, + int32_t i_height ) +{ + uint32_t u_row; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9; + v16u8 src10, src11, src12, src13, src14; + v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5; + v8u16 res_vt0, res_vt1, res_vt2, res_vt3; + v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 }; + v16i8 coeff_hz_vec0, coeff_hz_vec1; + v16i8 tmp0, tmp1; + v16u8 coeff_hz_vec; + v8u16 coeff_vt_vec0, coeff_vt_vec1; + + coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 ); + coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 ); + 
coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 ); + coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 ); + coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 ); + + LD_UB2( p_src, 16, src0, src13 ); + p_src += i_src_stride; + + VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 ); + DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 ); + + for( u_row = ( i_height >> 2 ); u_row--; ) + { + LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 ); + LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 ); + p_src += ( 4 * i_src_stride ); + + VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 ); + VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 ); + DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4 ); + MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + + res_vt0 += ( res_hz0 * coeff_vt_vec1 ); + res_vt1 += ( res_hz1 * coeff_vt_vec1 ); + res_vt2 += ( res_hz2 * coeff_vt_vec1 ); + res_vt3 += ( res_hz3 * coeff_vt_vec1 ); + + SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); + SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); + PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); + ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride ); + p_dst_u += ( 4 * i_dst_stride ); + res_hz0 = res_hz4; + + VSHF_B2_UB( src1, src5, src2, src6, + ( mask + 1 ), ( mask + 1 ), src5, src6 ); + VSHF_B2_UB( src3, src7, src4, src8, + ( mask + 1 ), ( mask + 1 ), src7, src8 ); + DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec, + coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, + res_hz4 ); + MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, + coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, + res_vt3 ); + + res_vt0 += ( res_hz5 * coeff_vt_vec1 ); + res_vt1 += ( res_hz1 * coeff_vt_vec1 ); + res_vt2 += ( res_hz2 * coeff_vt_vec1 ); + res_vt3 += ( res_hz3 * coeff_vt_vec1 ); + + SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 ); + SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 ); + PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 ); + ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride ); + p_dst_v += ( 4 * i_dst_stride ); + res_hz5 = res_hz4; + } +} + +static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_log2_denom, int32_t i_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v16u8 zero = { 0 }; + v16u8 src0, src1; + v4i32 dst0, dst1; + v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1; + v8i16 vec0, vec1; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + u_load0 = LW( p_src ); + p_src += i_src_stride; + u_load1 = LW( p_src ); + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + + ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 ); + MUL2( wgt, temp0, wgt, temp1, temp0, temp1 ); + ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 ); + MAXI_SH2_SH( vec0, vec1, 0 ); + + tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom ); + tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom ); + + SAT_UH2_UH( tp0, tp1, 7 ); + PCKEV_B2_SW( tp0, tp0, tp1, 
tp1, dst0, dst1 ); + + u_out0 = __msa_copy_u_w( dst0, 0 ); + u_out1 = __msa_copy_u_w( dst1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 wgt, denom, offset; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src += 4 * i_src_stride; + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_log2_denom, i_weight, i_offset_in ); + } + else + { + avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_weight, i_offset_in ); + } +} + +static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 zero = { 0 }; + v16u8 src0, src1, src2, src3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 wgt, denom, offset; + v16i8 out0, out1; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void 
avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_log2_denom, + int32_t i_weight, int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16i8 zero = { 0 }; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8u16 wgt, denom, offset; + + i_offset_in <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_in += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + + ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp2, temp4, temp6 ); + ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3, + temp1, temp3, temp5, temp7 ); + MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3, + temp0, temp1, temp2, temp3 ); + MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7, + temp4, temp5, temp6, temp7 ); + ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset, + temp0, temp1, temp2, temp3 ); + ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset, + temp4, temp5, temp6, temp7 ); + MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 ); + MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 ); + SRL_H4_UH( temp0, temp1, temp2, temp3, denom ); + SRL_H4_UH( temp4, temp5, temp6, temp7, denom ); + SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 ); + SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 ); + PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6, + dst0, dst1, dst2, dst3 ); + + ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v8i16 src1_wgt, src2_wgt; + v16u8 in0, in1, in2, in3; + v8i16 temp0, temp1, temp2, temp3; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + u_load0 = LW( p_src1_in ); + u_load1 = LW( p_src1_in + i_src1_stride ); + in0 = ( v16u8 ) __msa_fill_w( u_load0 ); + in1 = ( v16u8 ) __msa_fill_w( u_load1 ); + u_load0 = LW( p_src2_in ); + u_load1 = LW( p_src2_in + i_src2_stride ); + in2 = ( v16u8 ) __msa_fill_w( u_load0 ); + in3 = ( v16u8 ) __msa_fill_w( u_load1 ); + ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3, + temp0, temp1, temp2, temp3 ); + temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt ); + temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt ); + SRAR_H2_SH( temp0, temp1, denom ); + CLIP_SH2_0_255( temp0, temp1 ); + PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 ); + u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t 
u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src1_in += ( 4 * i_src1_stride ); + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src2_in += ( 4 * i_src2_stride ); + src4 = ( v16u8 ) __msa_fill_w( u_load0 ); + src5 = ( v16u8 ) __msa_fill_w( u_load1 ); + src6 = ( v16u8 ) __msa_fill_w( u_load2 ); + src7 = ( v16u8 ) __msa_fill_w( u_load3 ); + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, + temp4, temp5, temp6, temp7 ); + temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt ); + temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt ); + temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt ); + temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt ); + SRAR_H4_SH( temp0, temp1, temp2, temp3, denom ); + CLIP_SH4_0_255( temp0, temp1, temp2, temp3 ); + PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_log2_denom, i_src1_weight, + i_src2_weight, i_offset_in ); + } + else + { + avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_src1_weight, i_src2_weight, + i_offset_in ); + } +} + +static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3; + v8i16 res0, res1, res2, res3; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); + p_src2_in += ( 4 * i_src2_stride ); + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, + temp0, temp1, temp2, temp3 ); + ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3, + res0, res1, res2, res3 ); + res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); + res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); + res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); + res3 = ( temp3 
* src1_wgt ) + ( res3 * src2_wgt ); + SRAR_H4_SH( res0, res1, res2, res3, denom ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3, + dst0, dst1, dst2, dst3 ); + ST8x1_UB( dst0, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst1, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst2, p_dst ); + p_dst += i_dst_stride; + ST8x1_UB( dst3, p_dst ); + p_dst += i_dst_stride; + } +} + +static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v8i16 src1_wgt, src2_wgt; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 zero = { 0 }; + v8i16 denom = __msa_ldi_h( i_log2_denom + 1 ); + + src1_wgt = __msa_fill_h( i_src1_weight ); + src2_wgt = __msa_fill_h( i_src2_weight ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 ); + p_src2_in += ( 4 * i_src2_stride ); + ILVRL_B2_SH( zero, src0, temp1, temp0 ); + ILVRL_B2_SH( zero, src1, temp3, temp2 ); + ILVRL_B2_SH( zero, src2, temp5, temp4 ); + ILVRL_B2_SH( zero, src3, temp7, temp6 ); + ILVRL_B2_SH( zero, dst0, res1, res0 ); + ILVRL_B2_SH( zero, dst1, res3, res2 ); + ILVRL_B2_SH( zero, dst2, res5, res4 ); + ILVRL_B2_SH( zero, dst3, res7, res6 ); + res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt ); + res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt ); + res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt ); + res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt ); + res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt ); + res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt ); + res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt ); + res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt ); + SRAR_H4_SH( res0, res1, res2, res3, denom ); + SRAR_H4_SH( res4, res5, res6, res7, denom ); + CLIP_SH4_0_255( res0, res1, res2, res3 ); + CLIP_SH4_0_255( res4, res5, res6, res7 ); + PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7, + dst0, dst1, dst2, dst3 ); + ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint32_t u_load0, u_load1, u_out0, u_out1; + v16u8 src1_wgt, src2_wgt, wgt; + v16i8 in0, in1, in2, in3; + v8u16 temp0, temp1, denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + u_load0 = LW( p_src1_in ); + u_load1 = LW( p_src1_in + i_src1_stride ); + in0 = ( v16i8 ) __msa_fill_w( u_load0 ); + in1 = ( v16i8 ) __msa_fill_w( u_load1 ); + + u_load0 = LW( p_src2_in ); + u_load1 = LW( p_src2_in + i_src2_stride ); + in2 = ( v16i8 ) __msa_fill_w( u_load0 ); + 
in3 = ( v16i8 ) __msa_fill_w( u_load1 ); + + ILVR_B2_SB( in2, in0, in3, in1, in0, in1 ); + + temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 ); + temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 ); + temp0 >>= denom; + temp1 >>= denom; + MAXI_SH2_UH( temp0, temp1, 0 ); + SAT_UH2_UH( temp0, temp1, 7 ); + PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 ); + + u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); +} + +static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + uint32_t u_load0, u_load1, u_load2, u_load3; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3; + v8u16 res0, res1, res2, res3; + v8u16 denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src1_in += ( 4 * i_src1_stride ); + + src0 = ( v16u8 ) __msa_fill_w( u_load0 ); + src1 = ( v16u8 ) __msa_fill_w( u_load1 ); + src2 = ( v16u8 ) __msa_fill_w( u_load2 ); + src3 = ( v16u8 ) __msa_fill_w( u_load3 ); + + LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 ); + p_src2_in += ( 4 * i_src2_stride ); + + src4 = ( v16u8 ) __msa_fill_w( u_load0 ); + src5 = ( v16u8 ) __msa_fill_w( u_load1 ); + src6 = ( v16u8 ) __msa_fill_w( u_load2 ); + src7 = ( v16u8 ) __msa_fill_w( u_load3 ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp1, temp2, temp3 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + SRA_4V( res0, res1, res2, res3, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + if( 2 == i_height ) + { + avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_log2_denom, i_src1_weight, + i_src2_weight, i_offset_in ); + } + else + { + avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride, + p_src2_in, i_src2_stride, + p_dst, i_dst_stride, + i_height, i_log2_denom, + i_src1_weight, + i_src2_weight, i_offset_in ); + } +} + + +static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, 
+ int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3; + v8u16 res0, res1, res2, res3; + v8u16 denom, offset; + v16i8 out0, out1; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + + LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); + p_src2_in += ( 4 * i_src2_stride ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp1, temp2, temp3 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + SRA_4V( res0, res1, res2, res3, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 ); + ST8x4_UB( out0, out1, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + +static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in, + int32_t i_src1_stride, + uint8_t *p_src2_in, + int32_t i_src2_stride, + uint8_t *p_dst, + int32_t i_dst_stride, + int32_t i_height, + int32_t i_log2_denom, + int32_t i_src1_weight, + int32_t i_src2_weight, + int32_t i_offset_in ) +{ + uint8_t u_cnt; + v16u8 src1_wgt, src2_wgt, wgt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8u16 res0, res1, res2, res3, res4, res5, res6, res7; + v8u16 denom, offset; + + i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom; + + src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight ); + src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight ); + offset = ( v8u16 ) __msa_fill_h( i_offset_in ); + denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 ); + + wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt ); + + for( u_cnt = i_height / 4; u_cnt--; ) + { + LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 ); + p_src1_in += ( 4 * i_src1_stride ); + + LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 ); + p_src2_in += ( 4 * i_src2_stride ); + + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp0, temp2, temp4, temp6 ); + ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + temp1, temp3, temp5, temp7 ); + DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt, + res0, res1, res2, res3 ); + ADD4( res0, offset, res1, offset, res2, offset, res3, offset, + res0, res1, res2, res3 ); + DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt, + res4, res5, res6, res7 ); + ADD4( res4, offset, res5, offset, res6, offset, res7, offset, + res4, res5, res6, res7 ); + SRA_4V( res0, res1, res2, res3, denom ); + SRA_4V( res4, res5, res6, res7, denom ); + MAXI_SH4_UH( res0, res1, res2, res3, 0 ); + MAXI_SH4_UH( res4, res5, res6, res7, 0 ); + SAT_UH4_UH( res0, res1, res2, res3, 7 ); + SAT_UH4_UH( res4, res5, res6, res7, 7 ); + PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6, + temp0, temp1, temp2, temp3 ); + ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride ); + p_dst += 4 * i_dst_stride; + } +} + 
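+/* Descriptive note on the plain block-copy helpers that follow (wording added
+ * for readability, not upstream): copy_width4_msa moves two rows per
+ * iteration; copy_width8_msa and copy_width16_msa pick the largest of
+ * 12/8/4(/2) rows that divides i_height; copy_16multx8mult_msa copies any
+ * multiple-of-16 width eight rows at a time. */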
+static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint32_t u_src0, u_src1; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + u_src0 = LW( p_src ); + p_src += i_src_stride; + u_src1 = LW( p_src ); + p_src += i_src_stride; + + SW( u_src0, p_dst ); + p_dst += i_dst_stride; + SW( u_src1, p_dst ); + p_dst += i_dst_stride; + } +} + +static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if( 0 == i_height % 12 ) + { + for( i_cnt = ( i_height / 12 ); i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); + u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); + u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); + u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 8 ) + { + for( i_cnt = i_height >> 3; i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 ); + u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 ); + u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 ); + u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 4 ) + { + for( i_cnt = ( i_height / 4 ); i_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 ); + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 2 ) + { + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 ); + + SD( u_out0, p_dst ); + p_dst += i_dst_stride; + SD( u_out1, p_dst ); + p_dst += i_dst_stride; + } + } +} + + +static void copy_16multx8mult_msa( uint8_t *p_src, 
int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height, int32_t i_width ) +{ + int32_t i_cnt, i_loop_cnt; + uint8_t *p_src_tmp, *p_dst_tmp; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + for( i_cnt = ( i_width >> 4 ); i_cnt--; ) + { + p_src_tmp = p_src; + p_dst_tmp = p_dst; + + for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; ) + { + LD_UB8( p_src_tmp, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src_tmp += ( 8 * i_src_stride ); + + ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, + p_dst_tmp, i_dst_stride ); + p_dst_tmp += ( 8 * i_dst_stride ); + } + + p_src += 16; + p_dst += 16; + } +} + +static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + + if( 0 == i_height % 12 ) + { + for( i_cnt = ( i_height / 12 ); i_cnt--; ) + { + LD_UB8( p_src, i_src_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src += ( 8 * i_src_stride ); + ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7, + p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } + else if( 0 == i_height % 8 ) + { + copy_16multx8mult_msa( p_src, i_src_stride, + p_dst, i_dst_stride, i_height, 16 ); + } + else if( 0 == i_height % 4 ) + { + for( i_cnt = ( i_height >> 2 ); i_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + } +} + +static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint32_t u_out0, u_out1; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + LD_UB2( p_src1, i_src1_stride, src0, src1 ); + p_src1 += ( 2 * i_src1_stride ); + LD_UB2( p_src2, i_src2_stride, src2, src3 ); + p_src2 += ( 2 * i_src2_stride ); + + AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 ); + + u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 ); + u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 ); + SW( u_out0, p_dst ); + p_dst += i_dst_stride; + SW( u_out1, p_dst ); + p_dst += i_dst_stride; + } +} + +static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + uint64_t u_out0, u_out1, u_out2, u_out3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3; + + for( i_cnt = ( i_height / 4 ); i_cnt--; ) + { + LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 ); + p_src1 += ( 4 * i_src1_stride ); + LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 ); + p_src2 += ( 4 * i_src2_stride ); + + AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7, + dst0, dst1, dst2, dst3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 ); + u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 ); + u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 ); + u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void avg_src_width16_msa( uint8_t *p_src1, int32_t 
i_src1_stride, + uint8_t *p_src2, int32_t i_src2_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_cnt; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + for( i_cnt = ( i_height / 8 ); i_cnt--; ) + { + LD_UB8( p_src1, i_src1_stride, + src0, src1, src2, src3, src4, src5, src6, src7 ); + p_src1 += ( 8 * i_src1_stride ); + LD_UB8( p_src2, i_src2_stride, + dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); + p_src2 += ( 8 * i_src2_stride ); + + AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3, + dst0, dst1, dst2, dst3 ); + AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7, + dst4, dst5, dst6, dst7 ); + + ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, + p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + } +} + +static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride, + int32_t i_height ) +{ + int8_t i_cnt; + v16u8 zero = { 0 }; + + for( i_cnt = ( i_height / 2 ); i_cnt--; ) + { + ST_UB( zero, p_src ); + p_src += i_stride; + ST_UB( zero, p_src ); + p_src += i_stride; + } +} + +static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride, + uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3; + v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3; + + i_w_mul8 = i_width - i_width % 8; + i_h4w = i_height - i_height % 4; + + for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; ) + { + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 ); + LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); + ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 ); + ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, + p_dst, i_dst_stride ); + ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3, + ( p_dst + 16 ), i_dst_stride ); + p_src0 += 16; + p_src1 += 16; + p_dst += 32; + } + + for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) + { + LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 ); + LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 ); + ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3, + vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 ); + ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3, + p_dst, i_dst_stride ); + p_src0 += 8; + p_src1 += 8; + p_dst += 16; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst[0] = p_src0[0]; + p_dst[1] = p_src1[0]; + p_dst[i_dst_stride] = p_src0[i_src0_stride]; + p_dst[i_dst_stride + 1] = p_src1[i_src1_stride]; + p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride]; + p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride]; + p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride]; + p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride]; + p_src0 += 1; + p_src1 += 1; + p_dst += 2; + } + + p_src0 += ( ( 4 * i_src0_stride ) - i_width ); + p_src1 += ( ( 4 * i_src1_stride ) - i_width ); + p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) ); + } + + for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) + { + for( i_loop_width = ( i_width >> 4 ); 
i_loop_width--; ) + { + src0 = LD_UB( p_src0 ); + src4 = LD_UB( p_src1 ); + ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 ); + ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 ); + p_src0 += 16; + p_src1 += 16; + p_dst += 32; + } + + for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; ) + { + src0 = LD_UB( p_src0 ); + src4 = LD_UB( p_src1 ); + vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4, + ( v16i8 ) src0 ); + ST_UB( vec_ilv_r0, p_dst ); + p_src0 += 8; + p_src1 += 8; + p_dst += 16; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst[0] = p_src0[0]; + p_dst[1] = p_src1[0]; + p_src0 += 1; + p_src1 += 1; + p_dst += 2; + } + + p_src0 += ( i_src0_stride - i_width ); + p_src1 += ( i_src1_stride - i_width ); + p_dst += ( i_dst_stride - ( i_width * 2 ) ); + } +} + +static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst0, int32_t dst0_stride, + uint8_t *p_dst1, int32_t dst1_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w; + uint32_t u_res_w0, u_res_w1; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3; + v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3; + uint8_t *p_dst; + + i_w_mul8 = i_width - i_width % 8; + i_w_mul4 = i_width - i_width % 4; + i_h4w = i_height - i_height % 8; + + for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; ) + { + for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) + { + LD_UB8( p_src, i_src_stride, + in0, in1, in2, in3, in4, in5, in6, in7 ); + p_src += 16; + PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); + PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); + ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride ); + p_dst = p_dst0 + 4 * dst0_stride; + ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride ); + ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride ); + p_dst = p_dst1 + 4 * dst1_stride; + ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride ); + p_dst0 += 8; + p_dst1 += 8; + } + + for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) + { + LD_UB8( p_src, i_src_stride, + in0, in1, in2, in3, in4, in5, in6, in7 ); + p_src += 8; + PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 ); + PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6, + vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 ); + ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride ); + p_dst = p_dst0 + 4 * dst0_stride; + ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride ); + ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride ); + p_dst = p_dst1 + 4 * dst1_stride; + ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride ); + p_dst0 += 4; + p_dst1 += 4; + } + + for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0[0] = p_src[0]; + p_dst1[0] = p_src[1]; + p_dst0[dst0_stride] = p_src[i_src_stride]; + p_dst1[dst1_stride] = p_src[i_src_stride + 1]; + p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride]; + p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1]; + p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride]; + p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1]; + p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride]; + p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1]; + p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride]; + p_dst1[5 * 
dst1_stride] = p_src[5 * i_src_stride + 1]; + p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride]; + p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1]; + p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride]; + p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1]; + p_dst0 += 1; + p_dst1 += 1; + p_src += 2; + } + + p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) ); + p_dst0 += ( ( 8 * dst0_stride ) - i_width ); + p_dst1 += ( ( 8 * dst1_stride ) - i_width ); + } + + for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ ) + { + for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; ) + { + in0 = LD_UB( p_src ); + p_src += 16; + vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + ST8x1_UB( vec_pckev0, p_dst0 ); + ST8x1_UB( vec_pckod0, p_dst1 ); + p_dst0 += 8; + p_dst1 += 8; + } + + for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; ) + { + in0 = LD_UB( p_src ); + p_src += 8; + vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0, + ( v16i8 ) in0 ); + u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 ); + SW( u_res_w0, p_dst0 ); + u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 ); + SW( u_res_w1, p_dst1 ); + p_dst0 += 4; + p_dst1 += 4; + } + + for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0[0] = p_src[0]; + p_dst1[0] = p_src[1]; + p_dst0 += 1; + p_dst1 += 1; + p_src += 2; + } + + p_src += ( ( i_src_stride ) - ( i_width << 1 ) ); + p_dst0 += ( ( dst0_stride ) - i_width ); + p_dst1 += ( ( dst1_stride ) - i_width ); + } +} + + +static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst0, + int32_t i_dst0_stride, + uint8_t *p_dst1, + int32_t i_dst1_stride, + uint8_t *p_dst2, + int32_t i_dst2_stride, + int32_t i_width, + int32_t i_height ) +{ + uint8_t *p_src_orig = p_src; + uint8_t *p_dst0_orig = p_dst0; + uint8_t *p_dst1_orig = p_dst1; + uint8_t *p_dst2_orig = p_dst2; + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; + v16i8 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 temp0, temp1, temp2, temp3; + v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 }; + v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 }; + + i_w_mul8 = i_width - i_width % 8; + i_h_mul4 = i_height - i_height % 4; + + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); + + VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 ); + ST8x1_UB( temp0, p_dst0 ); + ST8x1_UB( temp1, p_dst0 + i_dst0_stride ); + ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride ); + ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride ); + + VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 ); + ST8x1_UB( temp0, p_dst1 ); + ST8x1_UB( temp1, p_dst1 + i_dst1_stride ); + ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride ); + ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride ); + + VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 ); + VSHF_B2_SB( in2, in6, in3, in7, 
mask2, mask2, temp2, temp3 ); + ST8x1_UB( temp0, p_dst2 ); + ST8x1_UB( temp1, p_dst2 + i_dst2_stride ); + ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride ); + ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride ); + + p_src += 8 * 3; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width]; + p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + i_dst0_stride] = + p_src_orig[0 + i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + i_dst1_stride] = + p_src_orig[1 + i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + i_dst2_stride] = + p_src_orig[2 + i_src_stride + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + 2 * i_dst0_stride] = + p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + 2 * i_dst1_stride] = + p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + 2 * i_dst2_stride] = + p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width]; + + p_dst0_orig[i_loop_width + 3 * i_dst0_stride] = + p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width]; + p_dst1_orig[i_loop_width + 3 * i_dst1_stride] = + p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width]; + p_dst2_orig[i_loop_width + 3 * i_dst2_stride] = + p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width]; + } + + p_src_orig += ( 4 * i_src_stride ); + p_dst0_orig += ( 4 * i_dst0_stride ); + p_dst1_orig += ( 4 * i_dst1_stride ); + p_dst2_orig += ( 4 * i_dst2_stride ); + } + + for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 3 ); i_loop_width--; ) + { + in0 = LD_SB( p_src ); + in4 = LD_SB( p_src + 16 ); + temp0 = __msa_vshf_b( mask0, in4, in0 ); + ST8x1_UB( temp0, p_dst0 ); + temp0 = __msa_vshf_b( mask1, in4, in0 ); + ST8x1_UB( temp0, p_dst1 ); + temp0 = __msa_vshf_b( mask2, in4, in0 ); + ST8x1_UB( temp0, p_dst2 ); + + p_src += 8 * 3; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2]; + } + + p_src_orig += ( i_src_stride ); + p_dst0_orig += ( i_dst0_stride ); + p_dst1_orig += ( i_dst1_stride ); + p_dst2_orig += ( i_dst2_stride ); + } +} + +static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src, + int32_t i_src_stride, + uint8_t *p_dst0, + int32_t i_dst0_stride, + uint8_t *p_dst1, + int32_t i_dst1_stride, + uint8_t *p_dst2, + int32_t i_dst2_stride, + int32_t i_width, + int32_t i_height ) +{ + uint8_t *p_src_orig = p_src; + uint8_t *p_dst0_orig = p_dst0; + uint8_t *p_dst1_orig = p_dst1; + uint8_t *p_dst2_orig = p_dst2; + int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4; + v16i8 in0, in1, in2, in3, in4, in5, in6, in7; + v16i8 in8, in9, in10, in11, in12, in13, in14, in15; + v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; + + i_w_mul8 = i_width - i_width % 8; + i_h_mul4 = i_height - i_height % 4; + + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = 
p_dst2_orig; + + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 ); + LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 ); + LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 ); + + PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); + PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 ); + temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); + temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 ); + PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 ); + temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); + temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 ); + PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 ); + temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); + temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); + PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 ); + in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 ); + PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 ); + in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 ); + PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 ); + in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 ); + ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride ); + ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride ); + ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride ); + + p_src += 16 * 4; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + } + + for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) + { + LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 ); + LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 ); + + PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 ); + + PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 ); + temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 ); + temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 ); + + PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); + PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 ); + in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 ); + PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 ); + in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 ); + PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 ); + in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 ); + + ST8x1_UB( in0, p_dst0 ); + ST8x1_UB( in4, p_dst0 + i_dst0_stride ); + ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride ); + ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride ); + + ST8x1_UB( in1, p_dst2 ); + ST8x1_UB( in5, p_dst2 + i_dst2_stride ); + ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride ); + ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride ); + + ST8x1_UB( in2, p_dst1 ); + ST8x1_UB( in6, p_dst1 + i_dst1_stride ); + ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride ); + ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride ); + + p_src += 8 * 4; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; + + 
p_dst0_orig[i_dst0_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width]; + p_dst1_orig[i_dst1_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[i_dst2_stride + i_loop_width] = + p_src_orig[i_src_stride + 4 * i_loop_width + 2]; + + p_dst0_orig[2 * i_dst0_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width]; + p_dst1_orig[2 * i_dst1_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[2 * i_dst2_stride + i_loop_width] = + p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2]; + + p_dst0_orig[3 * i_dst0_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width]; + p_dst1_orig[3 * i_dst1_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1]; + p_dst2_orig[3 * i_dst2_stride + i_loop_width] = + p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2]; + } + + p_src_orig += ( 4 * i_src_stride ); + p_dst0_orig += ( 4 * i_dst0_stride ); + p_dst1_orig += ( 4 * i_dst1_stride ); + p_dst2_orig += ( 4 * i_dst2_stride ); + } + + for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ ) + { + p_src = p_src_orig; + p_dst0 = p_dst0_orig; + p_dst1 = p_dst1_orig; + p_dst2 = p_dst2_orig; + + for( i_loop_width = ( i_width >> 4 ); i_loop_width--; ) + { + LD_SB4( p_src, 16, in0, in4, in8, in12 ); + + PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 ); + PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 ); + ST_SB( in0, p_dst0 ); + ST_SB( in0, p_dst0 ); + ST_SB( in1, p_dst2 ); + ST_SB( in1, p_dst2 ); + ST_SB( in2, p_dst1 ); + ST_SB( in2, p_dst1 ); + + p_src += 16 * 4; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + } + + for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; ) + { + in0 = LD_SB( p_src ); + in4 = LD_SB( p_src + 16 ); + + temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 ); + PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 ); + in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 ); + ST8x1_UB( in0, p_dst0 ); + ST8x1_UB( in1, p_dst2 ); + ST8x1_UB( in2, p_dst1 ); + + p_src += 8 * 4; + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + } + + for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ ) + { + p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width]; + p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1]; + p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2]; + } + + p_src_orig += ( i_src_stride ); + p_dst0_orig += ( i_dst0_stride ); + p_dst1_orig += ( i_dst1_stride ); + p_dst2_orig += ( i_dst2_stride ); + } +} + +static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride, + uint8_t *p_src1, int32_t i_src1_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + int32_t i_loop_height, i_h4w; + v16u8 in0, in1, in2, in3, in4, in5, in6, in7; + v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3; + + i_h4w = i_height % 4; + for( i_loop_height = ( i_height >> 2 ); i_loop_height--; ) + { + LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 ); + p_src0 += ( 4 * i_src0_stride ); + LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 ); + p_src1 += ( 4 * i_src1_stride ); + ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3, + ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 ); + ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3, + p_dst, 
i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } + + for( i_loop_height = i_h4w; i_loop_height--; ) + { + in0 = LD_UB( p_src0 ); + p_src0 += ( i_src0_stride ); + in1 = LD_UB( p_src1 ); + p_src1 += ( i_src1_stride ); + ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 ); + ST_UB( ilvr_vec0, p_dst ); + p_dst += ( i_dst_stride ); + } +} + +static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst0, int32_t dst0_stride, + uint8_t *p_dst1, int32_t dst1_stride, + uint8_t *p_dst2, int32_t dst2_stride, + uint8_t *p_dst3, int32_t dst3_stride, + int32_t i_width, int32_t i_height ) +{ + int32_t i_loop_width, i_loop_height, i_w16_mul; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; + v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5; + v16u8 pckev_vec0, pckev_vec1, pckev_vec2; + v16u8 pckod_vec0, pckod_vec1, pckod_vec2; + v16u8 tmp0, tmp1, tmp2, tmp3; + v16u8 res0, res1; + + i_w16_mul = i_width - i_width % 16; + for( i_loop_height = i_height; i_loop_height--; ) + { + LD_UB3( p_src, i_src_stride, src0, src1, src2 ); + p_src += 16; + for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ ) + { + LD_UB3( p_src, i_src_stride, src3, src4, src5 ); + p_src += 16; + LD_UB3( p_src, i_src_stride, src6, src7, src8 ); + p_src += 16; + PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); + PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); + pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST_UB( res0, p_dst0 ); + ST_UB( res1, p_dst2 ); + + SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); + SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 ); + SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 ); + PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, + pckev_vec0, pckev_vec1 ) + pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, + ( v16i8 ) sld1_vec2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST_UB( res0, p_dst1 ); + ST_UB( res1, p_dst3 ); + + src0 = src6; + src1 = src7; + src2 = src8; + p_dst0 += 16; + p_dst1 += 16; + p_dst2 += 16; + p_dst3 += 16; + } + + for( i_loop_width = i_w16_mul; i_loop_width < i_width; + i_loop_width += 8 ) + { + LD_UB3( p_src, i_src_stride, src3, src4, src5 ); + p_src += 16; + PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 ); + PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 ); + pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5, + ( v16i8 ) src2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST8x1_UB( res0, p_dst0 ); + ST8x1_UB( res1, p_dst2 ); + + SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 ); + SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 ); + SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 ); + PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1, + pckev_vec0, 
pckev_vec1 ) + pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5, + ( v16i8 ) sld1_vec2 ); + AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0, + pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1, + tmp0, tmp1, tmp2, tmp3 ); + AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 ); + ST8x1_UB( res0, p_dst1 ); + ST8x1_UB( res1, p_dst3 ); + p_dst0 += 8; + p_dst1 += 8; + p_dst2 += 8; + p_dst3 += 8; + } + + p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) ); + p_dst0 += ( dst0_stride - i_width ); + p_dst1 += ( dst1_stride - i_width ); + p_dst2 += ( dst2_stride - i_width ); + p_dst3 += ( dst3_stride - i_width ); + } +} + +void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ) +{ + copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ) +{ + copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ) +{ + copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height ); +} + +void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, + 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + 
else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 4 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 16 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 16, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 8 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 8, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 4 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 4, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + +void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ) +{ + if( 32 == i_weight ) + { + avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride, + p_pix1, pix1_stride, 2 ); + } + else if( i_weight < 0 || i_weight > 63 ) + { + avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, pix1_stride, 5, i_weight, + ( 64 - i_weight ), 0 ); + } + else + { + avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride, + p_pix3, pix3_stride, + p_pix1, 
pix1_stride, 5, i_weight, + ( 64 - i_weight ), 0 ); + } +} + + +void x264_memzero_aligned_msa( void *p_dst, size_t n ) +{ + uint32_t u_tot32_mul_lines = n >> 5; + uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 ); + + memset_zero_16width_msa( p_dst, 16, ( n / 16 ) ); + + if( u_remaining ) + { + memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining ); + } +} + +void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + int32_t i_log2_denom = pWeight->i_denom; + int32_t i_offset = pWeight->i_offset; + int32_t i_weight = pWeight->i_scale; + + avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride, + i_height, i_log2_denom, i_weight, i_offset ); +} + +void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ) +{ + x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride, + pWeight, i_height ); + x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride, + pWeight, i_height ); +} + +void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx&3 ) ); + + if( 16 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + else if( 8 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + else if( 4 == i_width ) + { + avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, i_dst_stride, i_height ); + } + + if( pWeight->weightfn ) + { + if( 16 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, i_dst_stride, + p_dst, i_dst_stride, + pWeight, i_height ); + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, + pWeight, i_height ); + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride, + pWeight, i_height ); + } + } + } + else if( pWeight->weightfn ) + { + if( 16 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + else if( 
8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride, + pWeight, i_height ); + } + } + else + { + if( 16 == i_width ) + { + copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + else if( 8 == i_width ) + { + copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + else if( 4 == i_width ) + { + copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride, + i_height ); + } + } +} + +void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ) +{ + int32_t i_d8x = m_vx & 0x07; + int32_t i_d8y = m_vy & 0x07; + int32_t i_coeff_horiz1 = ( 8 - i_d8x ); + int32_t i_coeff_vert1 = ( 8 - i_d8y ); + int32_t i_coeff_horiz0 = i_d8x; + int32_t i_coeff_vert0 = i_d8y; + + p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2; + + if( 2 == i_width ) + { + avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } + else if( 4 == i_width ) + { + avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } + else if( 8 == i_width ) + { + avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride, + p_dst_u, p_dst_v, i_dst_stride, + i_coeff_horiz0, i_coeff_horiz1, + i_coeff_vert0, i_coeff_vert1, + i_height ); + } +} + +void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, + uint8_t *p_dstc, uint8_t *p_src, + intptr_t i_stride, int32_t i_width, + int32_t i_height, int16_t *p_buf ) +{ + for( int32_t i = 0; i < ( i_width / 16 ); i++ ) + { + avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride, + p_dst_v - 2, i_stride, i_height ); + avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ) , i_stride, + p_dstc, i_stride, i_height ); + avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height ); + + p_src += 16; + p_dst_v += 16; + p_dsth += 16; + p_dstc += 16; + } +} + +void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, intptr_t i_src_stride0, + uint8_t *p_src1, intptr_t i_src_stride1, + int32_t i_width, int32_t i_height ) +{ + plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1, + p_dst, i_dst_stride, i_width, i_height ); +} + +void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, + uint8_t *p_dst1, intptr_t i_dst_stride1, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_width, int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, i_width, i_height ); +} + +void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, + intptr_t i_dst_stride0, + uint8_t *p_dst1, + intptr_t i_dst_stride1, + uint8_t *p_dst2, + intptr_t i_dst_stride2, + uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_src_width, + int32_t i_width, + int32_t i_height ) +{ + if( 3 == i_src_width ) + { + plane_copy_deinterleave_rgb_msa( p_src, i_src_stride, + p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, + p_dst2, i_dst_stride2, + i_width, i_height ); + } + else if( 4 == i_src_width ) + { + plane_copy_deinterleave_rgba_msa( p_src, i_src_stride, + p_dst0, i_dst_stride0, + p_dst1, i_dst_stride1, + p_dst2, 
i_dst_stride2, + i_width, i_height ); + } +} + +void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, uint8_t *p_src1, + int32_t i_height ) +{ + store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE, + p_dst, i_dst_stride, i_height ); +} + +void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE, + ( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE, + 8, i_height ); +} + +void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ) +{ + plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE, + ( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE, + 8, i_height ); +} + +void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, + uint8_t *p_dst1, uint8_t *p_dst2, + uint8_t *p_dst3, intptr_t i_src_stride, + intptr_t i_dst_stride, int32_t i_width, + int32_t i_height ) +{ + frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride, + p_dst1, i_dst_stride, p_dst2, i_dst_stride, + p_dst3, i_dst_stride, i_width, i_height ); +} + +uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx, i_cnt, i_h4w; + int32_t i_offset; + uint8_t *p_src1, *src1_org; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + i_h4w = i_height - i_height%4; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx & 3 ) ); + + if( 16 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + avg_src_width4_msa( p_src1 + 16, i_src_stride, + p_src2 + 16, i_src_stride, + p_dst + 16, *p_dst_stride, i_h4w ); + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16u8 src_vec1, src_vec2, src_vec3, src_vec4; + v16u8 dst_vec0, dst_vec1; + uint32_t temp0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 ); + src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 ); + + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 ); + + ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 12 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + avg_src_width4_msa( p_src1 + 8, i_src_stride, + p_src2 + 8, i_src_stride, + p_dst + 8, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + 
uint32_t temp0; + uint64_t dst0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 ); + + SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 ); + } + } + else if( 8 == i_width ) + { + avg_src_width8_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint64_t dst0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + + dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 ); + + SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + avg_src_width4_msa( p_src1, i_src_stride, + p_src2, i_src_stride, + p_dst, *p_dst_stride, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16u8 src_vec1, src_vec2; + v16u8 dst_vec0; + + src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride ); + src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride ); + + dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 ); + temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 ); + + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + + if( pWeight->weightfn ) + { + int32_t i_log2_denom; + int32_t i_offset_val; + int32_t i_weight; + + i_log2_denom = pWeight->i_denom; + i_offset_val = pWeight->i_offset; + i_weight = pWeight->i_scale; + + if( 16 == i_width || 12 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt, offset_val0; + v8i16 denom; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = + ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + ( v8i16 ) offset_val0 ); + temp_vec1 = + ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + ( v8i16 ) offset_val0 ); + + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + x264_mc_weight_w20_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + 
i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, + denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ ) + { + uint64_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val = i_offset_val << i_log2_denom; + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = + ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, *p_dst_stride, + p_dst, *p_dst_stride, + pWeight, i_h4w ); + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + temp0 = LW( p_dst + 
i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, + ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = + ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, + denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + } + + return p_dst; + } + else if( pWeight->weightfn ) + { + int32_t i_offset_val, i_log2_denom, i_weight; + + i_log2_denom = pWeight->i_denom; + i_offset_val = pWeight->i_offset; + i_weight = pWeight->i_scale; + + i_h4w = i_height - i_height%4; + + src1_org = p_src1; + + if( 16 == i_width || 12 == i_width ) + { + x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + p_src1 += i_src_stride; + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 20 == i_width ) + { + x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0, temp_vec1; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + temp0 = LW( p_src1 + 16 ); + p_src1 += i_src_stride; + + temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + + temp_vec0 = wgt * temp_vec0; + temp_vec1 = wgt * temp_vec1; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1, + offset_val0 ); + + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec1 = ( v8u16 ) 
__msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 ); + + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom ); + + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + temp_vec1 = __msa_sat_u_h( temp_vec1, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1, + ( v16i8 ) temp_vec0 ); + ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) ); + + src_vec0 = ( v16u8 ) __msa_fill_w( temp0 ); + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( temp0,p_dst + i_cnt * ( *p_dst_stride ) + 16 ); + } + } + else if( 8 == i_width ) + { + x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height ; i_cnt++ ) + { + uint64_t u_temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val = i_offset_val << i_log2_denom; + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + src_vec0 = LD_UB( p_src1 ); + p_src1 += i_src_stride; + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 ); + SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + else if( 4 == i_width ) + { + x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride, + pWeight, i_h4w ); + p_src1 = src1_org + i_h4w * i_src_stride; + + for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ ) + { + uint32_t u_temp0; + v16i8 zero = {0}; + v16u8 src_vec0; + v16i8 tmp0; + v8u16 temp_vec0; + v8u16 wgt; + v8i16 denom, offset_val0; + + i_offset_val <<= ( i_log2_denom ); + + if( i_log2_denom ) + { + i_offset_val += ( 1 << ( i_log2_denom - 1 ) ); + } + + wgt = ( v8u16 ) __msa_fill_h( i_weight ); + offset_val0 = __msa_fill_h( i_offset_val ); + denom = __msa_fill_h( i_log2_denom ); + + u_temp0 = LW( p_src1 ); + p_src1 += i_src_stride; + + src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 ); + + temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 ); + temp_vec0 = wgt * temp_vec0; + + temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0, + offset_val0 ); + temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 ); + temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom ); + temp_vec0 = __msa_sat_u_h( temp_vec0, 7 ); + + tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0, + ( v16i8 ) temp_vec0 ); + u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 ); + SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) ); + } + } + + return p_dst; + } + else + { + *p_dst_stride = i_src_stride; + return p_src1; + } +} + +void x264_mc_init_mips( int32_t 
cpu, x264_mc_functions_t *pf ) +{ + if( cpu & X264_CPU_MSA ) + { + pf->mc_luma = x264_mc_luma_msa; + pf->mc_chroma = x264_mc_chroma_msa; + pf->get_ref = x264_get_ref_msa; + + pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa; + + pf->weight = x264_mc_weight_wtab_msa; + pf->offsetadd = x264_mc_weight_wtab_msa; + pf->offsetsub = x264_mc_weight_wtab_msa; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_msa; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa; + + pf->store_interleave_chroma = x264_store_interleave_chroma_msa; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa; + + pf->plane_copy_interleave = x264_plane_copy_interleave_msa; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa; + + pf->hpel_filter = x264_hpel_filter_msa; + + pf->memcpy_aligned = memcpy; + pf->memzero_aligned = x264_memzero_aligned_msa; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa; + } +} +#endif
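Note on the weighting arithmetic above: the x264_mc_weight_w* wrappers and the tail loops of x264_get_ref_msa all vectorize the same per-pixel operation with __msa_fill_h, __msa_adds_s_h, __msa_srl_h and __msa_sat_u_h. The scalar sketch below is for reference only and is not part of the patch; the helper names weight_pixel and avg_pixel are hypothetical, scale/offset/denom stand for the i_scale, i_offset and i_denom fields of x264_weight_t, 8-bit samples are assumed (hence the clamp to 255, matching __msa_sat_u_h( ..., 7 )), and the 16-bit saturation details of the vector path are ignored.

#include <stdint.h>

/* Hypothetical scalar model of the vector weighting path: the offset is
 * pre-shifted by the denominator and a rounding term is added, the weighted
 * sample is clamped at zero before the right shift, and the result is
 * saturated to 8 bits. */
static uint8_t weight_pixel( uint8_t pix, int32_t scale, int32_t offset,
                             int32_t denom )
{
    int32_t off = ( offset << denom ) + ( denom ? ( 1 << ( denom - 1 ) ) : 0 );
    int32_t val = pix * scale + off;

    if( val < 0 )
        val = 0;
    val >>= denom;
    return ( val > 255 ) ? 255 : ( uint8_t )val;
}

/* Hypothetical scalar model of __msa_aver_u_b, used by the x264_pixel_avg_*
 * wrappers when i_weight == 32: a rounded average of the two references. */
static uint8_t avg_pixel( uint8_t a, uint8_t b )
{
    return ( uint8_t )( ( a + b + 1 ) >> 1 );
}

With denom equal to zero the rounding term vanishes and weight_pixel reduces to pix * scale + offset clamped to the 0..255 range.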
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc.h
Added
@@ -0,0 +1,31 @@ +/***************************************************************************** + * mc.h: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MC_H +#define X264_MIPS_MC_H + +void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf ); + +#endif
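The pixel-c.c file added below builds its SAD primitives from __msa_asub_u_b (per-byte absolute difference) followed by __msa_hadd_u_h and HADD_UH_U32 (horizontal accumulation). As a reference point only, a scalar equivalent of one block SAD might look like the sketch below; the name sad_block_c and its parameters are illustrative and do not come from the patch.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar counterpart of the sad_*width_msa routines: sum of
 * absolute differences over a width x height block, walking the source and
 * reference buffers with their own strides. */
static uint32_t sad_block_c( const uint8_t *src, int32_t src_stride,
                             const uint8_t *ref, int32_t ref_stride,
                             int32_t width, int32_t height )
{
    uint32_t sad = 0;

    for( int32_t y = 0; y < height; y++ )
    {
        for( int32_t x = 0; x < width; x++ )
            sad += abs( src[x] - ref[x] );
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}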
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel-c.c
Added
@@ -0,0 +1,1491 @@ +/***************************************************************************** + * pixel-c.c: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "pixel.h" +#include "predict.h" + +#if !HIGH_BIT_DEPTH +#define CALC_MSE_B( src, ref, var ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ +} + +#define CALC_MSE_AVG_B( src, ref, var, sub ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ + \ + sub += res_l0_m + res_l1_m; \ +} + +#define VARIANCE_WxH( sse, diff, shift ) \ + ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) ) + +static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); + p_src += ( 4 * i_src_stride ); + LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); + p_ref += ( 4 * i_ref_stride ); + + INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); + INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); + + diff = __msa_asub_u_b( src, ref ); + sad += __msa_hadd_u_h( diff, diff ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( 
HADD_UH_U32( sad ) ); +} + +static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src = { 0 }; + uint32_t src0, src1, src2, src3, load0, load1, load2, load3; + v16u8 ref0 = { 0 }; + v16u8 ref1 = { 0 }; + v16u8 ref2 = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, src0, src1, src2, src3 ); + INSERT_W4_UB( src0, src1, src2, src3, src ); + p_src += ( 4 * i_src_stride ); + + LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref0 ); + p_ref0 += ( 4 * i_ref_stride ); + + LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref1 ); + p_ref1 += ( 4 * i_ref_stride ); + + LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref2 ); + p_ref2 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref0 += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22, + src0, src1, ref0, ref1 ); + sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref1 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); + sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 ); + p_ref2 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 ); + sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_16width_x3d_msa( uint8_t *p_src, int32_t 
i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src, ref; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + + ref = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + + ref = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + ref = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + int32_t i_ht_cnt; + uint32_t src0, src1, src2, src3; + uint32_t ref0, ref1, ref2, ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, src0, src1, src2, src3 ); + INSERT_W4_UB( src0, src1, src2, src3, src ); + p_src += ( 4 * i_src_stride ); + + LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref0 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad0 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref1 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad1 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref2 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad2 += __msa_hadd_u_h( diff, diff ); + + LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 ); + INSERT_W4_UB( ref0, ref1, ref2, ref3, ref ); + p_ref3 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref ); + sad3 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = 
{ 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref0 += ( 4 * i_ref_stride ); + LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 ); + p_ref1 += ( 4 * i_ref_stride ); + LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 ); + p_ref2 += ( 4 * i_ref_stride ); + LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 ); + p_ref3 += ( 4 * i_ref_stride ); + + PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 ); + PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 ); + sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 ); + sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 ); + sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 ); + sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_aref[], int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3; + v16u8 src, ref0, ref1, ref2, ref3, diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + v8u16 sad3 = { 0 }; + + p_ref0 = p_aref[0]; + p_ref1 = p_aref[1]; + p_ref2 = p_aref[2]; + p_ref3 = p_aref[3]; + + for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + ref0 = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + ref1 = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + ref2 = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + ref3 = LD_UB( p_ref3 ); + p_ref3 += i_ref_stride; + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref3 ); + sad3 += __msa_hadd_u_h( diff, diff ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref0 = LD_UB( p_ref0 ); + p_ref0 += i_ref_stride; + ref1 = LD_UB( p_ref1 ); + p_ref1 += i_ref_stride; + ref2 = LD_UB( p_ref2 ); + p_ref2 += i_ref_stride; + ref3 = LD_UB( p_ref3 ); + p_ref3 += i_ref_stride; + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + diff = __msa_asub_u_b( src, ref3 ); + sad3 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); + pu_sad_array[3] = HADD_UH_U32( sad3 ); +} + +static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride, + uint8_t i_height ) +{ + uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; + v16i8 pix, zero = { 0 }; + v8u16 add, pix_r, pix_l; + v4u32 sqr = { 0 }; + + for ( u_cnt = i_height; u_cnt--; ) + { + pix = LD_SB( p_pix ); + p_pix += i_stride; + add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix ); + u_sum += HADD_UH_U32( add ); + ILVRL_B2_UH( zero, pix, pix_r, pix_l ); + sqr = __msa_dpadd_u_w( 
sqr, pix_r, pix_r ); + sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l ); + } + + u_sqr_out = HADD_SW_S32( sqr ); + + return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); +} + +static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride, + uint8_t i_height ) +{ + uint32_t u_sum = 0, u_sqr_out = 0, u_cnt; + v16i8 pix, zero = { 0 }; + v8u16 add, pix_r; + v4u32 sqr = { 0 }; + + for ( u_cnt = i_height; u_cnt--; ) + { + pix = LD_SB( p_pix ); + p_pix += i_stride; + pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix ); + add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r ); + u_sum += HADD_UH_U32( add ); + sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r ); + } + + u_sqr_out = HADD_SW_S32( sqr ); + + return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) ); +} + +static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height, int32_t *p_diff ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v8i16 avg = { 0 }; + v4i32 vec, var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + CALC_MSE_AVG_B( src0, ref0, var, avg ); + CALC_MSE_AVG_B( src1, ref1, var, avg ); + } + + vec = __msa_hadd_s_w( avg, avg ); + *p_diff = HADD_SW_S32( vec ); + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + uint32_t u_src0, u_src1, u_src2, u_src3; + uint32_t u_ref0, u_ref1, u_ref2, u_ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); + p_src += ( 4 * i_src_stride ); + LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); + p_ref += ( 4 * i_ref_stride ); + + INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); + INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); + CALC_MSE_B( src, ref, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref2, ref3; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + CALC_MSE_B( src0, ref0, var ); + CALC_MSE_B( src1, ref1, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_sse; + v16u8 src, ref; + v4i32 var = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( 
p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + + src = LD_UB( p_src ); + p_src += i_src_stride; + ref = LD_UB( p_ref ); + p_ref += i_ref_stride; + CALC_MSE_B( src, ref, var ); + } + + u_sse = HADD_SW_S32( var ); + + return u_sse; +} + +static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride, + const uint8_t *p_ref, int32_t i_ref_stride, + int32_t pi_sum_array[2][4] ) +{ + v16i8 zero = { 0 }; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 temp0, temp1, temp2, temp3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0; + v4i32 tmp2, tmp3; + + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + ILVR_D2_UB( src1, src0, src3, src2, src0, src2 ); + ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 ); + HADD_UB2_UH( src0, src2, temp0, temp1 ); + + temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + + pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 ); + pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 ); + + HADD_UB2_UH( ref0, ref2, temp0, temp1 ); + + temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 ); + + pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 ); + pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 ); + + ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2, + vec4, vec6 ); + ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3, + vec5, vec7 ); + + tmp0 = __msa_dotp_u_w( vec0, vec0 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 ); + + tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); + tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); + + pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 ); + pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 ); + + tmp0 = __msa_dotp_u_w( vec4, vec0 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 ); + tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 ); + + tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 ); + tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 ); + tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 ); + + pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 ); + pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 ); +} + +static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + uint8_t i_height ) +{ + int32_t cnt; + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3; + v16i8 ref0, ref1, ref2, ref3; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3; + v8i16 temp0, temp1, temp2, temp3; + + for ( cnt = i_height >> 2; cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, 
src2, src3 ); + p_src += 4 * i_src_stride; + LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += 4 * i_ref_stride; + + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, + diff0, diff1, diff2, diff3 ); + HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff1, diff2, diff3 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff1, diff2, diff3 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + + diff0 = __msa_add_a_h( diff0, zero ); + diff1 = __msa_add_a_h( diff1, zero ); + diff2 = __msa_add_a_h( diff2, zero ); + diff3 = __msa_add_a_h( diff3, zero ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 ); + diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 ); + u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 ); + } + + return ( u_sum >> 1 ); +} + +static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + uint8_t i_height ) +{ + int32_t cnt; + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3; + v16i8 ref0, ref1, ref2, ref3; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 temp0, temp1, temp2, temp3; + + for ( cnt = i_height >> 2; cnt--; ) + { + LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += 4 * i_src_stride; + LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += 4 * i_ref_stride; + + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, + diff0, diff1, diff2, diff3 ); + HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 ); + TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3, + diff0, diff2, diff4, diff6 ); + + diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 ); + diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 ); + diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 ); + diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 ); + + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, + diff7, diff0, diff1, diff2, diff3, diff4, diff5, + diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + + diff0 = __msa_add_a_h( diff0, zero ); + diff1 = __msa_add_a_h( diff1, zero ); + diff2 = __msa_add_a_h( diff2, zero ); + diff3 = __msa_add_a_h( diff3, zero ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + u_sum += HADD_UH_U32( diff0 ); + } + + return ( u_sum >> 1 ); +} + +static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride ) +{ + uint32_t u_sum = 0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; + v8i16 temp0, temp1, temp2, temp3; + + LD_SB8( p_src, 
i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); + LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 ); + ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1, + sub2, sub3 ); + ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5, + sub6, sub7 ); + HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 ); + HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 ); + TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, + sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); + BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 ); + BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + temp0 = diff0 + diff4; + temp1 = diff1 + diff5; + temp2 = diff2 + diff6; + temp3 = diff3 + diff7; + + temp0 = __msa_add_a_h( temp0, zero ); + temp1 = __msa_add_a_h( temp1, zero ); + temp2 = __msa_add_a_h( temp2, zero ); + temp3 = __msa_add_a_h( temp3, zero ); + + diff0 = temp0 + __msa_asub_s_h( diff0, diff4 ); + diff1 = temp1 + __msa_asub_s_h( diff1, diff5 ); + diff2 = temp2 + __msa_asub_s_h( diff2, diff6 ); + diff3 = temp3 + __msa_asub_s_h( diff3, diff7 ); + diff0 = ( diff0 + diff1 + diff2 + diff3 ); + + u_sum = HADD_UH_U32( diff0 ); + + return u_sum; +} + +static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride ) +{ + int16_t tmp0, tmp1, tmp2, tmp3; + uint32_t u_sum4 = 0, u_sum8 = 0, u_dc; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 zero = { 0 }; + v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7; + v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7; + v8i16 temp0, temp1, temp2, temp3; + + LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 ); + + ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1, + diff2, diff3 ); + ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5, + diff6, diff7 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, + temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, + diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, + temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, + diff4, diff5, diff7, diff6 ); + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7, + diff0, diff1, diff2, diff3, + diff4, diff5, diff6, diff7 ); + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + tmp0 = 
diff0[0]; + tmp1 = diff0[4]; + tmp2 = diff4[0]; + tmp3 = diff4[4]; + + sub0 = __msa_add_a_h( diff0, zero ); + sub1 = __msa_add_a_h( diff1, zero ); + sub2 = __msa_add_a_h( diff2, zero ); + sub3 = __msa_add_a_h( diff3, zero ); + sub4 = __msa_add_a_h( diff4, zero ); + sub5 = __msa_add_a_h( diff5, zero ); + sub6 = __msa_add_a_h( diff6, zero ); + sub7 = __msa_add_a_h( diff7, zero ); + + sub0 = ( sub0 + sub1 + sub2 + sub3 ); + sub1 = ( sub4 + sub5 + sub6 + sub7 ); + sub0 += sub1; + + u_sum4 += HADD_UH_U32( sub0 ); + + TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7, + sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 ); + + ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 ); + ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 ); + + diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 ); + diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 ); + diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 ); + diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 ); + + BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 ); + BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 ); + BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 ); + + sub0 = __msa_add_a_h( diff0, zero ); + sub1 = __msa_add_a_h( diff1, zero ); + sub2 = __msa_add_a_h( diff2, zero ); + sub3 = __msa_add_a_h( diff3, zero ); + sub4 = __msa_add_a_h( diff4, zero ); + sub5 = __msa_add_a_h( diff5, zero ); + sub6 = __msa_add_a_h( diff6, zero ); + sub7 = __msa_add_a_h( diff7, zero ); + + sub0 = ( sub0 + sub1 + sub2 + sub3 ); + sub1 = ( sub4 + sub5 + sub6 + sub7 ); + sub0 += sub1; + + u_sum8 += HADD_UH_U32( sub0 ); + + u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 ); + u_sum4 = u_sum4 - u_dc; + u_sum8 = u_sum8 - u_dc; + + return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4; +} + +int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t 
*p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ) +{ + uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 }; + + sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4, + ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 16, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 16, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t 
i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 4, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 8, ( uint32_t * ) p_sad_array ); +} + +void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ) +{ + sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2, + i_ref_stride, 4, ( uint32_t * ) p_sad_array ); +} + +int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 ); +} + +int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 ); +} + +int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ) +{ + return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 ); +} + +void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_4x4_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_4x4_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_4x4_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_16x16_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_16x16_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_16x16_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_intra_predict_v_8x8_msa( pix, p_edge ); + p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, 
FENC_STRIDE ); + + x264_intra_predict_h_8x8_msa( pix, p_edge ); + p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_8x8_msa( pix, p_edge ); + p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_dc_4blk_8x8_msa( p_dec ); + p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_8x8_msa( p_dec ); + p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_vert_8x8_msa( p_dec ); + p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, + const uint8_t *p_pix2, intptr_t i_stride2, + int32_t i_sums[2][4] ) +{ + ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums ); +} + +uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride ); + u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); +} + +int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); +} + +int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); +} + +int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 ); +} + +int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); +} + +int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); +} + +int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + 
uint32_t u32Sum = 0; + + u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 ); + u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2, 8 ); + + return u32Sum; +} + +int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + uint32_t u32Sum = 0; + + u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 ); + u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2, 16 ); + + return u32Sum; +} + +int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ); + + return ( i32Sum + 2 ) >> 2; +} + +int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ) +{ + int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8, i_stride, + p_pix2 + 8, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride, + p_pix2 + 8 * i_stride2, i_stride2 ) + + sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride, + p_pix2 + 8 + 8 * i_stride2, i_stride2 ); + + return ( i32Sum + 2 ) >> 2; +} + +void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_4x4_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_4x4_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_4x4_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_vert_16x16_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_16x16_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_16x16_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_intra_predict_v_8x8_msa( pix, p_edge ); + p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_h_8x8_msa( pix, p_edge ); + p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_dc_8x8_msa( pix, p_edge ); + p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_intra_predict_dc_4blk_8x8_msa( p_dec ); + p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_hor_8x8_msa( p_dec ); + p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_intra_predict_vert_8x8_msa( p_dec ); + p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return avc_pixel_var16width_msa( p_pix, i_stride, 16 ); +} + +uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return 
avc_pixel_var8width_msa( p_pix, i_stride, 16 ); +} + +uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ) +{ + return avc_pixel_var8width_msa( p_pix, i_stride, 8 ); +} + +int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ) +{ + int32_t i_var = 0, i_diff = 0, i_sqr = 0; + + i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16, + &i_diff ); + i_var = VARIANCE_WxH( i_sqr, i_diff, 7 ); + *p_ssd = i_sqr; + + return i_var; +} + +int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ) +{ + int32_t i_var = 0, i_diff = 0, i_sqr = 0; + + i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, + p_pix2, i_stride2, 8, &i_diff ); + i_var = VARIANCE_WxH( i_sqr, i_diff, 6 ); + *p_ssd = i_sqr; + + return i_var; +} +#endif
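The MSA kernels in the file above are vector versions of x264's plain-C pixel metrics: the SAD routines accumulate absolute differences over a block, the _x3d/_x4d variants score one source block against three or four reference candidates in a single pass, and the var routines pack two 32-bit partial results into one 64-bit return value. The following scalar sketch restates those two conventions for reference only; it is not part of the snapshot, and the function names (sad_ref, var_ref) are illustrative.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the SAD kernels: sum of absolute differences over a
 * width x height block; the _x3d/_x4d variants above simply run this against
 * three or four candidate references that share one source block. */
static uint32_t sad_ref( const uint8_t *p_src, intptr_t i_src_stride,
                         const uint8_t *p_ref, intptr_t i_ref_stride,
                         int i_width, int i_height )
{
    uint32_t u_sad = 0;
    for( int y = 0; y < i_height; y++ )
    {
        for( int x = 0; x < i_width; x++ )
            u_sad += abs( p_src[x] - p_ref[x] );
        p_src += i_src_stride;
        p_ref += i_ref_stride;
    }
    return u_sad;
}

/* Scalar reference for the var kernels: the pixel sum goes in the low 32 bits
 * and the sum of squares in the high 32 bits, matching the return value of
 * avc_pixel_var16width_msa / avc_pixel_var8width_msa above. */
static uint64_t var_ref( const uint8_t *p_pix, intptr_t i_stride,
                         int i_width, int i_height )
{
    uint32_t u_sum = 0, u_sqr = 0;
    for( int y = 0; y < i_height; y++, p_pix += i_stride )
        for( int x = 0; x < i_width; x++ )
        {
            u_sum += p_pix[x];
            u_sqr += p_pix[x] * p_pix[x];
        }
    return u_sum + ( ( uint64_t ) u_sqr << 32 );
}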
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel.h
Added
@@ -0,0 +1,170 @@ +/***************************************************************************** + * pixel.h: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_SAD_H +#define X264_MIPS_SAD_H + +int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t 
*p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, + const uint8_t *p_pix2, intptr_t i_stride2, + int32_t i_sums[2][4] ); +uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t 
x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); +int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); + +#endif
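The x3/x4 prototypes above take the source block with an implicit stride (the wrappers in pixel.c pass FENC_STRIDE) and several candidate reference pointers that share a single i_ref_stride, writing one SAD per candidate into the output array. A minimal usage sketch follows, assuming the declarations above are in scope; score_four_candidates and its buffer arguments are hypothetical names, not part of the snapshot.

#include <stdint.h>

/* Declaration as given in pixel.h above. */
void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] );

/* Hypothetical helper: score one encode-ordered 16x16 block against four
 * candidate positions in a single call. p_fenc is assumed to be laid out
 * with x264's fixed encode-block stride (FENC_STRIDE, 16 bytes per row in
 * the 8-bit build); all four candidates share i_ref_stride. */
static void score_four_candidates( uint8_t *p_fenc, uint8_t *p_cand[4],
                                   intptr_t i_ref_stride, int32_t p_sad[4] )
{
    /* Each p_sad[i] receives the 16x16 SAD of p_fenc against p_cand[i]. */
    x264_pixel_sad_x4_16x16_msa( p_fenc, p_cand[0], p_cand[1], p_cand[2],
                                 p_cand[3], i_ref_stride, p_sad );
}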
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict-c.c
Added
@@ -0,0 +1,607 @@ +/***************************************************************************** + * predict-c.c: msa intra prediction + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_src_data; + + u_src_data = LW( p_src ); + + SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint64_t u_out; + + u_out = LD( p_src ); + + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + v16u8 src0 = LD_UB( p_src ); + + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); +} + +static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint32_t u_out0, u_out1, u_out2, u_out3; + + u_out0 = p_src[0 * i_src_stride] * 0x01010101; + u_out1 = p_src[1 * i_src_stride] * 0x01010101; + u_out2 = p_src[2 * i_src_stride] * 0x01010101; + u_out3 = p_src[3 * i_src_stride] * 0x01010101; + + SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; + + u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull; + u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull; + u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull; + u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull; + u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull; + u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull; + u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull; + u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull; + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_16x16_msa( 
uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_row; + uint8_t u_inp0, u_inp1, u_inp2, u_inp3; + v16u8 src0, src1, src2, src3; + + for ( u_row = 4; u_row--; ) + { + u_inp0 = p_src[0]; + p_src += i_src_stride; + u_inp1 = p_src[0]; + p_src += i_src_stride; + u_inp2 = p_src[0]; + p_src += i_src_stride; + u_inp3 = p_src[0]; + p_src += i_src_stride; + + src0 = ( v16u8 ) __msa_fill_b( u_inp0 ); + src1 = ( v16u8 ) __msa_fill_b( u_inp1 ); + src2 = ( v16u8 ) __msa_fill_b( u_inp2 ); + src3 = ( v16u8 ) __msa_fill_b( u_inp3 ); + + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left, + int32_t i_src_stride_left, + uint8_t *p_dst, int32_t i_dst_stride, + uint8_t is_above, uint8_t is_left ) +{ + uint32_t u_row; + uint32_t u_out, u_addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum; + + if ( is_left && is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 4 ) >> 3; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_left ) + { + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 2 ) >> 2; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 ); + store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); + } + else + { + store = ( v16u8 ) __msa_ldi_b( 128 ); + } + + u_out = __msa_copy_u_w( ( v4i32 ) store, 0 ); + + SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_val0, u_val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + u_val0 = LD( p_src_top ); + u_val1 = LD( p_src_left ); + INSERT_D2_UB( u_val0, u_val1, src ); + sum_h = __msa_hadd_u_h( src, src ); + sum_w = __msa_hadd_u_w( sum_h, sum_h ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 ); + store = __msa_splati_b( ( v16i8 ) sum_w, 0 ); + u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 ); + + SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride ); +} + +static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left, + int32_t i_src_stride_left, + uint8_t *p_dst, int32_t i_dst_stride, + uint8_t is_above, uint8_t is_left ) +{ + uint32_t u_row; + uint32_t u_addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum_top; + v2u64 sum; + + if ( is_left && is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum_top = __msa_hadd_u_w( sum_above, sum_above ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); + 
sum = __msa_hadd_u_d( sum_top, sum_top ); + u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + + for ( u_row = 0; u_row < 16; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 16 ) >> 5; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_left ) + { + for ( u_row = 0; u_row < 16; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 8 ) >> 4; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum_top = __msa_hadd_u_w( sum_above, sum_above ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum ); + sum = __msa_hadd_u_d( sum_top, sum_top ); + sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 ); + store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); + } + else + { + store = ( v16u8 ) __msa_ldi_b( 128 ); + } + + ST_UB8( store, store, store, store, store, store, store, store, p_dst, + i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( store, store, store, store, store, store, store, store, p_dst, + i_dst_stride ); +} + +static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lpcnt; + int32_t i_res, i_res0, i_res1, i_res2, i_res3; + uint64_t u_out0, u_out1; + v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 }; + v4i32 int_multiplier = { 0, 1, 2, 3 }; + v16u8 p_src_top; + v8i16 vec9, vec10, vec11; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8; + v2i64 sum; + + p_src_top = LD_UB( p_src - ( i_stride + 1 ) ); + p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, + ( v16i8 ) p_src_top ); + + vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w( vec9, vec9 ); + sum = __msa_hadd_s_d( vec8, vec8 ); + + i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 ); + + i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) + + 2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) + + 3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) + + 4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] ); + + i_res0 *= 17; + i_res1 *= 17; + i_res0 = ( i_res0 + 16 ) >> 5; + i_res1 = ( i_res1 + 16 ) >> 5; + + i_res3 = 3 * ( i_res0 + i_res1 ); + i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 ); + i_res = i_res2 - i_res3; + + vec8 = __msa_fill_w( i_res0 ); + vec4 = __msa_fill_w( i_res ); + vec2 = __msa_fill_w( i_res1 ); + vec5 = vec8 * int_multiplier; + vec3 = vec8 * 4; + + for ( u_lpcnt = 4; u_lpcnt--; ) + { + vec0 = vec5; + vec0 += vec4; + vec1 = vec0 + vec3; + vec6 = vec5; + vec4 += vec2; + vec6 += vec4; + vec7 = vec6 + vec3; + + SRA_4V( vec0, vec1, vec6, vec7, 5 ); + PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 ); + CLIP_SH2_0_255( vec10, vec11 ); + PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 ); + + u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 ); + u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 ); + SD( u_out0, p_src ); + p_src += i_stride; + SD( u_out1, p_src ); + p_src += i_stride; + + vec4 += vec2; + } +} + +static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lpcnt; + int32_t i_res0, i_res1, i_res2, i_res3; + uint64_t u_load0, u_load1; + v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 }; + v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 }; + v4i32 
int_multiplier = { 0, 1, 2, 3 }; + v16u8 p_src_top = { 0 }; + v8i16 vec9, vec10; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add; + + u_load0 = LD( p_src - ( i_stride + 1 ) ); + u_load1 = LD( p_src - ( i_stride + 1 ) + 9 ); + + INSERT_D2_UB( u_load0, u_load1, p_src_top ); + + p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top, + ( v16i8 ) p_src_top ); + + vec9 = __msa_hsub_u_h( p_src_top, p_src_top ); + vec9 *= short_multiplier; + vec8 = __msa_hadd_s_w( vec9, vec9 ); + res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 ); + + i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 ); + + i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) + + 2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) + + 3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) + + 4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) + + 5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) + + 6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) + + 7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) + + 8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] ); + + i_res0 *= 5; + i_res1 *= 5; + i_res0 = ( i_res0 + 32 ) >> 6; + i_res1 = ( i_res1 + 32 ) >> 6; + + i_res3 = 7 * ( i_res0 + i_res1 ); + i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 ); + i_res2 -= i_res3; + + vec8 = __msa_fill_w( i_res0 ); + vec4 = __msa_fill_w( i_res2 ); + vec5 = __msa_fill_w( i_res1 ); + vec6 = vec8 * 4; + vec7 = vec8 * int_multiplier; + + for ( u_lpcnt = 16; u_lpcnt--; ) + { + vec0 = vec7; + vec0 += vec4; + vec1 = vec0 + vec6; + vec2 = vec1 + vec6; + vec3 = vec2 + vec6; + + SRA_4V( vec0, vec1, vec2, vec3, 5 ); + PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 ); + CLIP_SH2_0_255( vec9, vec10 ); + PCKEV_ST_SB( vec9, vec10, p_src ); + p_src += i_stride; + + vec4 += vec5; + } +} + +static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride ) +{ + uint8_t u_lp_cnt; + uint32_t u_src0, u_src1, u_src3, u_src2 = 0; + uint32_t u_out0, u_out1, u_out2, u_out3; + v16u8 p_src_top; + v8u16 add; + v4u32 sum; + + p_src_top = LD_UB( p_src - i_stride ); + add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top ); + sum = __msa_hadd_u_w( add, add ); + u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 ); + + for ( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ ) + { + u_src0 += p_src[u_lp_cnt * i_stride - 1]; + u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1]; + } + + u_src0 = ( u_src0 + 4 ) >> 3; + u_src3 = ( u_src1 + u_src2 + 4 ) >> 3; + u_src1 = ( u_src1 + 2 ) >> 2; + u_src2 = ( u_src2 + 2 ) >> 2; + + u_out0 = u_src0 * 0x01010101; + u_out1 = u_src1 * 0x01010101; + u_out2 = u_src2 * 0x01010101; + u_out3 = u_src3 * 0x01010101; + + for ( u_lp_cnt = 4; u_lp_cnt--; ) + { + SW( u_out0, p_src ); + SW( u_out1, ( p_src + 4 ) ); + SW( u_out2, ( p_src + 4 * i_stride ) ); + SW( u_out3, ( p_src + 4 * i_stride + 4 ) ); + p_src += i_stride; + } +} + +static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint8_t u_src_val = p_src[15]; + uint64_t u_out0, u_out1, u_out2, u_out3; + v16u8 src, vec4, vec5, res0; + v8u16 vec0, vec1, vec2, vec3; + v2i64 res1, res2, res3; + + src = LD_UB( p_src ); + + vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 ); + vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 ); + vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val ); + ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 ); + ILVL_B2_UH( vec5, src, vec4, 
vec4, vec2, vec3 ); + HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 ); + + vec0 += vec1; + vec2 += vec3; + vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 ); + vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 ); + + res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 ); + res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); + res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); + res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); + u_out1 = __msa_copy_u_d( res1, 0 ); + u_out2 = __msa_copy_u_d( res2, 0 ); + u_out3 = __msa_copy_u_d( res3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + + res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 ); + res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 ); + res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 ); + res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 ); + + u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 ); + u_out1 = __msa_copy_u_d( res1, 0 ); + u_out2 = __msa_copy_u_d( res2, 0 ); + u_out3 = __msa_copy_u_d( res3, 0 ); + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); +} + +static void intra_predict_128dc_16x16_msa( uint8_t *p_dst, + int32_t i_dst_stride ) +{ + v16u8 out = ( v16u8 ) __msa_ldi_b( 128 ); + + ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride ); +} + +void x264_intra_predict_dc_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); +} + +void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 ); +} + +void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src ) +{ + intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 ); +} + +void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src ) +{ + intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_hor_16x16_msa( uint8_t *p_src ) +{ + intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_16x16_msa( uint8_t *p_src ) +{ + intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_plane_16x16_msa( uint8_t *p_src ) +{ + intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src ) +{ + intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_hor_8x8_msa( uint8_t *p_src ) +{ + intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_8x8_msa( uint8_t *p_src ) +{ + intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_plane_8x8_msa( uint8_t *p_src ) +{ + intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ), + p_src, FDEC_STRIDE ); +} + +void 
x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ) +{ + intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_dc_4x4_msa( uint8_t *p_src ) +{ + intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ), + FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 ); +} + +void x264_intra_predict_hor_4x4_msa( uint8_t *p_src ) +{ + intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE, + p_src, FDEC_STRIDE ); +} + +void x264_intra_predict_vert_4x4_msa( uint8_t *p_src ) +{ + intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE ); +} +#endif
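The plane predictor added above implements the standard H.264 16x16 plane mode: it derives horizontal and vertical gradients from the top row and left column, scales them by 5 with rounding (>> 6), and fills the block with a clipped linear ramp. The scalar sketch below is added only for illustration and is not part of the patch; the local clip helper and the p_src/i_stride layout (top row at p_src[-i_stride], left column at p_src[-1]) follow the conventions visible in intra_predict_plane_16x16_msa above.

#include <stdint.h>

static inline uint8_t clip_u8( int32_t i_val )
{
    return i_val < 0 ? 0 : ( i_val > 255 ? 255 : (uint8_t)i_val );
}

/* Scalar reference for the arithmetic vectorized by intra_predict_plane_16x16_msa() */
static void intra_predict_plane_16x16_ref( uint8_t *p_src, int32_t i_stride )
{
    int32_t i_h = 0, i_v = 0;

    for( int32_t i = 1; i <= 8; i++ )
    {
        /* gradients of the top row and of the left column */
        i_h += i * ( p_src[-i_stride + 7 + i] - p_src[-i_stride + 7 - i] );
        i_v += i * ( p_src[( 7 + i ) * i_stride - 1] - p_src[( 7 - i ) * i_stride - 1] );
    }

    int32_t i_b = ( 5 * i_h + 32 ) >> 6;
    int32_t i_c = ( 5 * i_v + 32 ) >> 6;
    /* same constant term as i_res2 above: 16*(left_bottom + top_right + 1) - 7*(b + c) */
    int32_t i_a = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 )
                - 7 * ( i_b + i_c );

    for( int32_t y = 0; y < 16; y++ )
        for( int32_t x = 0; x < 16; x++ )
            p_src[y * i_stride + x] = clip_u8( ( i_a + i_b * x + i_c * y ) >> 5 );
}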
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict.h
Added
@@ -0,0 +1,48 @@
+/*****************************************************************************
+ * predict.h: msa intra prediction
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Rishikesh More <rishikesh.more@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_PREDICT_H
+#define X264_MIPS_PREDICT_H
+
+void x264_intra_predict_dc_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_hor_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_vert_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_plane_16x16_msa( uint8_t *p_src );
+void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_hor_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_vert_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_plane_8x8_msa( uint8_t *p_src );
+void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
+void x264_intra_predict_dc_4x4_msa( uint8_t *p_src );
+void x264_intra_predict_hor_4x4_msa( uint8_t *p_src );
+void x264_intra_predict_vert_4x4_msa( uint8_t *p_src );
+
+#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant-c.c
Added
@@ -0,0 +1,630 @@ +/***************************************************************************** + * quant-c.c: msa quantization and level-run + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 4; + v8i16 dct0, dct1; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + + if ( q_bits >= 0 ) + { + v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_mf_h0, dequant_mf_h1 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct0 <<= q_bits_vec; + dct1 <<= q_bits_vec; + ST_SH2( dct0, dct1, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct0, dct1 ); + ST_SH2( dct0, dct1, p_dct, 8 ); + } +} + +static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 6; + v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7; + v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11; + v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15; + + LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 ); + + 
LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 ); + LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 ); + LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 ); + LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 ); + LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 ); + LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 ); + + if ( q_bits >= 0 ) + { + v8i16 q_bits_vec; + v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3; + v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6, + dequant_mf_h0, dequant_mf_h1, + dequant_mf_h2, dequant_mf_h3 ); + PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10, + dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14, + dequant_mf_h4, dequant_mf_h5, + dequant_mf_h6, dequant_mf_h7 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct2 *= dequant_mf_h2; + dct3 *= dequant_mf_h3; + dct4 *= dequant_mf_h4; + dct5 *= dequant_mf_h5; + dct6 *= dequant_mf_h6; + dct7 *= dequant_mf_h7; + + SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec ); + SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec ); + + ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; + v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11; + v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 ); + UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 ); + UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 ); + UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w4 *= dequant_m_f4; + dct_signed_w5 *= dequant_m_f5; + dct_signed_w6 *= dequant_m_f6; + dct_signed_w7 *= dequant_m_f7; + dct_signed_w8 *= dequant_m_f8; + dct_signed_w9 *= dequant_m_f9; + dct_signed_w10 *= dequant_m_f10; + dct_signed_w11 *= dequant_m_f11; + dct_signed_w12 *= dequant_m_f12; + dct_signed_w13 *= dequant_m_f13; + dct_signed_w14 *= dequant_m_f14; + dct_signed_w15 *= dequant_m_f15; + + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + dct_signed_w4 += q_bits_vec_add; + dct_signed_w5 += q_bits_vec_add; + dct_signed_w6 += q_bits_vec_add; + dct_signed_w7 += q_bits_vec_add; + dct_signed_w8 += q_bits_vec_add; + dct_signed_w9 += q_bits_vec_add; + dct_signed_w10 += q_bits_vec_add; + dct_signed_w11 += q_bits_vec_add; + dct_signed_w12 += q_bits_vec_add; + dct_signed_w13 += q_bits_vec_add; + dct_signed_w14 += q_bits_vec_add; + dct_signed_w15 += 
q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7, + q_bits_vec ); + SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11, + q_bits_vec ); + SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15, + q_bits_vec ); + PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6, + dct0, dct1, dct2, dct3 ); + PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11, + dct_signed_w10, dct_signed_w13, dct_signed_w12, + dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 ); + ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); + } +} + +static void avc_dequant_4x4_dc_msa( int16_t *p_dct, + int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + const int32_t q_bits = i_qp / 6 - 6; + int32_t i_dmf = pi_dequant_mf[i_qp % 6][0]; + v8i16 dct0, dct1, dequant_mf_h; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + if ( q_bits >= 0 ) + { + i_dmf <<= q_bits; + + dequant_mf_h = __msa_fill_h( i_dmf ); + dct0 = dct0 * dequant_mf_h; + dct1 = dct1 * dequant_mf_h; + + ST_SH2( dct0, dct1, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + dequant_m_f = __msa_fill_w( i_dmf ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + dct_signed_w0 *= dequant_m_f; + dct_signed_w1 *= dequant_m_f; + dct_signed_w2 *= dequant_m_f; + dct_signed_w3 *= dequant_m_f; + + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct0, dct1 ); + ST_SH2( dct0, dct1, p_dct, 8 ); + } +} + +static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, + uint16_t *p_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1; + v8i16 zero = { 0 }; + v8i16 dct0_mask, dct1_mask; + v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3; + v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; + v4i32 bias0, bias1, bias2, bias3; + + LD_SH2( p_dct, 8, dct0, dct1 ); + LD_SH2( p_bias, 8, bias_h0, bias_h1 ); + LD_SH2( p_mf, 8, mf_h0, mf_h1 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 ); + ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 ); + ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 ); + ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 ); + + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + + dct0 = zero - dct_h0; + dct1 
= zero - dct_h1; + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, + ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, + ( v16u8 ) dct1_mask ); + non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); + ST_SH2( dct0, dct1, p_dct, 8 ); + + return !!non_zero; +} + +static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, + uint16_t *p_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1, dct2, dct3; + v8i16 zero = { 0 }; + v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; + v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; + v8i16 bias_h0, bias_h1, bias_h2, bias_h3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; + v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; + v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; + v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; + + LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + dct2_mask = __msa_clei_s_h( dct2, 0 ); + dct3_mask = __msa_clei_s_h( dct3, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); + ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias0, bias2, bias4, bias6 ); + ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias1, bias3, bias5, bias7 ); + LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); + ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); + ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); + dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); + dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); + dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + dct_w4 *= mf_vec4; + dct_w5 *= mf_vec5; + dct_w6 *= mf_vec6; + dct_w7 *= mf_vec7; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); + PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, + dct_h0, dct_h1, dct_h2, dct_h3 ); + SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, + dct0, dct1, dct2, dct3 ); + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, + ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); + dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, + ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); + + non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); + ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); + LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + dct2_mask = __msa_clei_s_h( dct2, 0 ); + dct3_mask = __msa_clei_s_h( 
dct3, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); + ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias0, bias2, bias4, bias6 ); + ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, + bias1, bias3, bias5, bias7 ); + LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); + ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); + ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, + mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); + dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); + dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); + dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); + dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); + + dct_w0 *= mf_vec0; + dct_w1 *= mf_vec1; + dct_w2 *= mf_vec2; + dct_w3 *= mf_vec3; + dct_w4 *= mf_vec4; + dct_w5 *= mf_vec5; + dct_w6 *= mf_vec6; + dct_w7 *= mf_vec7; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); + SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, + dct0, dct1, dct2, dct3 ); + + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, + ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); + dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, + ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); + + non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); + ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); + + return !!non_zero; +} + +static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, + int32_t i_bias ) +{ + int32_t non_zero = 0; + v8i16 dct0, dct1, dct0_mask, dct1_mask; + v8i16 zero = { 0 }; + v8i16 dct_h0, dct_h1; + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_w0, dct_w1, dct_w2, dct_w3; + v4i32 mf_vec, bias_vec; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + dct0_mask = __msa_clei_s_h( dct0, 0 ); + dct1_mask = __msa_clei_s_h( dct1, 0 ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + bias_vec = __msa_fill_w( i_bias ); + mf_vec = __msa_fill_w( i_mf ); + + dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec ); + dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec ); + dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec ); + dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec ); + + dct_w0 *= mf_vec; + dct_w1 *= mf_vec; + dct_w2 *= mf_vec; + dct_w3 *= mf_vec; + + SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); + PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); + + dct0 = zero - dct_h0; + dct1 = zero - dct_h1; + dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, + ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); + dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, + ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); + non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) ); + + ST_SH2( dct0, dct1, 
p_dct, 8 ); + + return !!non_zero; +} + +static int32_t avc_coeff_last64_msa( int16_t *p_src ) +{ + uint32_t u_res; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7; + v16u8 tmp0, tmp1, tmp2, tmp3; + v8u16 vec0, vec1, vec2, vec3; + v4i32 out0; + v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); + + tmp_h0 = __msa_ceqi_h( src0, 0 ); + tmp_h1 = __msa_ceqi_h( src1, 0 ); + tmp_h2 = __msa_ceqi_h( src2, 0 ); + tmp_h3 = __msa_ceqi_h( src3, 0 ); + tmp_h4 = __msa_ceqi_h( src4, 0 ); + tmp_h5 = __msa_ceqi_h( src5, 0 ); + tmp_h6 = __msa_ceqi_h( src6, 0 ); + tmp_h7 = __msa_ceqi_h( src7, 0 ); + + PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6, + tmp0, tmp1, tmp2, tmp3 ); + + tmp0 = tmp0 & mask; + tmp1 = tmp1 & mask; + tmp2 = tmp2 & mask; + tmp3 = tmp3 & mask; + + HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 ); + PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 ); + HADD_UB2_UH( tmp0, tmp1, vec0, vec1 ); + + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 ); + vec0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 ); + out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 ); + u_res = __msa_copy_u_w( out0, 0 ); + + return ( 63 - u_res ); +} + +static int32_t avc_coeff_last16_msa( int16_t *p_src ) +{ + uint32_t u_res; + v8i16 src0, src1; + v8u16 tmp_h0; + v16u8 tmp0; + v8i16 out0, out1; + v16i8 res0; + v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 }; + + LD_SH2( p_src, 8, src0, src1 ); + + out0 = __msa_ceqi_h( src0, 0 ); + out1 = __msa_ceqi_h( src1, 0 ); + + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 ); + tmp0 = tmp0 & mask; + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 ); + res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 ); + out0 = __msa_nloc_h( ( v8i16 ) res0 ); + u_res = __msa_copy_u_h( out0, 0 ); + + return ( 15 - u_res ); +} + +void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp ); +} + +void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ) +{ + avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp ); +} + +void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp ); +} + +int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) +{ + return avc_quant_4x4_msa( p_dct, p_mf, p_bias ); +} + +int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16], + uint16_t pu_mf[16], uint16_t pu_bias[16] ) +{ + int32_t i_non_zero, i_non_zero_acc = 0; + + for( int32_t j = 0; j < 4; j++ ) + { + i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias ); + + i_non_zero_acc |= ( !!i_non_zero ) << j; + } + + return i_non_zero_acc; +} + +int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) +{ + return avc_quant_8x8_msa( p_dct, p_mf, p_bias ); +} + +int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ) +{ + return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias ); +} + +int32_t x264_coeff_last64_msa( int16_t *p_src ) +{ + 
return avc_coeff_last64_msa( p_src ); +} + +int32_t x264_coeff_last16_msa( int16_t *p_src ) +{ + return avc_coeff_last16_msa( p_src ); +} +#endif
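For reference, the scalar arithmetic behind the two central routines above: quantization adds the bias to |coef|, multiplies by the scaling factor, shifts right by 16 and restores the sign; dequantization multiplies by the per-(QP%6) table entry and shifts by QP/6 - 4, rounding when that shift goes to the right. The plain-C sketch below is for illustration only; the function names are hypothetical and the authoritative C fallbacks live in common/quant.c.

#include <stdint.h>
#include <stdlib.h>

/* Mirrors avc_quant_4x4_msa(): returns 1 if any quantized coefficient is non-zero. */
static int32_t quant_4x4_ref( int16_t p_dct[16], const uint16_t p_mf[16],
                              const uint16_t p_bias[16] )
{
    int32_t i_nz = 0;

    for( int32_t i = 0; i < 16; i++ )
    {
        int32_t i_level = ( abs( p_dct[i] ) + p_bias[i] ) * p_mf[i] >> 16;
        p_dct[i] = p_dct[i] > 0 ? i_level : -i_level;
        i_nz |= i_level;
    }
    return !!i_nz;
}

/* Mirrors avc_dequant_4x4_msa(). */
static void dequant_4x4_ref( int16_t p_dct[16], int32_t pi_dequant_mf[6][16], int32_t i_qp )
{
    const int32_t i_mf   = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;

    if( q_bits >= 0 )
        for( int32_t i = 0; i < 16; i++ )
            p_dct[i] = (int16_t)( ( p_dct[i] * pi_dequant_mf[i_mf][i] ) << q_bits );
    else
    {
        const int32_t i_round = 1 << ( -q_bits - 1 );
        for( int32_t i = 0; i < 16; i++ )
            p_dct[i] = (int16_t)( ( p_dct[i] * pi_dequant_mf[i_mf][i] + i_round ) >> -q_bits );
    }
}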
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant.h
Added
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * quant.h: msa quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Rishikesh More <rishikesh.more@imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_QUANT_H
+#define X264_MIPS_QUANT_H
+
+void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
+                           int32_t i_qp );
+void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
+                           int32_t i_qp );
+void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
+                              int32_t i_qp );
+int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
+                              uint16_t pu_mf[16], uint16_t pu_bias[16] );
+int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias );
+int32_t x264_coeff_last64_msa( int16_t *p_src );
+int32_t x264_coeff_last16_msa( int16_t *p_src );
+
+#endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20150804-2245.tar.bz2/common/mvpred.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mvpred.c: motion vector prediction
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.c -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * opencl.c: OpenCL initialization and kernel compilation
  *****************************************************************************
- * Copyright (C) 2012-2014 x264 project
+ * Copyright (C) 2012-2015 x264 project
  *
  * Authors: Steve Borho <sborho@multicorewareinc.com>
  *          Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.h -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * opencl.h: OpenCL structures and defines
  *****************************************************************************
- * Copyright (C) 2012-2014 x264 project
+ * Copyright (C) 2012-2015 x264 project
  *
  * Authors: Steve Borho <sborho@multicorewareinc.com>
  *          Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.c -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * osdep.c: platform-specific code ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -94,51 +94,6 @@ } #endif -#if HAVE_MMX -#ifdef __INTEL_COMPILER -/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of - * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30) - * adapted to x264's cpu schema. */ - -// Global variable indicating cpu -int __intel_cpu_indicator = 0; -// CPU dispatcher function -void x264_intel_cpu_indicator_init( void ) -{ - unsigned int cpu = x264_cpu_detect(); - if( cpu&X264_CPU_AVX ) - __intel_cpu_indicator = 0x20000; - else if( cpu&X264_CPU_SSE42 ) - __intel_cpu_indicator = 0x8000; - else if( cpu&X264_CPU_SSE4 ) - __intel_cpu_indicator = 0x2000; - else if( cpu&X264_CPU_SSSE3 ) - __intel_cpu_indicator = 0x1000; - else if( cpu&X264_CPU_SSE3 ) - __intel_cpu_indicator = 0x800; - else if( cpu&X264_CPU_SSE2 && !(cpu&X264_CPU_SSE2_IS_SLOW) ) - __intel_cpu_indicator = 0x200; - else if( cpu&X264_CPU_SSE ) - __intel_cpu_indicator = 0x80; - else if( cpu&X264_CPU_MMX2 ) - __intel_cpu_indicator = 8; - else - __intel_cpu_indicator = 1; -} - -/* __intel_cpu_indicator_init appears to have a non-standard calling convention that - * assumes certain registers aren't preserved, so we'll route it through a function - * that backs up all the registers. */ -void __intel_cpu_indicator_init( void ) -{ - x264_safe_intel_cpu_indicator_init(); -} -#else -void x264_intel_cpu_indicator_init( void ) -{} -#endif -#endif - #ifdef _WIN32 /* Functions for dealing with Unicode on Windows. */ FILE *x264_fopen( const char *filename, const char *mode )
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.h -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * osdep.h: platform-specific code
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -42,6 +42,9 @@ # include "aarch64/pixel.h" # include "aarch64/predict.h" #endif +#if ARCH_MIPS +# include "mips/pixel.h" +#endif /**************************************************************************** @@ -598,8 +601,8 @@ INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) -INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) -INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif @@ -1409,25 +1412,28 @@ #if ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { - INIT7( sad, _neon ); + INIT8( sad, _neon ); // AArch64 has no distinct instructions for aligned load/store - INIT7_NAME( sad_aligned, sad, _neon ); + INIT8_NAME( sad_aligned, sad, _neon ); INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); - INIT7( ssd, _neon ); - INIT7( satd, _neon ); + INIT8( ssd, _neon ); + INIT8( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); INIT4( hadamard_ac, _neon ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->vsad = x264_pixel_vsad_neon; + pixf->asd8 = x264_pixel_asd8_neon; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; @@ -1440,11 +1446,44 @@ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } #endif // ARCH_AARCH64 +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + INIT8( sad, _msa ); + INIT8_NAME( sad_aligned, sad, _msa ); + INIT8( ssd, _msa ); + INIT7( sad_x3, _msa ); + INIT7( sad_x4, _msa ); + INIT8( satd, _msa ); + INIT4( hadamard_ac, _msa ); + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_msa; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_msa; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_msa; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_msa; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_msa; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_msa; + + pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa; + + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; + pixf->sa8d[PIXEL_8x8] = 
x264_pixel_sa8d_8x8; + } +#endif // HAVE_MSA + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC )
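The MSA entries registered above are reached only through x264's runtime dispatch; the flag test is driven by CPU detection at init time. A caller-side sketch (hypothetical wrapper name, x264 internal headers assumed):

#include "common/common.h"

/* Sketch: when x264_cpu_detect() reports X264_CPU_MSA, x264_pixel_init()
 * installs the *_msa pointers shown above; otherwise the C fallbacks remain. */
static void init_pixel_functions( x264_pixel_function_t *pixf )
{
    uint32_t cpu = x264_cpu_detect();
    x264_pixel_init( cpu, pixf );
}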
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.c: pixel metrics
  *****************************************************************************
- * Copyright (C) 2004-2014 x264 project
+ * Copyright (C) 2004-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * Eric Petit <eric.petit@lapsus.org> @@ -264,7 +264,7 @@ vec_u8_t lv = vec_ld(0, dest); \ vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \ /* unaligned store */ \ @@ -384,7 +384,7 @@ vec_u8_t lv = vec_ld( 7, dest ); \ vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ /* unaligned store */ \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * dct.h: ppc transform and zigzag
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/deblock.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * deblock.c: ppc deblocking
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * Guillaume Poirier <gpoirier@mplayerhq.hu> @@ -40,24 +40,19 @@ typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst, int i_height ); - -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - - static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) { return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next]; } + static inline int x264_tapfilter1( uint8_t *pix ) { return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; } - static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) @@ -181,10 +176,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { @@ -229,10 +224,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { case 4: @@ -296,6 +291,12 @@ } } +#ifdef WORDS_BIGENDIAN +#define VSLD(a,b,n) vec_sld(a,b,n) +#else +#define VSLD(a,b,n) vec_sld(b,a,16-n) +#endif + static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) @@ -321,8 +322,13 @@ vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16; vec_u16_t shiftv, k32v; +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13); static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12); + static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14); +#endif coeff0v = vec_ld( 0, coeff ); coeff3v = vec_splat( coeff0v, 3 ); @@ -334,7 +340,7 @@ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); for( int y = 0; y < i_height; y += 2 ) { @@ -342,7 +348,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ 
-364,7 +370,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ -420,12 +426,17 @@ k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); shiftv = vec_splat_u16( 6 ); +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0); static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1); + static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1); +#endif VEC_LOAD( src, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); for( int y = 0; y < i_height; y += 2 ) { @@ -434,7 +445,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -472,7 +483,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -555,11 +566,11 @@ VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \ VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \ \ - src2v = vec_sld( src1v, src6v, 1 ); \ - src3v = vec_sld( src1v, src6v, 2 ); \ - src4v = vec_sld( src1v, src6v, 3 ); \ - src5v = vec_sld( src1v, src6v, 4 ); \ - src6v = vec_sld( src1v, src6v, 5 ); \ + src2v = VSLD( src1v, src6v, 1 ); \ + src3v = VSLD( src1v, src6v, 2 ); \ + src4v = VSLD( src1v, src6v, 3 ); \ + src5v = VSLD( src1v, src6v, 4 ); \ + src6v = VSLD( src1v, src6v, 5 ); \ \ temp1v = vec_u8_to_s16_h( src1v ); \ temp2v = vec_u8_to_s16_h( src2v ); \ @@ -634,12 +645,12 @@ #define HPEL_FILTER_CENTRAL() \ { \ - temp1v = vec_sld( tempav, tempbv, 12 ); \ - temp2v = vec_sld( tempav, tempbv, 14 ); \ + temp1v = VSLD( tempav, tempbv, 12 ); \ + temp2v = VSLD( tempav, tempbv, 14 ); \ temp3v = tempbv; \ - temp4v = vec_sld( tempbv, tempcv, 2 ); \ - temp5v = vec_sld( tempbv, tempcv, 4 ); \ - temp6v = vec_sld( tempbv, tempcv, 6 ); \ + temp4v = VSLD( tempbv, tempcv, 2 ); \ + temp5v = VSLD( tempbv, tempcv, 4 ); \ + temp6v = VSLD( tempbv, tempcv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ @@ -647,12 +658,12 @@ dest1v = vec_add( temp1v, thirtytwov ); \ dest1v = vec_sra( dest1v, sixv ); \ \ - temp1v = vec_sld( tempbv, tempcv, 12 ); \ - temp2v = vec_sld( tempbv, tempcv, 14 ); \ + temp1v = VSLD( tempbv, tempcv, 12 ); \ + temp2v = VSLD( tempbv, tempcv, 14 ); \ temp3v = tempcv; \ - temp4v = vec_sld( tempcv, tempdv, 2 ); \ - temp5v = vec_sld( tempcv, tempdv, 4 ); \ - temp6v = vec_sld( tempcv, tempdv, 6 ); \ + temp4v = VSLD( tempcv, tempdv, 2 ); \ + temp5v = VSLD( tempcv, tempdv, 4 ); \ + temp6v = VSLD( tempcv, tempdv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ @@ -769,6 +780,9 @@ vec_u8_t lv, hv, src1p1v; vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv; static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 
0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E ); +#ifndef WORDS_BIGENDIAN + static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F ); +#endif for( int y = 0; y < height; y++ ) { @@ -793,11 +807,15 @@ src1p1v = vec_ld(16*(x*2+2), src1); avghp1v = vec_avg(lv, src1p1v); - avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v); - avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv); + avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); + avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0); +#ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth); +#else + vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth); +#endif avg0v = avghp1v; @@ -807,11 +825,15 @@ hv = vec_ld(16*(x*2+2), src2); avghp1v = vec_avg(src1p1v, hv); - avgleftv = vec_avg(vec_sld(avg1v, avghv, 1), avg1v); - avgrightv = vec_avg(vec_sld(avghv, avghp1v, 1), avghv); + avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v); + avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv); vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv); +#ifdef WORDS_BIGENDIAN vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc); +#else + vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc); +#endif avg1v = avghp1v; @@ -825,11 +847,15 @@ lv = vec_ld(16*(x*2+1), src2); avghp1v = vec_avg(src1v, lv); - avgleftv = vec_avg(vec_sld(avg0v, avghv, 1), avg0v); - avgrightv = vec_avg(vec_sld(avg1v, avghp1v, 1), avg1v); + avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v); + avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v); lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle); +#ifdef WORDS_BIGENDIAN hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv); +#else + hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1); +#endif vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0); vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
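The VSLD wrapper introduced above is needed because vec_sld() indexes bytes in big-endian element order; on little-endian POWER the same logical window is obtained by swapping the operands and using 16-n, exactly as the macro does. Conceptually the filter code always wants the byte-window operation modelled below in plain C, shown for illustration only (not the intrinsic itself):

#include <stdint.h>
#include <string.h>

/* Model of the intended VSLD(a,b,n) behaviour: a 16-byte window taken n bytes
 * into the concatenation a||b, in the element order the hpel/chroma filters assume. */
static void vsld_ref( const uint8_t a[16], const uint8_t b[16], int n, uint8_t out[16] )
{
    uint8_t cat[32];

    memcpy( cat, a, 16 );
    memcpy( cat + 16, b, 16 );
    memcpy( out, cat + n, 16 );   /* n must be in [0,16] */
}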
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mc.h: ppc motion compensation
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.c: ppc pixel metrics
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * pixel.h: ppc pixel metrics
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Eric Petit <eric.petit@lapsus.org>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/ppccommon.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/ppccommon.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ppccommon.h: ppc utility macros ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * @@ -81,10 +81,17 @@ /*********************************************************************** * 8 <-> 16 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) +#else +#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#endif #define vec_u8_to_u16(v) vec_u8_to_u16_h(v) #define vec_u8_to_s16(v) vec_u8_to_s16_h(v) @@ -96,10 +103,17 @@ /*********************************************************************** * 16 <-> 32 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) +#else +#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#endif #define vec_u16_to_u32(v) vec_u16_to_u32_h(v) #define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.c: ppc intra prediction
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.h: ppc intra prediction
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * @@ -251,6 +251,14 @@ vec_st(dctv, 8*y, dct); \ } +#ifdef WORDS_BIGENDIAN +#define VEC_MULE vec_mule +#define VEC_MULO vec_mulo +#else +#define VEC_MULE vec_mulo +#define VEC_MULO vec_mule +#endif + #define DEQUANT_SHR() \ { \ dctv = vec_ld(8*y, dct); \ @@ -259,14 +267,14 @@ mf1v = vec_ld(16*y, dequant_mf[i_mf]); \ mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \ \ - multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \ - multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \ + multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v); \ + multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v); \ temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp1v = vec_add(temp1v, fv); \ temp1v = vec_sra(temp1v, i_qbitsv); \ \ - multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \ - multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \ + multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v); \ + multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v); \ temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp2v = vec_add(temp2v, fv); \ temp2v = vec_sra(temp2v, i_qbitsv); \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * quant.c: ppc quantization
  *****************************************************************************
- * Copyright (C) 2007-2014 x264 project
+ * Copyright (C) 2007-2015 x264 project
  *
  * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/predict.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * predict.c: intra prediction
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -43,6 +43,9 @@
 #if ARCH_AARCH64
 # include "aarch64/predict.h"
 #endif
+#if ARCH_MIPS
+# include "mips/predict.h"
+#endif
 
 /****************************************************************************
  * 16x16 prediction for intra luma block
@@ -906,6 +909,21 @@
 #if ARCH_AARCH64
     x264_predict_16x16_init_aarch64( cpu, pf );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_16x16_V ]     = x264_intra_predict_vert_16x16_msa;
+        pf[I_PRED_16x16_H ]     = x264_intra_predict_hor_16x16_msa;
+        pf[I_PRED_16x16_DC]     = x264_intra_predict_dc_16x16_msa;
+        pf[I_PRED_16x16_P ]     = x264_intra_predict_plane_16x16_msa;
+        pf[I_PRED_16x16_DC_LEFT]= x264_intra_predict_dc_left_16x16_msa;
+        pf[I_PRED_16x16_DC_TOP ]= x264_intra_predict_dc_top_16x16_msa;
+        pf[I_PRED_16x16_DC_128 ]= x264_intra_predict_dc_128_16x16_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -934,6 +952,15 @@
 #if ARCH_AARCH64
     x264_predict_8x8c_init_aarch64( cpu, pf );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_CHROMA_P ] = x264_intra_predict_plane_8x8_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -949,6 +976,10 @@
 #if HAVE_MMX
     x264_predict_8x16c_init_mmx( cpu, pf );
 #endif
+
+#if ARCH_AARCH64
+    x264_predict_8x16c_init_aarch64( cpu, pf );
+#endif
 }
 
 void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -978,6 +1009,15 @@
 #if ARCH_AARCH64
     x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
 #endif
+
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf[I_PRED_8x8_DDL] = x264_intra_predict_ddl_8x8_msa;
+    }
+#endif
+#endif
 }
 
 void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/quant.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * quant.c: quantization and level-run
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Fiona Glaser <fiona@x264.com>
@@ -40,6 +40,9 @@
 #if ARCH_AARCH64
 # include "aarch64/quant.h"
 #endif
+#if ARCH_MIPS
+# include "mips/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf, f ) \
 { \
@@ -714,7 +717,8 @@
 #endif // HAVE_MMX
 
 #if HAVE_ALTIVEC
-    if( cpu&X264_CPU_ALTIVEC ) {
+    if( cpu&X264_CPU_ALTIVEC )
+    {
         pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
         pf->quant_4x4 = x264_quant_4x4_altivec;
@@ -753,6 +757,32 @@
     {
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
+        pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
+    }
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+        pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
+        pf->denoise_dct = x264_denoise_dct_neon;
+    }
+#endif
+
+#if HAVE_MSA
+    if( cpu&X264_CPU_MSA )
+    {
+        pf->quant_4x4 = x264_quant_4x4_msa;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_msa;
+        pf->quant_4x4x4 = x264_quant_4x4x4_msa;
+        pf->quant_8x8 = x264_quant_8x8_msa;
+        pf->dequant_4x4 = x264_dequant_4x4_msa;
+        pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa;
+        pf->dequant_8x8 = x264_dequant_8x8_msa;
+        pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa;
+        pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa;
     }
 #endif
 #endif // HIGH_BIT_DEPTH
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.c: rectangle filling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.h: rectangle filling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.c -> x264-snapshot-20150804-2245.tar.bz2/common/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.c: quantization init ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.h -> x264-snapshot-20150804-2245.tar.bz2/common/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: quantization init ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.c -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.c: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.h -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.h: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/vlc.c -> x264-snapshot-20150804-2245.tar.bz2/common/vlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * vlc.c : vlc tables ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * win32thread.c: windows threading
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *          Pegasys Inc. <http://www.pegasys-inc.com>
@@ -138,7 +138,7 @@
     if( !win32_cond )
         return -1;
     cond->ptr = win32_cond;
-    win32_cond->semaphore = CreateSemaphore( NULL, 0, 0x7fffffff, NULL );
+    win32_cond->semaphore = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL );
     if( !win32_cond->semaphore )
         return -1;
 
@@ -147,7 +147,7 @@
     if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) )
         return -1;
 
-    win32_cond->waiters_done = CreateEvent( NULL, FALSE, FALSE, NULL );
+    win32_cond->waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL );
    if( !win32_cond->waiters_done )
         return -1;
 
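Switching to the explicitly wide-character entry points does not change behaviour here, because both kernel objects are created unnamed (the name argument is NULL); it simply removes the dependence on whether the UNICODE macro is defined when windows.h is included. A minimal sketch of the same two calls (illustrative only, not the x264 wrapper code):

/* Create the unnamed kernel objects used by the condition-variable
 * emulation, calling the W-suffixed Win32 functions directly. */
#include <windows.h>

static int create_cond_objects( HANDLE *semaphore, HANDLE *waiters_done )
{
    *semaphore    = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL ); /* counting semaphore, initially 0 */
    *waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL );      /* auto-reset event, initially unsignalled */
    return (*semaphore && *waiters_done) ? 0 : -1;
}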
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.h -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.h: windows threading ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* bitstream-a.asm: x86 bitstream functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* cabac-a.asm: x86 cabac ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* cpu-a.asm: x86 cpu utilities
 ;*****************************************************************************
-;* Copyright (C) 2003-2014 x264 project
+;* Copyright (C) 2003-2015 x264 project
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
@@ -145,53 +145,3 @@
 cglobal cpu_sfence
     sfence
     ret
-
-cextern intel_cpu_indicator_init
-
-;-----------------------------------------------------------------------------
-; void safe_intel_cpu_indicator_init( void );
-;-----------------------------------------------------------------------------
-cglobal safe_intel_cpu_indicator_init
-    push r0
-    push r1
-    push r2
-    push r3
-    push r4
-    push r5
-    push r6
-%if ARCH_X86_64
-    push r7
-    push r8
-    push r9
-    push r10
-    push r11
-    push r12
-    push r13
-    push r14
-%endif
-    push rbp
-    mov rbp, rsp
-%if WIN64
-    sub rsp, 32 ; shadow space
-%endif
-    and rsp, ~31
-    call intel_cpu_indicator_init
-    leave
-%if ARCH_X86_64
-    pop r14
-    pop r13
-    pop r12
-    pop r11
-    pop r10
-    pop r9
-    pop r8
-    pop r7
-%endif
-    pop r6
-    pop r5
-    pop r4
-    pop r3
-    pop r2
-    pop r1
-    pop r0
-    ret
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-32.asm: x86_32 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-a.asm: x86 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: x86 transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* deblock-a.asm: x86 deblocking ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com> @@ -40,6 +40,7 @@ deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH +copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 v210_mask: times 4 dq 0xc00ffc003ff003ff v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 @@ -50,6 +51,7 @@ deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else +copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14 deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 @@ -913,64 +915,90 @@ %undef sfence %endif ; !HIGH_BIT_DEPTH +%macro PREFETCHNT_ITER 2 ; src, bytes/iteration + %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? + %rep (%2+63) / 64 ; assume 64 byte cache lines + prefetchnta [%1+%%i] + %assign %%i %%i + 64 + %endrep +%endmacro + ;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, intptr_t i_dst, -; pixel *src, intptr_t i_src, int w, int h ) +; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- -; assumes i_dst and w are multiples of 16, and i_dst>w -INIT_MMX -cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1, r3, r4d -%if HIGH_BIT_DEPTH == 0 +; assumes i_dst and w are multiples of mmsize, and i_dst>w +%macro PLANE_COPY_CORE 1 ; swap +%if %1 +cglobal plane_copy_swap_core, 6,7 + mova m4, [copy_swap_shuf] +%else +cglobal plane_copy_core, 6,7 +%endif + FIX_STRIDES r1, r3 +%if %1 && HIGH_BIT_DEPTH + shl r4d, 2 +%elif %1 || HIGH_BIT_DEPTH + add r4d, r4d +%else movsxdifnidn r4, r4d %endif - sub r1, r4 - sub r3, r4 + add r0, r4 + add r2, r4 + neg r4 .loopy: - lea r6d, [r4-63] + lea r6, [r4+4*mmsize] +%if %1 + test r6d, r6d + jg .skip +%endif .loopx: - prefetchnta [r2+256] - movq m0, [r2 ] - movq m1, [r2+ 8] - movntq [r0 ], m0 - movntq [r0+ 8], m1 - movq m2, [r2+16] - movq m3, [r2+24] - movntq [r0+16], m2 - movntq [r0+24], m3 - movq m4, [r2+32] - movq m5, [r2+40] - movntq [r0+32], m4 - movntq [r0+40], m5 - movq m6, [r2+48] - movq m7, [r2+56] - movntq [r0+48], m6 - movntq [r0+56], m7 - add r2, 64 - add r0, 64 - sub r6d, 64 - jg .loopx - prefetchnta [r2+256] - add r6d, 63 - jle .end16 -.loop16: - movq m0, [r2 ] - movq m1, [r2+8] - movntq [r0 ], m0 - movntq [r0+8], m1 - add r2, 16 - add r0, 16 - sub r6d, 16 - jg .loop16 -.end16: + PREFETCHNT_ITER r2+r6, 4*mmsize + movu m0, [r2+r6-4*mmsize] + movu m1, [r2+r6-3*mmsize] + movu m2, [r2+r6-2*mmsize] + movu m3, [r2+r6-1*mmsize] +%if %1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 +%endif + movnta [r0+r6-4*mmsize], m0 + movnta [r0+r6-3*mmsize], m1 + movnta [r0+r6-2*mmsize], m2 + movnta [r0+r6-1*mmsize], m3 + add r6, 4*mmsize + jle .loopx +.skip: + PREFETCHNT_ITER r2+r6, 4*mmsize + sub r6, 4*mmsize + jz .end +.loop_end: + movu m0, [r2+r6] +%if %1 + pshufb m0, m4 +%endif + movnta [r0+r6], m0 + add r6, mmsize + jl .loop_end +.end: add r0, r1 add r2, r3 - dec r5d + 
dec r5d jg .loopy sfence - emms RET +%endmacro +INIT_XMM sse +PLANE_COPY_CORE 0 +INIT_XMM ssse3 +PLANE_COPY_CORE 1 +INIT_YMM avx +PLANE_COPY_CORE 0 +INIT_YMM avx2 +PLANE_COPY_CORE 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH @@ -2136,7 +2164,7 @@ INIT_YMM avx MBTREE_AVX 8 -INIT_YMM avx2,fma3 +INIT_YMM avx2 MBTREE_AVX 7 %macro MBTREE_PROPAGATE_LIST 0
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -90,8 +90,12 @@ void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); -void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -167,8 +171,8 @@ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ @@ -363,9 +367,6 @@ } #endif // !HIGH_BIT_DEPTH -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - #define MC_LUMA(name,instr1,instr2)\ static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\ pixel *src[4], intptr_t i_src_stride,\ @@ -374,10 +375,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ x264_pixel_avg_wtab_##instr1[i_width>>2](\ dst, i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -412,10 +413,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ 
x264_pixel_avg_wtab_##name[i_width>>2](\ dst, *i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -492,39 +493,94 @@ #endif #endif // HIGH_BIT_DEPTH -static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) -{ - int c_w = 16/sizeof(pixel) - 1; - if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. - x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if( !(w&c_w) ) { - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h ); - } else if( i_src > 0 ) { - // have to use plain memcpy on the last line (in memory order) to avoid overreading src - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 ); - memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) ); - } else { - memcpy( dst, src, w*sizeof(pixel) ); - x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 ); - } +#define PLANE_COPY(align, cpu)\ +static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align) / sizeof(pixel) - 1;\ + if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ + x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ + else if( !(w&c_w) )\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. */\ + memcpy( dst, src, w*sizeof(pixel) );\ + }\ +} + +PLANE_COPY(16, sse) +PLANE_COPY(32, avx) + +#define PLANE_COPY_SWAP(align, cpu)\ +static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ + for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ + {\ + dst[x] = src[x+1];\ + dst[x+1] = src[x];\ + }\ + }\ + else\ + x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } +PLANE_COPY_SWAP(16, ssse3) +PLANE_COPY_SWAP(32, avx2) + #define PLANE_INTERLEAVE(cpu) \ static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ pixel *srcv, intptr_t i_srcv, int w, int h )\ {\ - if( !(w&15) ) {\ + int c_w = 16 / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ - x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( i_srcu > 0 ) {\ - x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ - x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ - } else {\ + else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ + {\ + if( --h > 0 )\ + 
{\ + if( i_srcu > 0 )\ + {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + srcu += i_srcu * h;\ + srcv += i_srcv * h;\ + }\ + else\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ + }\ x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ - x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ }\ + else\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ } PLANE_INTERLEAVE(mmx2) @@ -666,7 +722,6 @@ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2; pf->prefetch_ref = x264_prefetch_ref_mmx2; - pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; @@ -695,6 +750,7 @@ { pf->memcpy_aligned = x264_memcpy_aligned_sse; pf->memzero_aligned = x264_memzero_aligned_sse; + pf->plane_copy = x264_plane_copy_sse; } #if HIGH_BIT_DEPTH @@ -751,6 +807,7 @@ return; pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -855,6 +912,7 @@ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->plane_copy_swap = x264_plane_copy_swap_ssse3; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3; pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; @@ -932,6 +990,7 @@ if( !(cpu&X264_CPU_AVX) ) return; pf->memzero_aligned = x264_memzero_aligned_avx; + pf->plane_copy = x264_plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx; @@ -940,8 +999,7 @@ if( !(cpu&X264_CPU_AVX2) ) return; + pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->get_ref = get_ref_avx2; - - if( cpu&X264_CPU_FMA3 ) - pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2_fma3; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; }
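The rewritten PLANE_COPY wrapper above centres on one guard: for every row except the one at the highest source address, over-reading a few bytes past the row end only spills into the next row, which is valid frame memory, so those rows can be copied with the width rounded up to the SIMD width; the last row in memory order must instead be copied exactly, or the asm core would read past the end of the buffer. A standalone C sketch of that guard, with a plain memcpy standing in for the SIMD core (illustrative only; the real wrapper additionally falls back to the C copy for widths under 256 pixels):

#include <string.h>
#include <stdint.h>
#include <stddef.h>

typedef uint8_t pixel;

/* Stand-in for the aligned SIMD core: copies h rows of w pixels, where w is
 * assumed to be a multiple of the vector width and i_dst is at least w. */
static void plane_copy_core_ref( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h )
{
    for( int y = 0; y < h; y++ )
        memcpy( dst + y*i_dst, src + y*i_src, w );
}

void plane_copy_guarded( pixel *dst, intptr_t i_dst,
                         pixel *src, intptr_t i_src, int w, int h )
{
    const int c_w = 16/sizeof(pixel) - 1;   /* alignment mask for a 16-byte (SSE) core */
    if( !(w&c_w) )
        plane_copy_core_ref( dst, i_dst, src, i_src, w, h );
    else
    {
        if( --h > 0 )
        {
            if( i_src > 0 )
            {   /* positive stride: the bottom row sits highest in memory */
                plane_copy_core_ref( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );
                dst += i_dst * h;
                src += i_src * h;
            }
            else /* negative stride: row 0 sits highest, so skip it here */
                plane_copy_core_ref( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );
        }
        /* exact-width copy of the remaining row avoids reading past the buffer */
        memcpy( dst, src, w*sizeof(pixel) );
    }
}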
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: x86 pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict-c.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: x86 intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* quant-a.asm: x86 quantization and level-run ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: x86 quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad-a.asm: x86 sad functions ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad16-a.asm: x86 high depth sad functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* trellis-64.asm: x86_64 trellis quantization ;***************************************************************************** -;* Copyright (C) 2012-2014 x264 project +;* Copyright (C) 2012-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;*
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/util.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * util.h: x86 inline asm ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> @@ -64,6 +64,15 @@ %endif %endif +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + %ifdef PREFIX %define mangle(x) _ %+ x %else @@ -74,10 +83,6 @@ SECTION .rodata align=%1 %endmacro -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - %if WIN64 %define PIC %elif ARCH_X86_64 == 0 @@ -90,6 +95,10 @@ default rel %endif +%ifdef __NASM_VER__ + %use smartalign +%endif + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -675,7 +684,7 @@ CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %2:function %%VISIBILITY %else global %2 @@ -701,14 +710,16 @@ ; like cextern, but without the prefix %macro cextern_naked 1 - %xdefine %1 mangle(%1) + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %1:data hidden %else global %1 @@ -716,10 +727,9 @@ %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. -%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. 
+%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; cpuflags @@ -738,8 +748,8 @@ %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) @@ -789,9 +799,17 @@ %endif %if ARCH_X86_64 || cpuflag(sse2) - CPU amdnop + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif %else - CPU basicnop + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif %endif %endmacro @@ -868,7 +886,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i + CAT_XDEFINE nnymm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 @@ -1070,6 +1088,8 @@ %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function %endif %endif %endif @@ -1206,7 +1226,7 @@ AVX_INSTR minss, sse, 1, 0, 1 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse -AVX_INSTR movd +AVX_INSTR movd, mmx AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 @@ -1222,7 +1242,7 @@ AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse -AVX_INSTR movq +AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR movsldup, sse3 @@ -1468,13 +1488,15 @@ FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif %endif
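One consequence of the cpuflags reordering above is visible earlier in this revision: because cpuflags_avx2 now ORs in cpuflags_fma3, any function declared with an avx2 suffix automatically satisfies the fma3 capability test, which is why the kernel previously declared as avx2,fma3 (and called as _avx2_fma3 from mc-c.c) is now plain avx2. The cpuflag() test in x86inc is effectively a subset check on these masks; a small C sketch of the same arithmetic (flag values copied from the diff, with the lower SSE implication chain omitted for brevity):

#include <stdio.h>

/* Flag values after the reorder: AVX2 now implies FMA3 because its mask
 * includes all of FMA3's bits. */
enum
{
    CPUFLAGS_AVX  = 1 << 11,
    CPUFLAGS_FMA3 = (1 << 14) | CPUFLAGS_AVX,
    CPUFLAGS_AVX2 = (1 << 15) | CPUFLAGS_FMA3,
};

static int cpuflag( unsigned current, unsigned wanted )
{
    return (current & wanted) == wanted;   /* subset test, as in x86inc's cpuflag() */
}

int main( void )
{
    printf( "avx2 implies fma3: %d\n", cpuflag( CPUFLAGS_AVX2, CPUFLAGS_FMA3 ) ); /* prints 1 */
    printf( "fma3 implies avx2: %d\n", cpuflag( CPUFLAGS_FMA3, CPUFLAGS_AVX2 ) ); /* prints 0 */
    return 0;
}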
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/config.guess -> x264-snapshot-20150804-2245.tar.bz2/config.guess
Changed
@@ -979,6 +979,9 @@
     ppc64:Linux:*:*)
 	echo powerpc64-unknown-linux-gnu
 	exit ;;
+    ppc64le:Linux:*:*)
+	echo powerpc64le-unknown-linux-gnu
+	exit ;;
     ppc:Linux:*:*)
 	echo powerpc-unknown-linux-gnu
 	exit ;;
View file
x264-snapshot-20141218-2245.tar.bz2/configure -> x264-snapshot-20150804-2245.tar.bz2/configure
Changed
@@ -77,7 +77,7 @@ # several non gcc compilers issue an incredibly large number of warnings on any warning level, # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them for arg in $*; do - [ $arg = -ffast-math ] && arg= + [ "$arg" = -ffast-math ] && arg= [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= @@ -105,10 +105,10 @@ cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} - [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib - [ ${arg#-L} != $arg ] && arg=-libpath:${arg#-L} - [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware - [ $arg = -s ] && arg= + [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib + [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} + [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware + [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= @@ -119,6 +119,7 @@ arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib + [ "$arg" = x264.lib ] && arg=libx264.lib [ -n "$arg" ] && echo -n "$arg " done @@ -143,7 +144,9 @@ log_check "for $3 in $1"; fi rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" @@ -172,7 +175,9 @@ cpp_check() { log_check "whether $3 is true" rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" @@ -256,6 +261,48 @@ exit 1 } +configure_system_override() { + log_check "system libx264 configuration" + x264_config_path="$1/x264_config.h" + if [ -e "$x264_config_path" ]; then + res=$? + log_ok + arg="$(grep '#define X264_GPL ' $x264_config_path | sed -e 's/#define X264_GPL *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + [ "$arg" != "$gpl" ] && die "Incompatible license with system libx264" + fi + arg="$(grep '#define X264_BIT_DEPTH ' $x264_config_path | sed -e 's/#define X264_BIT_DEPTH *//; s/ *$//')" + if [ -n "$arg" ]; then + if [ "$arg" != "$bit_depth" ]; then + echo "Override output bit depth with system libx264 configuration" + bit_depth="$arg" + fi + fi + arg="$(grep '#define X264_CHROMA_FORMAT ' $x264_config_path | sed -e 's/#define X264_CHROMA_FORMAT *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="all" || arg="${arg#X264_CSP_I}" + if [ "$arg" != "$chroma_format" ]; then + echo "Override output chroma format with system libx264 configuration" + chroma_format="$arg" + fi + fi + arg="$(grep '#define X264_INTERLACED ' $x264_config_path | sed -e 's/#define X264_INTERLACED *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + if [ "$arg" != "$interlaced" ]; then + echo "Override interlaced encoding support with system libx264 configuration" + interlaced="$arg" + fi + fi + else + res=$? 
+ log_fail + log_msg "Failed search path was: $x264_config_path" + fi + return $res +} + rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest* SRCPATH="$(cd $(dirname $0); pwd)" @@ -311,7 +358,8 @@ # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ - LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC" + LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ + MSA" # parse options @@ -458,6 +506,8 @@ host_vendor="${host%%-*}" host_os="${host#*-}" +trap 'rm -f conftest*' EXIT + # test for use of compilers that require specific handling cc_base=`basename "$CC"` QPRE="-" @@ -600,9 +650,9 @@ case $host_cpu in i*86) ARCH="X86" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" - ASFLAGS="$ASFLAGS -O2 -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" + ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" @@ -629,39 +679,39 @@ stack_alignment=4 fi if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho -DPREFIX" + ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf" + ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX" + ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX" if cc_check '' "-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then - ASFLAGS="$ASFLAGS -f win32 -m amd64" + ASFLAGS="$ASFLAGS -f win64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf -m amd64" + ASFLAGS="$ASFLAGS -f elf64" fi ;; - powerpc|powerpc64) + powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC @@ -678,13 +728,15 @@ sparc) ARCH="SPARC" ;; - mips|mipsel|mips64|mips64el) + mips*) ARCH="MIPS" + AS="${AS-${CC}}" + AS_EXT=".c" ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then - AS="${AS-extras/gas-preprocessor.pl $CC}" + AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! 
echo $CFLAGS | grep -Eq '\-arch' ; then @@ -698,7 +750,7 @@ aarch64) ARCH="AARCH64" if [ "$SYS" = MACOSX ] ; then - AS="${AS-extras/gas-preprocessor.pl $CC}" + AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch aarch64 -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX" else AS="${AS-${CC}}" @@ -788,9 +840,6 @@ fi if [ $asm = auto -a $ARCH = AARCH64 ] ; then - # set flags so neon is built by default - echo $CFLAGS | grep -Eq '(-mcpu|-march|-mfpu|-arch)' || CFLAGS="$CFLAGS -arch arm64 -mfpu=neon" - if cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define HAVE_NEON ASFLAGS="$ASFLAGS -c" else @@ -805,6 +854,20 @@ as_check ".func test${NL}.endfunc" && define HAVE_AS_FUNC 1 fi +if [ $asm = auto -a $ARCH = MIPS ] ; then + if ! echo $CFLAGS | grep -Eq '(-march|-mmsa|-mno-msa)' ; then + cc_check '' '-mmsa -mfp64 -mhard-float' && CFLAGS="-mmsa -mfp64 -mhard-float $CFLAGS" + fi + + if cc_check '' '' '__asm__("addvi.b $w0, $w1, 1");' ; then + define HAVE_MSA + else + echo "You specified a pre-MSA CPU in your CFLAGS." + echo "If you really want to run on such a CPU, configure with --disable-asm." + exit 1 + fi +fi + [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" @@ -815,16 +878,29 @@ ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" # skip endianness check for Intel Compiler and MSVS, as all supported platforms are little. each have flags that will cause the check to fail as well +CPU_ENDIAN="little-endian" if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c $CC $CFLAGS conftest.c -c -o conftest.o 2>/dev/null || die "endian test failed" if (${cross_prefix}strings -a conftest.o | grep -q BIGE) && (${cross_prefix}strings -a conftest.o | grep -q FPendian) ; then define WORDS_BIGENDIAN + CPU_ENDIAN="big-endian" elif !(${cross_prefix}strings -a conftest.o | grep -q EGIB && ${cross_prefix}strings -a conftest.o | grep -q naidnePF) ; then die "endian test failed" fi fi +if [ "$cli_libx264" = "system" -a "$shared" != "yes" ] ; then + [ "$static" = "yes" ] && die "Option --system-libx264 can not be used together with --enable-static" + if ${cross_prefix}pkg-config --exists x264 2>/dev/null; then + X264_LIBS="$(${cross_prefix}pkg-config --libs x264)" + X264_INCLUDE_DIR="${X264_INCLUDE_DIR-$(${cross_prefix}pkg-config --variable=includedir x264)}" + configure_system_override "$X264_INCLUDE_DIR" || die "Detection of system libx264 configuration failed" + else + die "Can not find system libx264" + fi +fi + # autodetect options that weren't forced nor disabled # pthread-win32 is lgpl, prevent its use if --disable-gpl is specified and targeting windows @@ -1044,7 +1120,7 @@ cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT if [ "$pic" = "yes" ] ; then - CFLAGS="$CFLAGS -fPIC" + [ "$SYS" != WINDOWS -a "$SYS" != CYGWIN ] && CFLAGS="$CFLAGS -fPIC" ASFLAGS="$ASFLAGS -DPIC" # resolve textrels in the x86 asm cc_check stdio.h "-shared -Wl,-Bsymbolic" && SOFLAGS="$SOFLAGS -Wl,-Bsymbolic" @@ -1093,6 +1169,12 @@ CFLAGS="-Wno-maybe-uninitialized $CFLAGS" fi +if [ $compiler = ICC -o $compiler = ICL ] ; then + if cc_check 'extras/intel_dispatcher.h' '' 'x264_intel_dispatcher_override();' ; then + define HAVE_INTEL_DISPATCHER + fi +fi + if [ "$bit_depth" -gt "8" ]; then define HIGH_BIT_DEPTH ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1" @@ -1131,6 +1213,31 @@ grep -q "HAVE_$var 1" config.h || define HAVE_$var 0 done +# generate exported config file + 
+config_chroma_format="X264_CSP_I$chroma_format" +[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0" +cat > x264_config.h << EOF +#define X264_BIT_DEPTH $bit_depth +#define X264_GPL $x264_gpl +#define X264_INTERLACED $x264_interlaced +#define X264_CHROMA_FORMAT $config_chroma_format +EOF + +${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h + +if [ "$cli_libx264" = "system" ] ; then + if [ "$shared" = "yes" ]; then + CLI_LIBX264='$(SONAME)' + else + CLI_LIBX264= + LDFLAGSCLI="$X264_LIBS $LDFLAGSCLI" + cc_check 'stdint.h x264.h' '' 'x264_encoder_open(0);' || die "System libx264 can't be used for compilation of this version" + fi +else + CLI_LIBX264='$(LIBX264)' +fi + DEPMM="${QPRE}MM" DEPMT="${QPRE}MT" if [ $compiler_style = MS ]; then @@ -1183,19 +1290,6 @@ PROF_USE_LD="-fprofile-use" fi -rm -f conftest* - -# generate exported config file - -config_chroma_format="X264_CSP_I$chroma_format" -[ "$config_chroma_format" == "X264_CSP_Iall" ] && config_chroma_format="0" -cat > x264_config.h << EOF -#define X264_BIT_DEPTH $bit_depth -#define X264_GPL $x264_gpl -#define X264_INTERLACED $x264_interlaced -#define X264_CHROMA_FORMAT $config_chroma_format -EOF - # generate config files cat > config.mak << EOF @@ -1205,7 +1299,7 @@ bindir=$bindir libdir=$libdir includedir=$includedir -ARCH=$ARCH +SYS_ARCH=$ARCH SYS=$SYS CC=$CC CFLAGS=$CFLAGS @@ -1284,23 +1378,9 @@ echo 'install: install-lib-static' >> config.mak fi -if [ "$cli_libx264" = "system" ] ; then - if [ "$shared" = "yes" ]; then - CLI_LIBX264='$(SONAME)' - elif ${cross_prefix}pkg-config --exists x264 2>/dev/null; then - LDFLAGSCLI="$LDFLAGSCLI $(${cross_prefix}pkg-config --libs x264)" - CLI_LIBX264= - else - die "Can not find system libx264" - fi -else - CLI_LIBX264='$(LIBX264)' -fi echo "LDFLAGSCLI = $LDFLAGSCLI" >> config.mak echo "CLI_LIBX264 = $CLI_LIBX264" >> config.mak -${SRCPATH}/version.sh "${SRCPATH}" >> x264_config.h - cat > x264.pc << EOF prefix=$prefix exec_prefix=$exec_prefix @@ -1322,6 +1402,7 @@ cat > conftest.log <<EOF platform: $ARCH +byte order: $CPU_ENDIAN system: $SYS cli: $cli libx264: $cli_libx264 @@ -1348,7 +1429,6 @@ echo >> config.log cat conftest.log >> config.log cat conftest.log -rm conftest.log [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile mkdir -p common/{aarch64,arm,ppc,x86} encoder extras filters/video input output tools
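The new byte-order reporting in the configure summary reuses the existing compile-only endianness probe shown above, which never has to execute target code: it compiles an integer and a double whose big-endian byte images spell out ASCII strings, then greps the object file for them. A standalone C sketch of why those particular constants work (run on the build host purely for illustration; configure itself only inspects the object bytes with strings/grep):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main( void )
{
    const uint32_t imagic = 0x42494745;             /* 'B','I','G','E' in big-endian byte order */
    const double   fmagic = 0x1.0656e6469616ep+102; /* bit pattern 0x4650656E6469616E = "FPendian" */
    char ibuf[5] = {0}, fbuf[9] = {0};

    memcpy( ibuf, &imagic, 4 );
    memcpy( fbuf, &fmagic, 8 );
    /* Big-endian object files contain "BIGE" and "FPendian"; little-endian
     * ones contain the reversed "EGIB" and "naidnePF", which is exactly what
     * the grep in configure distinguishes. */
    printf( "int bytes: %s, double bytes: %s\n", ibuf, fbuf );
    return 0;
}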
View file
x264-snapshot-20141218-2245.tar.bz2/doc/vui.txt -> x264-snapshot-20150804-2245.tar.bz2/doc/vui.txt
Changed
@@ -16,14 +16,14 @@ * How do I use it? You can derive the SAR of an image from the width, height and the display aspect ratio (DAR) of the image as follows: - + SAR_x DAR_x * height ----- = -------------- SAR_y DAR_y * width - + for example: width x height = 704x576, DAR = 4:3 ==> SAR = 2304:2112 or 12:11 - + Please note that if your material is a digitized analog signal, you should not use this equation to calculate the SAR. Refer to the manual of your digitizing equipment or this link instead. @@ -36,7 +36,7 @@ correction of aspect ratios, and there are just few exceptions. You should even use it, if the SAR of your material is 1:1, as the default of x264 is "SAR not defined". - + 2. Overscan ------------ @@ -49,7 +49,7 @@ analog signal. Instead it refers to the "overscan" process on a display that shows only a part of the image. What that part is depends on the display. - + * How do I use this option? As I'm not sure about what part of the image is shown when the display uses an overscan process, I can't provide you with rules or examples. The safe @@ -72,7 +72,7 @@ * What is it? A purely informative setting, that explains what the type of your analog video was, before you digitized it. - + * How do I use this option? Just set it to the desired value. ( e.g. NTSC, PAL ) If you transcode from MPEG2, you may find the value for this option in the @@ -101,11 +101,11 @@ or want to make sure that your material is played back without oversaturation, set if to on. Please note that the default for this option in x264 is off, which is not a safe assumption. - + * Should I use this option? Yes, but there are few decoders/ media players that distinguish between the two options. - + 5. Color Primaries, Transfer Characteristics, Matrix Coefficients ------------------------------------------------------------------- @@ -120,7 +120,7 @@ profile of the digitizing equipment is known, it is possible to correct the colors and gamma of the decoded h264 stream in a way that the video stream looks the same, regardless of the digitizing equipment used. - + * How do I use these options? If you are able to find out which characteristics your digitizing equipment uses, (see the equipment documentation or make reference measurements) @@ -170,9 +170,8 @@ chroma sample location in that direction is equal to one of the luma samples. H264 Annex E contains images that tell you how to "transform" your Chroma Sample Location into a value of 0 to 5 that you can pass to x264. - + * Should I use this option? Unless you are a perfectionist, don't bother. Media players ignore this setting, and favor their own (fixed) assumed Chroma Sample Location. -
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.c: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.h: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: cabac bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cavlc.c: cavlc bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/encoder.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * encoder.c: top-level encoder functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -32,6 +32,9 @@ #include "ratecontrol.h" #include "macroblock.h" #include "me.h" +#if HAVE_INTEL_DISPATCHER +#include "extras/intel_dispatcher.h" +#endif //#define DEBUG_MB_TYPE @@ -471,12 +474,12 @@ int i_csp = h->param.i_csp & X264_CSP_MASK; #if X264_CHROMA_FORMAT - if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp <= X264_CSP_NV12 ) + if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" ); return -1; } - else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 ) + else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" ); return -1; @@ -489,36 +492,41 @@ #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } - if( i_csp < X264_CSP_I444 && h->param.i_width % 2 ) + int w_mod = i_csp < X264_CSP_I444 ? 2 : 1; + int h_mod = (i_csp < X264_CSP_I422 ? 2 : 1) << PARAM_INTERLACED; + if( h->param.i_width % w_mod ) { - x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "width not divisible by %d (%dx%d)\n", + w_mod, h->param.i_width, h->param.i_height ); return -1; } - - if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 ) + if( h->param.i_height % h_mod ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "height not divisible by %d (%dx%d)\n", + h_mod, h->param.i_width, h->param.i_height ); return -1; } - if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 ) + if( h->param.crop_rect.i_left >= h->param.i_width || + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top >= h->param.i_height || + h->param.crop_rect.i_bottom >= h->param.i_height || + h->param.crop_rect.i_left + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top + h->param.crop_rect.i_bottom >= h->param.i_height ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom ); return -1; } - - if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width || - (h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height ) + if( h->param.crop_rect.i_left % w_mod || h->param.crop_rect.i_right % w_mod || + h->param.crop_rect.i_top % h_mod || h->param.crop_rect.i_bottom % h_mod ) { - x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, - h->param.crop_rect.i_top, h->param.crop_rect.i_right, 
h->param.crop_rect.i_bottom ); + x264_log( h, X264_LOG_ERROR, "crop-rect %u,%u,%u,%u not divisible by %dx%d\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom, w_mod, h_mod ); return -1; } @@ -529,7 +537,13 @@ } if( h->param.i_threads == X264_THREADS_AUTO ) + { h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; + /* Avoid too many threads as they don't improve performance and + * complicate VBV. Capped at an arbitrary 2 rows per thread. */ + int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 2 ); + h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); + } int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -583,7 +597,20 @@ h->param.i_dpb_size = 1; } - h->param.i_frame_packing = x264_clip3( h->param.i_frame_packing, -1, 5 ); + if( h->param.i_frame_packing < -1 || h->param.i_frame_packing > 7 ) + { + x264_log( h, X264_LOG_WARNING, "ignoring unknown frame packing value\n" ); + h->param.i_frame_packing = -1; + } + if( h->param.i_frame_packing == 7 && + ((h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right) % 3 || + (h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom) % 3) ) + { + x264_log( h, X264_LOG_ERROR, "cropped resolution %dx%d not compatible with tile format frame packing\n", + h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right, + h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom ); + return -1; + } /* Detect default ffmpeg settings and terminate with an error. */ if( b_open ) @@ -1050,7 +1077,7 @@ h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); - h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 ); + h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 3 ); h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 ); if( h->param.rc.f_aq_strength == 0 ) h->param.rc.i_aq_mode = 0; @@ -1390,6 +1417,10 @@ if( param->param_free ) param->param_free( param ); +#if HAVE_INTEL_DISPATCHER + x264_intel_dispatcher_override(); +#endif + if( x264_threading_init() ) { x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" ); @@ -1676,6 +1707,7 @@ else if( !x264_is_regular_file( f ) ) { x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv ); + fclose( f ); goto fail; } fclose( f ); @@ -3213,6 +3245,12 @@ /* ------------------- Setup new frame from picture -------------------- */ if( pic_in != NULL ) { + if( h->lookahead->b_exit_thread ) + { + x264_log( h, X264_LOG_ERROR, "lookahead thread is already stopped\n" ); + return -1; + } + /* 1: Copy the picture to a frame and move it to a buffer */ x264_frame_t *fenc = x264_frame_pop_unused( h, 0 ); if( !fenc ) @@ -4087,14 +4125,14 @@ if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, "mb I %s\n", buf ); } if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; + double i_count = 
(double)h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_P]; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, @@ -4110,7 +4148,7 @@ if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; double i_mb_list_count; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B]; int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
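Among the larger changes in the encoder.c hunk above: the hard-coded "width divisible by 2 / height divisible by 4" checks are replaced by per-colorspace moduli that also cover the crop rectangle, frame packing value 7 (tile format) is accepted, AQ mode 3 is allowed, and automatic thread counts are capped at roughly one thread per two macroblock rows (so a 1080p encode, 68 rows, now gets at most 34 frame threads regardless of core count). A minimal standalone sketch of the new divisibility rule, with an invented helper name — the real check lives inside encoder parameter validation and uses PARAM_INTERLACED:

    #include <x264.h>

    /* Width must be even for 4:2:0/4:2:2 input, height must be even for 4:2:0
     * (doubled again for interlaced streams), mirroring the hunk above. */
    static int csp_mod_ok( int csp, int width, int height, int interlaced )
    {
        int i_csp = csp & X264_CSP_MASK;
        int w_mod = i_csp < X264_CSP_I444 ? 2 : 1;
        int h_mod = (i_csp < X264_CSP_I422 ? 2 : 1) << !!interlaced;
        return width % w_mod == 0 && height % h_mod == 0;
    }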
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * lookahead.c: high-level lookahead functions ***************************************************************************** - * Copyright (C) 2010-2014 Avail Media and x264 project + * Copyright (C) 2010-2015 Avail Media and x264 project * * Authors: Michael Kazmier <mkazmier@availmedia.com> * Alex Giladi <agiladi@availmedia.com>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.c: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.h: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.c: ratecontrol ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Michael Niedermayer <michaelni@gmx.at> @@ -96,6 +96,7 @@ /* VBV stuff */ double buffer_size; int64_t buffer_fill_final; + int64_t buffer_fill_final_min; double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */ double buffer_rate; /* # of bits added to buffer_fill after each frame */ double vbv_max_rate; /* # of bits added to buffer_fill per second */ @@ -301,10 +302,6 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets ) { - /* constants chosen to result in approximately the same overall bitrate as without AQ. - * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ - float strength; - float avg_adj = 0.f; /* Initialize frame stats */ for( int i = 0; i < 3; i++ ) { @@ -348,23 +345,30 @@ /* Actual adaptive quantization */ else { - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + /* constants chosen to result in approximately the same overall bitrate as without AQ. + * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ + float strength; + float avg_adj = 0.f; + float bias_strength = 0.f; + + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE || h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) { - float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f); + float bit_depth_correction = 1.f / (1 << (2*(BIT_DEPTH-8))); float avg_adj_pow2 = 0.f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) { uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame ); - float qp_adj = powf( energy + 1, 0.125f ); + float qp_adj = powf( energy * bit_depth_correction + 1, 0.125f ); frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj; avg_adj += qp_adj; avg_adj_pow2 += qp_adj * qp_adj; } avg_adj /= h->mb.i_mb_count; avg_adj_pow2 /= h->mb.i_mb_count; - strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction; - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj; + strength = h->param.rc.f_aq_strength * avg_adj; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj; + bias_strength = h->param.rc.f_aq_strength; } else strength = h->param.rc.f_aq_strength * 1.0397f; @@ -374,7 +378,12 @@ { float qp_adj; int mb_xy = mb_x + mb_y*h->mb.i_mb_stride; - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) + { + qp_adj = frame->f_qp_offset[mb_xy]; + qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 14.f / (qp_adj * qp_adj)); + } + else if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) { qp_adj = frame->f_qp_offset[mb_xy]; qp_adj = strength * (qp_adj - avg_adj); @@ -724,7 +733,8 @@ if( h->param.rc.f_vbv_buffer_init > 1. 
) h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 ); h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1); - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; + rc->buffer_fill_final = + rc->buffer_fill_final_min = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; rc->b_vbv = 1; rc->b_vbv_min_rate = !rc->b_2pass && h->param.rc.i_rc_method == X264_RC_ABR @@ -776,11 +786,11 @@ if( h->param.i_nal_hrd ) { uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale; - uint64_t num = 180000; + uint64_t num = 90000; x264_reduce_fraction64( &num, &denom ); - rc->hrd_multiply_denom = 180000 / num; + rc->hrd_multiply_denom = 90000 / num; - double bits_required = log2( 180000 / rc->hrd_multiply_denom ) + double bits_required = log2( 90000 / rc->hrd_multiply_denom ) + log2( h->sps->vui.i_time_scale ) + log2( h->sps->vui.hrd.i_cpb_size_unscaled ); if( bits_required >= 63 ) @@ -822,6 +832,7 @@ int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1; CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds ); CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) ); + static const float pred_coeff_table[3] = { 1.0, 1.0, 1.5 }; for( int i = 0; i < 3; i++ ) { rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP ); @@ -829,8 +840,8 @@ rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max ); for( int j = 0; j < num_preds; j++ ) { - rc->pred[i+j*5].coeff_min = 2.0 / 4; - rc->pred[i+j*5].coeff = 2.0; + rc->pred[i+j*5].coeff_min = pred_coeff_table[i] / 2; + rc->pred[i+j*5].coeff = pred_coeff_table[i]; rc->pred[i+j*5].count = 1.0; rc->pred[i+j*5].decay = 0.5; rc->pred[i+j*5].offset = 0.0; @@ -844,7 +855,11 @@ rc->row_preds[i][j].offset = 0.0; } } - *rc->pred_b_from_p = rc->pred[0]; + rc->pred_b_from_p->coeff_min = 0.5 / 2; + rc->pred_b_from_p->coeff = 0.5; + rc->pred_b_from_p->count = 1.0; + rc->pred_b_from_p->decay = 0.5; + rc->pred_b_from_p->offset = 0.0; if( parse_zones( h ) < 0 ) { @@ -1914,15 +1929,16 @@ h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; - double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; if( h->fenc->b_keyframe ) { - rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; - rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; - rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; + rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; + rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; + rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; } - else - cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; + + double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; + if( !h->fenc->b_keyframe ) + cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; if( h->sps->vui.hrd.b_cbr_hrd ) h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time; @@ -2095,7 +2111,7 @@ int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled; x264_ratecontrol_t *rcc = h->rc; x264_ratecontrol_t *rct = h->thread[0]->rc; - uint64_t buffer_size = 
(uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; + int64_t buffer_size = (int64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; if( rcc->last_satd >= h->mb.i_mb_count ) update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits ); @@ -2103,32 +2119,45 @@ if( !rcc->b_vbv ) return filler; - rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale; + uint64_t buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; + rct->buffer_fill_final -= buffer_diff; + rct->buffer_fill_final_min -= buffer_diff; - if( rct->buffer_fill_final < 0 ) + if( rct->buffer_fill_final_min < 0 ) { - double underflow = (double)rct->buffer_fill_final / h->sps->vui.i_time_scale; + double underflow = (double)rct->buffer_fill_final_min / h->sps->vui.i_time_scale; if( rcc->rate_factor_max_increment && rcc->qpm >= rcc->qp_novbv + rcc->rate_factor_max_increment ) x264_log( h, X264_LOG_DEBUG, "VBV underflow due to CRF-max (frame %d, %.0f bits)\n", h->i_frame, underflow ); else x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, underflow ); + rct->buffer_fill_final = + rct->buffer_fill_final_min = 0; } - rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 ); if( h->param.i_avcintra_class ) - rct->buffer_fill_final += buffer_size; + buffer_diff = buffer_size; else - rct->buffer_fill_final += (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; - - if( h->param.rc.b_filler && rct->buffer_fill_final > buffer_size ) - { - int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; - filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale; - bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8; - rct->buffer_fill_final -= (uint64_t)bits * h->sps->vui.i_time_scale; + buffer_diff = (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; + rct->buffer_fill_final += buffer_diff; + rct->buffer_fill_final_min += buffer_diff; + + if( rct->buffer_fill_final > buffer_size ) + { + if( h->param.rc.b_filler ) + { + int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; + filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale; + bits = h->param.i_avcintra_class ? filler * 8 : X264_MAX( (FILLER_OVERHEAD - h->param.b_annexb), filler ) * 8; + buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; + rct->buffer_fill_final -= buffer_diff; + rct->buffer_fill_final_min -= buffer_diff; + } + else + { + rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size ); + rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, buffer_size ); + } } - else - rct->buffer_fill_final = X264_MIN( rct->buffer_fill_final, buffer_size ); return filler; } @@ -2139,23 +2168,27 @@ uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale / rct->hrd_multiply_denom; uint64_t cpb_state = rct->buffer_fill_final; uint64_t cpb_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; - uint64_t multiply_factor = 180000 / rct->hrd_multiply_denom; + uint64_t multiply_factor = 90000 / rct->hrd_multiply_denom; - if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > cpb_size ) + if( rct->buffer_fill_final < 0 || rct->buffer_fill_final > (int64_t)cpb_size ) { - x264_log( h, X264_LOG_WARNING, "CPB %s: %.0lf bits in a %.0lf-bit buffer\n", - rct->buffer_fill_final < 0 ? 
"underflow" : "overflow", (float)rct->buffer_fill_final/denom, (float)cpb_size/denom ); + x264_log( h, X264_LOG_WARNING, "CPB %s: %.0f bits in a %.0f-bit buffer\n", + rct->buffer_fill_final < 0 ? "underflow" : "overflow", + (double)rct->buffer_fill_final / h->sps->vui.i_time_scale, (double)cpb_size / h->sps->vui.i_time_scale ); } - h->initial_cpb_removal_delay = (multiply_factor * cpb_state + denom) / (2*denom); - h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size + denom) / (2*denom) - h->initial_cpb_removal_delay; + h->initial_cpb_removal_delay = (multiply_factor * cpb_state) / denom; + h->initial_cpb_removal_delay_offset = (multiply_factor * cpb_size) / denom - h->initial_cpb_removal_delay; + + int64_t decoder_buffer_fill = h->initial_cpb_removal_delay * denom / multiply_factor; + rct->buffer_fill_final_min = X264_MIN( rct->buffer_fill_final_min, decoder_buffer_fill ); } // provisionally update VBV according to the planned size of all frames currently in progress static void update_vbv_plan( x264_t *h, int overhead ) { x264_ratecontrol_t *rcc = h->rc; - rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final / h->sps->vui.i_time_scale; + rcc->buffer_fill = h->thread[0]->rc->buffer_fill_final_min / h->sps->vui.i_time_scale; if( h->i_thread_frames > 1 ) { int j = h->rc - h->thread[0]->rc;
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.h: ratecontrol ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/rdo.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rdo.c: rate-distortion optimization ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com> @@ -180,7 +180,7 @@ else { x264_macroblock_size_cavlc( h ); - i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; + i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; } h->mb.b_transform_8x8 = b_transform_bak; @@ -261,7 +261,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; + i_bits = (uint64_t)x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -297,7 +297,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -331,7 +331,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -357,7 +357,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_chroma_size_cavlc( h ) * i_lambda2; + i_bits = (uint64_t)x264_chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; }
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -166,7 +166,7 @@ while( (1 << sps->i_log2_max_frame_num) <= max_frame_num ) sps->i_log2_max_frame_num++; - sps->i_poc_type = param->i_bframe || param->b_interlaced ? 0 : 2; + sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2; if( sps->i_poc_type == 0 ) { int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2; @@ -578,7 +578,7 @@ memcpy( payload, uuid, 16 ); sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - " - "Copy%s 2003-2014 - http://www.videolan.org/x264.html - options: %s", + "Copy%s 2003-2015 - http://www.videolan.org/x264.html - options: %s", X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts ); length = strlen(payload)+1; @@ -663,7 +663,7 @@ bs_write1( &q, quincunx_sampling_flag ); // quincunx_sampling_flag // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right - bs_write ( &q, 6, 1 ); // content_interpretation_type + bs_write ( &q, 6, h->param.i_frame_packing != 6 ); // content_interpretation_type bs_write1( &q, 0 ); // spatial_flipping_flag bs_write1( &q, 0 ); // frame0_flipped_flag
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype-cl.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype-cl.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Steve Borho <sborho@multicorewareinc.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype.c: lookahead analysis ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> @@ -612,7 +612,6 @@ if( b_bidir ) { - int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] ); m[1].i_pixel = PIXEL_8x8; @@ -624,14 +623,20 @@ LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); m[1].p_fref_w = m[1].p_fref[0]; - dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; - dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; - dmv[1][0] = dmv[0][0] - mvr[0]; - dmv[1][1] = dmv[0][1] - mvr[1]; - CLIP_MV( dmv[0] ); - CLIP_MV( dmv[1] ); - if( h->param.analyse.i_subpel_refine <= 1 ) - M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF ) + { + int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; + dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; + dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; + dmv[1][0] = dmv[0][0] - mvr[0]; + dmv[1][1] = dmv[0][1] - mvr[1]; + CLIP_MV( dmv[0] ); + CLIP_MV( dmv[1] ); + if( h->param.analyse.i_subpel_refine <= 1 ) + M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + } + else + M64( dmv ) = 0; TRY_BIDIR( dmv[0], dmv[1], 0 ); if( M64( dmv ) ) @@ -1104,7 +1109,7 @@ if( b_intra ) x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 ); - while( i > 0 && frames[i]->i_type == X264_TYPE_B ) + while( i > 0 && IS_X264_TYPE_B( frames[i]->i_type ) ) i--; last_nonb = i; @@ -1132,7 +1137,7 @@ while( i-- > idx ) { cur_nonb = i; - while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 ) + while( IS_X264_TYPE_B( frames[cur_nonb]->i_type ) && cur_nonb > 0 ) cur_nonb--; if( cur_nonb < idx ) break; @@ -1226,7 +1231,7 @@ int last_nonb = 0, cur_nonb = 1, idx = 0; x264_frame_t *prev_frame = NULL; int prev_frame_idx = 0; - while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb < num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; int next_nonb = keyframe ? last_nonb : cur_nonb; @@ -1278,7 +1283,7 @@ } last_nonb = cur_nonb; cur_nonb++; - while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb <= num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; } frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO; @@ -1288,36 +1293,39 @@ { int loc = 1; int cost = 0; - int cur_p = 0; + int cur_nonb = 0; path--; /* Since the 1st path element is really the second frame */ while( path[loc] ) { - int next_p = loc; - /* Find the location of the next P-frame. */ - while( path[next_p] != 'P' ) - next_p++; - - /* Add the cost of the P-frame found above */ - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 ); + int next_nonb = loc; + /* Find the location of the next non-B-frame. 
*/ + while( path[next_nonb] == 'B' ) + next_nonb++; + + /* Add the cost of the non-B-frame found above */ + if( path[next_nonb] == 'P' ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_nonb, 0 ); + else /* I-frame */ + cost += x264_slicetype_frame_cost( h, a, frames, next_nonb, next_nonb, next_nonb, 0 ); /* Early terminate if the cost we have found is larger than the best path cost so far */ if( cost > threshold ) break; - if( h->param.i_bframe_pyramid && next_p - cur_p > 2 ) + if( h->param.i_bframe_pyramid && next_nonb - cur_nonb > 2 ) { - int middle = cur_p + (next_p - cur_p)/2; - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 ); + int middle = cur_nonb + (next_nonb - cur_nonb)/2; + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, middle, 0 ); for( int next_b = loc; next_b < middle && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 ); - for( int next_b = middle+1; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 ); + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, middle, next_b, 0 ); + for( int next_b = middle+1; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, middle, next_nonb, next_b, 0 ); } else - for( int next_b = loc; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 ); + for( int next_b = loc; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_b, 0 ); - loc = next_p + 1; - cur_p = next_p; + loc = next_nonb + 1; + cur_nonb = next_nonb; } return cost; } @@ -1331,6 +1339,7 @@ char paths[2][X264_LOOKAHEAD_MAX+1]; int num_paths = X264_MIN( h->param.i_bframe+1, length ); int best_cost = COST_MAX; + int best_possible = 0; int idx = 0; /* Iterate over all currently possible paths */ @@ -1342,12 +1351,33 @@ memset( paths[idx]+len, 'B', path ); strcpy( paths[idx]+len+path, "P" ); - /* Calculate the actual cost of the current path */ - int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); - if( cost < best_cost ) + int possible = 1; + for( int i = 1; i <= length; i++ ) { - best_cost = cost; - idx ^= 1; + int i_type = frames[i]->i_type; + if( i_type == X264_TYPE_AUTO ) + continue; + if( IS_X264_TYPE_B( i_type ) ) + possible = possible && (i < len || i == length || paths[idx][i-1] == 'B'); + else + { + possible = possible && (i < len || paths[idx][i-1] != 'B'); + paths[idx][i-1] = IS_X264_TYPE_I( i_type ) ? 
'I' : 'P'; + } + } + + if( possible || !best_possible ) + { + if( possible && !best_possible ) + best_cost = COST_MAX; + /* Calculate the actual cost of the current path */ + int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); + if( cost < best_cost ) + { + best_cost = cost; + best_possible = possible; + idx ^= 1; + } } } @@ -1441,13 +1471,15 @@ return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } +#define IS_X264_TYPE_AUTO_OR_I(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_I(x)) +#define IS_X264_TYPE_AUTO_OR_B(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_B(x)) + void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; int num_frames, orig_num_frames, keyint_limit, framecnt; int i_mb_count = NUM_MBS; - int cost1p0, cost2p0, cost1b1, cost2p1; int i_max_search = X264_MIN( h->lookahead->next.i_size, X264_LOOKAHEAD_MAX ); int vbv_lookahead = h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead; /* For determinism we should limit the search to the number of frames lookahead has for sure @@ -1463,7 +1495,7 @@ if( !h->lookahead->last_nonb ) return; frames[0] = h->lookahead->last_nonb; - for( framecnt = 0; framecnt < i_max_search && h->lookahead->next.list[framecnt]->i_type == X264_TYPE_AUTO; framecnt++ ) + for( framecnt = 0; framecnt < i_max_search; framecnt++ ) frames[framecnt+1] = h->lookahead->next.list[framecnt]; x264_lowres_context_init( h, &a ); @@ -1492,12 +1524,11 @@ return; } - int num_bframes = 0; - int num_analysed_frames = num_frames; - int reset_start; - if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) ) + if( IS_X264_TYPE_AUTO_OR_I( frames[1]->i_type ) && + h->param.i_scenecut_threshold && scenecut( h, &a, frames, 0, 1, 1, orig_num_frames, i_max_search ) ) { - frames[1]->i_type = X264_TYPE_I; + if( frames[1]->i_type == X264_TYPE_AUTO ) + frames[1]->i_type = X264_TYPE_I; return; } @@ -1505,6 +1536,23 @@ x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda ); #endif + /* Replace forced keyframes with I/IDR-frames */ + for( int j = 1; j <= num_frames; j++ ) + { + if( frames[j]->i_type == X264_TYPE_KEYFRAME ) + frames[j]->i_type = h->param.b_open_gop ? X264_TYPE_I : X264_TYPE_IDR; + } + + /* Close GOP at IDR-frames */ + for( int j = 2; j <= num_frames; j++ ) + { + if( frames[j]->i_type == X264_TYPE_IDR && IS_X264_TYPE_AUTO_OR_B( frames[j-1]->i_type ) ) + frames[j-1]->i_type = X264_TYPE_P; + } + + int num_analysed_frames = num_frames; + int reset_start; + if( h->param.i_bframe ) { if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS ) @@ -1518,96 +1566,147 @@ for( int j = 2; j <= num_frames; j++ ) x264_slicetype_path( h, &a, frames, j, best_paths ); - num_bframes = strspn( best_paths[best_path_index], "B" ); /* Load the results of the analysis into the frame types. */ for( int j = 1; j < num_frames; j++ ) - frames[j]->i_type = best_paths[best_path_index][j-1] == 'B' ? 
X264_TYPE_B : X264_TYPE_P; + { + if( best_paths[best_path_index][j-1] != 'B' ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + } + else + { + if( frames[j]->i_type == X264_TYPE_AUTO ) + frames[j]->i_type = X264_TYPE_B; + } + } } - frames[num_frames]->i_type = X264_TYPE_P; } else if( h->param.i_bframe_adaptive == X264_B_ADAPT_FAST ) { - for( int i = 0; i <= num_frames-2; ) + int last_nonb = 0; + int num_bframes = h->param.i_bframe; + for( int j = 1; j < num_frames; j++ ) { - cost2p1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+2, 1 ); - if( frames[i+2]->i_intra_mbs[2] > i_mb_count / 2 ) + if( j-1 > 0 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) + num_bframes--; + else { - frames[i+1]->i_type = X264_TYPE_P; - frames[i+2]->i_type = X264_TYPE_P; - i += 2; + last_nonb = j-1; + num_bframes = h->param.i_bframe; + } + if( !num_bframes ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; continue; } -#if HAVE_OPENCL - if( h->param.b_opencl ) + if( frames[j]->i_type != X264_TYPE_AUTO ) + continue; + + if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) { - int b_work_done = 0; - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 ); - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 ); - b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 ); - if( b_work_done ) - x264_opencl_flush( h ); + frames[j]->i_type = X264_TYPE_P; + continue; } + + if( j - last_nonb <= 1 ) + { + int cost2p1 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+1, j+1, 1 ); + if( frames[j+1]->i_intra_mbs[2] > i_mb_count / 2 ) + { + frames[j]->i_type = X264_TYPE_P; + continue; + } + +#if HAVE_OPENCL + if( h->param.b_opencl ) + { + int b_work_done = 0; + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+0, j+1, j+0 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+0, j+0, j+0 ); + b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, last_nonb+1, j+1, j+1 ); + if( b_work_done ) + x264_opencl_flush( h ); + } #endif - cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 ); - cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 ); - cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 ); + int cost1b1 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+1, j+0, 0 ); + int cost1p0 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+0, j+0, j+0, 0 ); + int cost2p0 = x264_slicetype_frame_cost( h, &a, frames, last_nonb+1, j+1, j+1, 0 ); - if( cost1p0 + cost2p0 < cost1b1 + cost2p1 ) - { - frames[i+1]->i_type = X264_TYPE_P; - i += 1; + if( cost1p0 + cost2p0 < cost1b1 + cost2p1 ) + { + frames[j]->i_type = X264_TYPE_P; + continue; + } + frames[j]->i_type = X264_TYPE_B; continue; } // arbitrary and untuned #define INTER_THRESH 300 #define P_SENS_BIAS (50 - h->param.i_bframe_bias) - frames[i+1]->i_type = X264_TYPE_B; - int j; - for( j = i+2; j <= X264_MIN( i+h->param.i_bframe, num_frames-1 ); j++ ) - { - int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-i-1), INTER_THRESH/10); - int pcost = x264_slicetype_frame_cost( h, &a, frames, i+0, j+1, j+1, 1 ); - if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-i+1] > i_mb_count/3 ) - break; + int pthresh = X264_MAX(INTER_THRESH - P_SENS_BIAS * (j-last_nonb-1), INTER_THRESH/10); + int pcost = x264_slicetype_frame_cost( h, &a, frames, 
last_nonb, j+1, j+1, 1 ); + if( pcost > pthresh*i_mb_count || frames[j+1]->i_intra_mbs[j-last_nonb+1] > i_mb_count/3 ) + frames[j]->i_type = X264_TYPE_P; + else frames[j]->i_type = X264_TYPE_B; - } - frames[j]->i_type = X264_TYPE_P; - i = j; } - frames[num_frames]->i_type = X264_TYPE_P; - num_bframes = 0; - while( num_bframes < num_frames && frames[num_bframes+1]->i_type == X264_TYPE_B ) - num_bframes++; } else { - num_bframes = X264_MIN(num_frames-1, h->param.i_bframe); + int num_bframes = h->param.i_bframe; for( int j = 1; j < num_frames; j++ ) - frames[j]->i_type = (j%(num_bframes+1)) ? X264_TYPE_B : X264_TYPE_P; - frames[num_frames]->i_type = X264_TYPE_P; + { + if( !num_bframes ) + { + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + } + else if( frames[j]->i_type == X264_TYPE_AUTO ) + { + if( IS_X264_TYPE_B( frames[j+1]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; + else + frames[j]->i_type = X264_TYPE_B; + } + if( IS_X264_TYPE_B( frames[j]->i_type ) ) + num_bframes--; + else + num_bframes = h->param.i_bframe; + } } + if( IS_X264_TYPE_AUTO_OR_B( frames[num_frames]->i_type ) ) + frames[num_frames]->i_type = X264_TYPE_P; + + int num_bframes = 0; + while( num_bframes < num_frames && IS_X264_TYPE_B( frames[num_bframes+1]->i_type ) ) + num_bframes++; /* Check scenecut on the first minigop. */ for( int j = 1; j < num_bframes+1; j++ ) - if( h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) ) + { + if( frames[j]->i_forced_type == X264_TYPE_AUTO && IS_X264_TYPE_AUTO_OR_I( frames[j+1]->i_forced_type ) && + h->param.i_scenecut_threshold && scenecut( h, &a, frames, j, j+1, 0, orig_num_frames, i_max_search ) ) { frames[j]->i_type = X264_TYPE_P; num_analysed_frames = j; break; } + } reset_start = keyframe ? 1 : X264_MIN( num_bframes+2, num_analysed_frames+1 ); } else { for( int j = 1; j <= num_frames; j++ ) - frames[j]->i_type = X264_TYPE_P; + if( IS_X264_TYPE_AUTO_OR_B( frames[j]->i_type ) ) + frames[j]->i_type = X264_TYPE_P; reset_start = !keyframe + 1; - num_bframes = 0; } /* Perform the actual macroblock tree analysis. @@ -1617,21 +1716,63 @@ /* Enforce keyframe limit. */ if( !h->param.b_intra_refresh ) - for( int i = keyint_limit+1; i <= num_frames; i += h->param.i_keyint_max ) + { + int last_keyframe = h->lookahead->i_last_keyframe; + int last_possible = 0; + for( int j = 1; j <= num_frames; j++ ) { - frames[i]->i_type = X264_TYPE_I; - reset_start = X264_MIN( reset_start, i+1 ); - if( h->param.b_open_gop && h->param.b_bluray_compat ) - while( IS_X264_TYPE_B( frames[i-1]->i_type ) ) - i--; + x264_frame_t *frm = frames[j]; + int keyframe_dist = frm->i_frame - last_keyframe; + + if( IS_X264_TYPE_AUTO_OR_I( frm->i_forced_type ) ) + { + if( h->param.b_open_gop || !IS_X264_TYPE_B( frames[j-1]->i_forced_type ) ) + last_possible = j; + } + if( keyframe_dist >= h->param.i_keyint_max ) + { + if( last_possible != 0 && last_possible != j ) + { + j = last_possible; + frm = frames[j]; + keyframe_dist = frm->i_frame - last_keyframe; + } + last_possible = 0; + if( frm->i_type != X264_TYPE_IDR ) + frm->i_type = h->param.b_open_gop ? 
X264_TYPE_I : X264_TYPE_IDR; + } + if( frm->i_type == X264_TYPE_I && keyframe_dist >= h->param.i_keyint_min ) + { + if( h->param.b_open_gop ) + { + last_keyframe = frm->i_frame; + if( h->param.b_bluray_compat ) + { + // Use bluray order + int bframes = 0; + while( bframes < j-1 && IS_X264_TYPE_B( frames[j-1-bframes]->i_type ) ) + bframes++; + last_keyframe -= bframes; + } + } + else if( frm->i_forced_type != X264_TYPE_I ) + frm->i_type = X264_TYPE_IDR; + } + if( frm->i_type == X264_TYPE_IDR ) + { + last_keyframe = frm->i_frame; + if( j > 1 && IS_X264_TYPE_B( frames[j-1]->i_type ) ) + frames[j-1]->i_type = X264_TYPE_P; + } } + } if( vbv_lookahead ) x264_vbv_lookahead( h, &a, frames, num_frames, keyframe ); /* Restore frametypes for all frames that haven't actually been decided yet. */ for( int j = reset_start; j <= num_frames; j++ ) - frames[j]->i_type = X264_TYPE_AUTO; + frames[j]->i_type = frames[j]->i_forced_type; #if HAVE_OPENCL x264_opencl_slicetype_end( h ); @@ -1695,6 +1836,14 @@ for( bframes = 0, brefs = 0;; bframes++ ) { frm = h->lookahead->next.list[bframes]; + + if( frm->i_forced_type != X264_TYPE_AUTO && frm->i_type != frm->i_forced_type && + !(frm->i_forced_type == X264_TYPE_KEYFRAME && IS_X264_TYPE_I( frm->i_type )) ) + { + x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d was changed to frame type (%d)\n", + frm->i_forced_type, frm->i_frame, frm->i_type ); + } + if( frm->i_type == X264_TYPE_BREF && h->param.i_bframe_pyramid < X264_B_PYRAMID_NORMAL && brefs == h->param.i_bframe_pyramid ) {
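A large part of the slicetype.c hunk above is about honouring frame types forced by the caller instead of overwriting them during lookahead, replacing X264_TYPE_KEYFRAME with I or IDR as appropriate, and warning when a forced type cannot be kept. On the API side such forcing is done through x264_picture_t before the picture is submitted; a minimal sketch (surrounding encoder setup omitted):

    #include <x264.h>

    /* Ask the encoder to start a new GOP at this picture. The lookahead rework
     * above now preserves forced types where possible and warns when it cannot. */
    static void mark_forced_keyframe( x264_picture_t *pic )
    {
        pic->i_type = X264_TYPE_IDR;   /* or X264_TYPE_KEYFRAME to let open-GOP choose I vs IDR */
    }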
View file
x264-snapshot-20141218-2245.tar.bz2/example.c -> x264-snapshot-20150804-2245.tar.bz2/example.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * example.c: libx264 API usage example ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> * @@ -24,26 +24,14 @@ *****************************************************************************/ #ifdef _WIN32 -/* The following two defines must be located before the inclusion of any system header files. */ -#define WINVER 0x0500 -#define _WIN32_WINNT 0x0500 -#include <windows.h> #include <io.h> /* _setmode() */ #include <fcntl.h> /* _O_BINARY */ #endif #include <stdint.h> #include <stdio.h> -#include <signal.h> #include <x264.h> -/* Ctrl-C handler */ -static volatile int b_ctrl_c = 0; -static void sigint_handler( int a ) -{ - b_ctrl_c = 1; -} - #define FAIL_IF_ERROR( cond, ... )\ do\ {\ @@ -72,9 +60,6 @@ _setmode( _fileno( stderr ), _O_BINARY ); #endif - /* Control-C handler */ - signal( SIGINT, sigint_handler ); - FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" ); FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" ); @@ -105,17 +90,17 @@ #undef fail #define fail fail3 + int luma_size = width * height; + int chroma_size = luma_size / 4; /* Encode frames */ - for( ; !b_ctrl_c; i_frame++ ) + for( ;; i_frame++ ) { /* Read input frame */ - int plane_size = width * height; - if( fread( pic.img.plane[0], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[0], 1, luma_size, stdin ) != luma_size ) break; - plane_size = ((width + 1) >> 1) * ((height + 1) >> 1); - if( fread( pic.img.plane[1], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != chroma_size ) break; - if( fread( pic.img.plane[2], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[2], 1, chroma_size, stdin ) != chroma_size ) break; pic.i_pts = i_frame; @@ -129,7 +114,7 @@ } } /* Flush delayed frames */ - while( !b_ctrl_c && x264_encoder_delayed_frames( h ) ) + while( x264_encoder_delayed_frames( h ) ) { i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out ); if( i_frame_size < 0 )
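The example.c rework above reads each I420 frame as one luma plane plus two quarter-size chroma planes with fixed sizes, rather than recomputing the plane size on every iteration, and drops the Ctrl-C handler. As a standalone arithmetic check of those sizes for the 352x288 resolution in the usage string (this is not part of example.c itself):

    #include <stdio.h>

    int main( void )
    {
        int width = 352, height = 288;
        int luma_size   = width * height;   /* 101376 bytes */
        int chroma_size = luma_size / 4;    /*  25344 bytes */
        /* One raw I420 frame on stdin is Y + U + V: */
        printf( "frame size = %d bytes\n", luma_size + 2 * chroma_size );   /* 152064 */
        return 0;
    }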
View file
x264-snapshot-20141218-2245.tar.bz2/extras/avxsynth_c.h -> x264-snapshot-20150804-2245.tar.bz2/extras/avxsynth_c.h
Changed
@@ -33,8 +33,12 @@ #ifndef __AVXSYNTH_C__ #define __AVXSYNTH_C__ -#include "windowsPorts/windows2linux.h" #include <stdarg.h> +#include <stdint.h> + +typedef int64_t INT64; +#define __stdcall +#define __declspec(x) #ifdef __cplusplus # define EXTERN_C extern "C" @@ -64,12 +68,6 @@ # endif #endif -#ifdef __GNUC__ -typedef long long int INT64; -#else -typedef __int64 INT64; -#endif - ///////////////////////////////////////////////////////////////////// //
View file
x264-snapshot-20150804-2245.tar.bz2/extras/intel_dispatcher.h
Added
@@ -0,0 +1,46 @@ +/***************************************************************************** + * intel_dispatcher.h: intel compiler cpu dispatcher override + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_INTEL_DISPATCHER_H +#define X264_INTEL_DISPATCHER_H + +/* Feature flags using _FEATURE_* defines from immintrin.h */ +extern unsigned long long __intel_cpu_feature_indicator; +extern unsigned long long __intel_cpu_feature_indicator_x; + +/* CPU vendor independent version of dispatcher */ +void __intel_cpu_features_init_x( void ); + +static void x264_intel_dispatcher_override( void ) +{ + if( __intel_cpu_feature_indicator & ~1ULL ) + return; + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; +} + +#endif
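The new extras/intel_dispatcher.h is only pulled in when the build defines HAVE_INTEL_DISPATCHER (an Intel compiler build); the matching call site is in the encoder.c hunk earlier in this changeset. For reference, the usage pattern is simply the following — a hedged sketch with the guard macro taken from that hunk, not a new entry point:

    #if HAVE_INTEL_DISPATCHER
    #include "extras/intel_dispatcher.h"
    #endif

    static void init_cpu_dispatch( void )
    {
    #if HAVE_INTEL_DISPATCHER
        /* Replace the Intel compiler's vendor-checking CPU dispatcher with the
         * vendor-independent one before any dispatched code path runs. */
        x264_intel_dispatcher_override();
    #endif
    }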
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.c -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.c: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.h -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.h: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/cache.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/cache.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cache.c: cache video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/crop.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/crop.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * crop.c: crop video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * James Darnley <james.darnley@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/depth.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/depth.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * depth.c: bit-depth conversion video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Oskar Arvidsson <oskar@irock.se> * @@ -50,6 +50,7 @@ csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16 || csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB || @@ -59,7 +60,7 @@ static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : + return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : csp_mask == X264_CSP_BGRA ? 4 : 1; @@ -73,10 +74,10 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \ int width, int height, int16_t *errors ) \ { \ - const int lshift = 16-BIT_DEPTH; \ - const int rshift = 16-BIT_DEPTH+2; \ - const int half = 1 << (16-BIT_DEPTH+1); \ - const int pixel_max = (1 << BIT_DEPTH)-1; \ + const int lshift = 16-X264_BIT_DEPTH; \ + const int rshift = 16-X264_BIT_DEPTH+2; \ + const int half = 1 << (16-X264_BIT_DEPTH+1); \ + const int pixel_max = (1 << X264_BIT_DEPTH)-1; \ memset( errors, 0, (width+1) * sizeof(int16_t) ); \ for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \ { \ @@ -136,7 +137,7 @@ static void scale_image( cli_image_t *output, cli_image_t *img ) { int csp_mask = img->csp & X264_CSP_MASK; - const int shift = BIT_DEPTH - 8; + const int shift = X264_BIT_DEPTH - 8; for( int i = 0; i < img->planes; i++ ) { uint8_t *src = img->plane[i]; @@ -216,7 +217,7 @@ ret = 1; } - FAIL_IF_ERROR( bit_depth != BIT_DEPTH, "this build supports only bit depth %d\n", BIT_DEPTH ) + FAIL_IF_ERROR( bit_depth != X264_BIT_DEPTH, "this build supports only bit depth %d\n", X264_BIT_DEPTH ) FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" ) /* only add the filter to the chain if it's needed */
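The depth.c hunk above renames the internal BIT_DEPTH uses to the exported X264_BIT_DEPTH and teaches the filter about NV21. As a quick check of what the dither constants in the macro evaluate to, assuming a 10-bit build (for an 8-bit build they come out as 8, 10, 512 and 255):

    /* For X264_BIT_DEPTH == 10 the dither constants above evaluate to: */
    enum
    {
        LSHIFT    = 16 - 10,             /*    6 */
        RSHIFT    = 16 - 10 + 2,         /*    8 */
        HALF      = 1 << (16 - 10 + 1),  /*  128 */
        PIXEL_MAX = (1 << 10) - 1        /* 1023 */
    };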
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/fix_vfr_pts.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/fix_vfr_pts.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * fix_vfr_pts.c: vfr pts fixing video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.c: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.h: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/resize.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/resize.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * resize.c: resize video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -156,6 +156,7 @@
         case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA;
         /* the next csp has no equivalent 16bit depth in swscale */
         case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12;
+        case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21;
         /* the next csp is no supported by swscale at all */
         case X264_CSP_NV16:
         default:            return AV_PIX_FMT_NONE;
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * select_every.c: select-every video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/source.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/source.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * source.c: source video filter
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * video.c: video filters
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * video.h: video filters
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/input/avs.c -> x264-snapshot-20150804-2245.tar.bz2/input/avs.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * avs.c: avisynth input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -27,15 +27,15 @@
 #if USE_AVXSYNTH
 #include <dlfcn.h>
 #if SYS_MACOSX
-#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW )
+#define avs_open() dlopen( "libavxsynth.dylib", RTLD_NOW )
 #else
-#define avs_open dlopen( "libavxsynth.so", RTLD_NOW )
+#define avs_open() dlopen( "libavxsynth.so", RTLD_NOW )
 #endif
 #define avs_close dlclose
 #define avs_address dlsym
 #else
 #include <windows.h>
-#define avs_open LoadLibraryW( L"avisynth" )
+#define avs_open() LoadLibraryW( L"avisynth" )
 #define avs_close FreeLibrary
 #define avs_address GetProcAddress
 #endif
@@ -80,7 +80,7 @@
 {
     AVS_Clip *clip;
     AVS_ScriptEnvironment *env;
-    HMODULE library;
+    void *library;
     int num_frames;
     struct
     {
@@ -102,7 +102,7 @@
 /* load the library and functions we require from it */
 static int x264_avs_load_library( avs_hnd_t *h )
 {
-    h->library = avs_open;
+    h->library = avs_open();
     if( !h->library )
         return -1;
     LOAD_AVS_FUNC( avs_clip_get_error, 0 );
@@ -175,8 +175,9 @@
     FILE *fh = x264_fopen( psz_filename, "r" );
     if( !fh )
         return -1;
-    FAIL_IF_ERROR( !x264_is_regular_file( fh ), "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
+    int b_regular = x264_is_regular_file( fh );
     fclose( fh );
+    FAIL_IF_ERROR( !b_regular, "AVS input is incompatible with non-regular file `%s'\n", psz_filename );
 
     avs_hnd_t *h = malloc( sizeof(avs_hnd_t) );
     if( !h )
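Making avs_open a function-like macro means it only expands where it is written as a call, avs_open(), and the matching void *library change lets one handle type cover both the dlopen() and LoadLibraryW() results. As a rough sketch of the underlying pattern on the dlfcn side (the library and symbol names below are placeholders, not anything x264 uses):

    #include <dlfcn.h>
    #include <stdio.h>

    /* Illustrative dynamic-loading sketch: open a shared object, resolve one
     * symbol, and hand the opaque handle back to the caller for dlclose(). */
    static void *open_plugin( void (**fn)(void) )
    {
        void *lib = dlopen( "libexample.so", RTLD_NOW );      /* placeholder name */
        if( !lib )
        {
            fprintf( stderr, "dlopen failed: %s\n", dlerror() );
            return NULL;
        }
        *fn = (void (*)(void))dlsym( lib, "example_entry" );  /* placeholder symbol */
        if( !*fn )
        {
            dlclose( lib );
            return NULL;
        }
        return lib;
    }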
View file
x264-snapshot-20141218-2245.tar.bz2/input/ffms.c -> x264-snapshot-20150804-2245.tar.bz2/input/ffms.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * ffms.c: ffmpegsource input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
  *          Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.c -> x264-snapshot-20150804-2245.tar.bz2/input/input.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * input.c: common input functions
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Steven Walters <kemuri9@gmail.com>
  *
@@ -33,6 +33,7 @@
     [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 },
     [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 },   { 1, 1, 1 }, 1, 1 },
     [X264_CSP_NV12] = { "nv12", 2, { 1, 1 },      { 1, .5 },   2, 2 },
+    [X264_CSP_NV21] = { "nv21", 2, { 1, 1 },      { 1, .5 },   2, 2 },
     [X264_CSP_NV16] = { "nv16", 2, { 1, 1 },      { 1, 1 },    2, 1 },
     [X264_CSP_BGR]  = { "bgr",  1, { 3 },         { 1 },       1, 1 },
     [X264_CSP_BGRA] = { "bgra", 1, { 4 },         { 1 },       1, 1 },
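The table above lists, for every supported input colorspace, the plane count plus per-plane width and height factors relative to the frame size; the new nv21 row is identical to nv12 because only the chroma byte order differs. A hedged sketch of how such factors translate into plane sizes; the struct below is illustrative and not the actual x264 CLI type:

    #include <stdint.h>

    /* Illustrative stand-in for a colorspace table entry. */
    typedef struct
    {
        int   planes;
        float width_fix[4];   /* plane width  = frame width  * width_fix  */
        float height_fix[4];  /* plane height = frame height * height_fix */
    } csp_entry_t;

    /* For an NV12/NV21-like entry { 2, { 1, 1 }, { 1, .5 } } this yields a
     * full-size luma plane and a half-height interleaved chroma plane. */
    static uint64_t plane_bytes( const csp_entry_t *c, int p, int w, int h, int bytes_per_sample )
    {
        return (uint64_t)(w * c->width_fix[p]) * (uint64_t)(h * c->height_fix[p]) * bytes_per_sample;
    }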
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.h -> x264-snapshot-20150804-2245.tar.bz2/input/input.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * input.h: file input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/lavf.c -> x264-snapshot-20150804-2245.tar.bz2/input/lavf.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * lavf.c: libavformat input
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mike Gurlitz <mike.gurlitz@gmail.com>
  *          Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/raw.c -> x264-snapshot-20150804-2245.tar.bz2/input/raw.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * raw.c: raw input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/thread.c -> x264-snapshot-20150804-2245.tar.bz2/input/thread.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * thread.c: threaded input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/timecode.c -> x264-snapshot-20150804-2245.tar.bz2/input/timecode.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * timecode.c: timecode file input
  *****************************************************************************
- * Copyright (C) 2010-2014 x264 project
+ * Copyright (C) 2010-2015 x264 project
  *
  * Authors: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/input/y4m.c -> x264-snapshot-20150804-2245.tar.bz2/input/y4m.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * y4m.c: y4m input
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * flv.c: flv muxer ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Kieran Kunhya <kieran@kunhya.com> * @@ -75,21 +75,29 @@ static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { - *p_handle = NULL; flv_hnd_t *p_flv = calloc( 1, sizeof(flv_hnd_t) ); - if( !p_flv ) - return -1; - - p_flv->b_dts_compress = opt->use_dts_compress; - - p_flv->c = flv_create_writer( psz_filename ); - if( !p_flv->c ) - return -1; - - CHECK( write_header( p_flv->c ) ); - *p_handle = p_flv; + if( p_flv ) + { + flv_buffer *c = flv_create_writer( psz_filename ); + if( c ) + { + if( !write_header( c ) ) + { + p_flv->c = c; + p_flv->b_dts_compress = opt->use_dts_compress; + *p_handle = p_flv; + return 0; + } + + fclose( c->fp ); + free( c->data ); + free( c ); + } + free( p_flv ); + } - return 0; + *p_handle = NULL; + return -1; } static int set_param( hnd_t handle, x264_param_t *p_param ) @@ -293,15 +301,22 @@ return i_size; } -static void rewrite_amf_double( FILE *fp, uint64_t position, double value ) +static int rewrite_amf_double( FILE *fp, uint64_t position, double value ) { uint64_t x = endian_fix64( flv_dbl2int( value ) ); - fseek( fp, position, SEEK_SET ); - fwrite( &x, 8, 1, fp ); + return !fseek( fp, position, SEEK_SET ) && fwrite( &x, 8, 1, fp ) == 1 ? 0 : -1; } +#undef CHECK +#define CHECK(x)\ +do {\ + if( (x) < 0 )\ + goto error;\ +} while( 0 ) + static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { + int ret = -1; flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; @@ -317,19 +332,22 @@ if( p_flv->i_framerate_pos ) { framerate = (double)p_flv->i_framenum / total_duration; - rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ) ); } - rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ); - rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ); - rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ) ); } + ret = 0; +error: fclose( c->fp ); - free( p_flv ); + free( c->data ); free( c ); + free( p_flv ); - return 0; + return ret; } const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
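The open_file() rewrite above stops leaking the handle and the half-created writer when a later step fails, and close_file() now funnels the rewrite_amf_double() calls through a single error label so the file is still closed and freed on failure. A generic sketch of that cleanup idiom, with placeholder names rather than the flv types (a real context would of course keep the FILE pointer it opened):

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: acquire resources in order, release them in reverse
     * order on any failure, and report success exactly once at the end. */
    static int open_writer( const char *path, void **handle )
    {
        void *ctx = calloc( 1, 64 );                  /* placeholder context */
        if( ctx )
        {
            FILE *fp = fopen( path, "wb" );
            if( fp )
            {
                if( fwrite( "HDR", 1, 3, fp ) == 3 )  /* check the header write too */
                {
                    *handle = ctx;
                    return 0;
                }
                fclose( fp );
            }
            free( ctx );
        }
        *handle = NULL;
        return -1;
    }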
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * flv_bytestream.c: flv muxer utilities
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Kieran Kunhya <kieran@kunhya.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.h -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * flv_bytestream.h: flv muxer utilities
  *****************************************************************************
- * Copyright (C) 2009-2014 x264 project
+ * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Kieran Kunhya <kieran@kunhya.com>
  *
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * matroska.c: matroska muxer ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Mike Matsnev <mike@haali.su> * @@ -62,9 +62,14 @@ return 0; } +#define STEREO_COUNT 7 +static const uint8_t stereo_modes[STEREO_COUNT] = {5,9,7,1,3,13,0}; +static const uint8_t stereo_w_div[STEREO_COUNT] = {1,2,1,2,1,1,1}; +static const uint8_t stereo_h_div[STEREO_COUNT] = {1,1,2,1,2,1,1}; + static int set_param( hnd_t handle, x264_param_t *p_param ) { - mkv_hnd_t *p_mkv = handle; + mkv_hnd_t *p_mkv = handle; int64_t dw, dh; if( p_param->i_fps_num > 0 && !p_param->b_vfr_input ) @@ -77,25 +82,27 @@ p_mkv->frame_duration = 0; } - p_mkv->width = p_mkv->d_width = p_param->i_width; - p_mkv->height = p_mkv->d_height = p_param->i_height; + dw = p_mkv->width = p_param->i_width; + dh = p_mkv->height = p_param->i_height; p_mkv->display_size_units = DS_PIXELS; - p_mkv->stereo_mode = p_param->i_frame_packing; - + p_mkv->stereo_mode = -1; + if( p_param->i_frame_packing >= 0 && p_param->i_frame_packing < STEREO_COUNT ) + { + p_mkv->stereo_mode = stereo_modes[p_param->i_frame_packing]; + dw /= stereo_w_div[p_param->i_frame_packing]; + dh /= stereo_h_div[p_param->i_frame_packing]; + } if( p_param->vui.i_sar_width && p_param->vui.i_sar_height && p_param->vui.i_sar_width != p_param->vui.i_sar_height ) { if ( p_param->vui.i_sar_width > p_param->vui.i_sar_height ) { - dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width / p_param->vui.i_sar_height; - dh = p_param->i_height; + dw = dw * p_param->vui.i_sar_width / p_param->vui.i_sar_height; } else { - dw = p_param->i_width; - dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height / p_param->vui.i_sar_width; + dh = dh * p_param->vui.i_sar_height / p_param->vui.i_sar_width; } - - p_mkv->d_width = (int)dw; - p_mkv->d_height = (int)dh; } + p_mkv->d_width = (int)dw; + p_mkv->d_height = (int)dh; p_mkv->i_timebase_num = p_param->i_timebase_num; p_mkv->i_timebase_den = p_param->i_timebase_den; @@ -150,11 +157,11 @@ avcC, avcC_len, p_mkv->frame_duration, 50000, p_mkv->width, p_mkv->height, p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode ); + free( avcC ); + if( ret < 0 ) return ret; - free( avcC ); - // SEI if( !p_mkv->b_writing_frame )
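The three tables introduced here translate x264's --frame-packing values 0-6 into Matroska StereoMode codes and divide the display width or height for the packings that squeeze two views into one frame (column alternation and side-by-side halve the width, row alternation and top-bottom halve the height). A compact sketch of the same lookup, using exactly the values from the diff; the function wrapper itself is illustrative:

    /* Index: x264 --frame-packing value; result: Matroska StereoMode.
     * Values copied from the stereo_modes/stereo_w_div/stereo_h_div tables. */
    static const unsigned char mkv_stereo_mode[7] = { 5, 9, 7, 1, 3, 13, 0 };
    static const unsigned char mkv_width_div[7]   = { 1, 2, 1, 2, 1, 1, 1 };
    static const unsigned char mkv_height_div[7]  = { 1, 1, 2, 1, 2, 1, 1 };

    static int map_frame_packing( int frame_packing, long *dw, long *dh )
    {
        if( frame_packing < 0 || frame_packing >= 7 )
            return -1;                        /* unknown: write no StereoMode element */
        *dw /= mkv_width_div[frame_packing];
        *dh /= mkv_height_div[frame_packing];
        return mkv_stereo_mode[frame_packing];
    }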
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * matroska_ebml.c: matroska muxer utilities
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Mike Matsnev <mike@haali.su>
  *
@@ -317,8 +317,6 @@
     return w;
 }
 
-static const uint8_t mk_stereo_modes[6] = {5,9,7,1,3,13};
-
 int mk_write_header( mk_writer *w, const char *writing_app,
                      const char *codec_id,
                      const void *codec_private, unsigned codec_private_size,
@@ -342,7 +340,7 @@
     CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength
     CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength
     CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType
-    CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion
+    CHECK( mk_write_uint( c, 0x4287, stereo_mode >= 0 ? 3 : 2 ) ); // DocTypeVersion
     CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadversion
     CHECK( mk_close_context( c, 0 ) );
 
@@ -381,8 +379,8 @@
     CHECK( mk_write_uint( v, 0x54b2, display_size_units ) );
     CHECK( mk_write_uint( v, 0x54b0, d_width ) );
     CHECK( mk_write_uint( v, 0x54ba, d_height ) );
-    if( stereo_mode >= 0 && stereo_mode <= 5 )
-        CHECK( mk_write_uint( v, 0x53b8, mk_stereo_modes[stereo_mode] ) );
+    if( stereo_mode >= 0 )
+        CHECK( mk_write_uint( v, 0x53b8, stereo_mode ) );
     CHECK( mk_close_context( v, 0 ) );
 
     CHECK( mk_close_context( ti, 0 ) );
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.h -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * matroska_ebml.h: matroska muxer utilities
  *****************************************************************************
- * Copyright (C) 2005-2014 x264 project
+ * Copyright (C) 2005-2015 x264 project
  *
  * Authors: Mike Matsnev <mike@haali.su>
  *
@@ -27,10 +27,10 @@
 #define X264_MATROSKA_EBML_H
 
 /* Matroska display size units from the spec */
-#define DS_PIXELS 0
-#define DS_CM 1
-#define DS_INCHES 2
-#define DS_ASPECT_RATIO 3
+#define DS_PIXELS        0
+#define DS_CM            1
+#define DS_INCHES        2
+#define DS_ASPECT_RATIO  3
 
 typedef struct mk_writer mk_writer;
 
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mp4.c: mp4 muxer
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
@@ -169,8 +169,9 @@
     FILE *fh = x264_fopen( psz_filename, "w" );
     if( !fh )
         return -1;
-    FAIL_IF_ERR( !x264_is_regular_file( fh ), "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
+    int b_regular = x264_is_regular_file( fh );
     fclose( fh );
+    FAIL_IF_ERR( !b_regular, "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename )
 
     mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) );
     if( !p_mp4 )
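As in the avs.c hunk earlier, the point of the reorder is that fclose() now always runs before FAIL_IF_ERR can return. The x264_is_regular_file() helper itself is not shown in this revision; a plausible sketch of such a check on POSIX systems, offered as an assumption rather than the actual implementation, is:

    #include <stdio.h>
    #include <sys/stat.h>

    /* Hypothetical equivalent of x264_is_regular_file(): nonzero when the
     * open stream refers to a regular file rather than a pipe or device. */
    static int is_regular_file( FILE *fh )
    {
        struct stat st;
        return !fstat( fileno( fh ), &st ) && S_ISREG( st.st_mode );
    }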
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * mp4_lsmash.c: mp4 muxer using L-SMASH
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/output.h -> x264-snapshot-20150804-2245.tar.bz2/output/output.h
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * output.h: x264 file output modules
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/raw.c -> x264-snapshot-20150804-2245.tar.bz2/output/raw.c
Changed
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * raw.c: raw muxer
  *****************************************************************************
- * Copyright (C) 2003-2014 x264 project
+ * Copyright (C) 2003-2015 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -1,7 +1,7 @@
 ;*****************************************************************************
 ;* checkasm-a.asm: assembly check tool
 ;*****************************************************************************
-;* Copyright (C) 2008-2014 x264 project
+;* Copyright (C) 2008-2015 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Henrik Gramner <henrik@gramner.com>
@@ -33,24 +33,24 @@
 %if ARCH_X86_64
 ; just random numbers to reduce the chance of incidental match
 ALIGN 16
-x6:  ddq 0x79445c159ce790641a1b2550a612b48c
-x7:  ddq 0x86b2536fcd8cf6362eed899d5a28ddcd
-x8:  ddq 0x3f2bf84fc0fcca4eb0856806085e7943
-x9:  ddq 0xd229e1f5b281303facbd382dcf5b8de2
-x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9
-x11: ddq 0x77d410d5c42c882d89b0c0765892729a
-x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5
-x13: ddq 0xdd7b8919edd427862e8ec680de14b47c
-x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf
-x15: ddq 0x6de8f4c914c334d5011ff554472a7a10
-n7:  dq 0x21f86d66c8ca00ce
-n8:  dq 0x75b6ba21077c48ad
-n9:  dq 0xed56bb2dcb3c7736
-n10: dq 0x8bda43d3fd1a7e06
-n11: dq 0xb64a9c9e5d318408
-n12: dq 0xdf9a54b303f1d3a3
-n13: dq 0x4a75479abd64e097
-n14: dq 0x249214109d5d1c88
+x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
+x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
+x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
+x9:  dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f
+x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9
+x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d
+x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b
+x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786
+x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
+x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
+n7:  dq 0x21f86d66c8ca00ce
+n8:  dq 0x75b6ba21077c48ad
+n9:  dq 0xed56bb2dcb3c7736
+n10: dq 0x8bda43d3fd1a7e06
+n11: dq 0xb64a9c9e5d318408
+n12: dq 0xdf9a54b303f1d3a3
+n13: dq 0x4a75479abd64e097
+n14: dq 0x249214109d5d1c88
 %endif
 
 SECTION .text
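The constants are unchanged here: each 128-bit ddq literal is just split into two 64-bit dq words with the low half first, which is how the bytes sit in memory on little-endian x86 and avoids a directive that some assemblers reject. A small hedged C check that the first row round-trips to the original value:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main( void )
    {
        /* Low and high halves of the old x6 constant, as in the new dq lines. */
        uint64_t halves[2] = { 0x1a1b2550a612b48cULL, 0x79445c159ce79064ULL };
        uint8_t bytes[16];
        memcpy( bytes, halves, sizeof(bytes) );

        /* On a little-endian machine this prints
         * 79445c159ce790641a1b2550a612b48c, i.e. the original ddq literal. */
        for( int i = 15; i >= 0; i-- )
            printf( "%02x", bytes[i] );
        printf( "\n" );
        return 0;
    }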
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * checkasm.c: assembly check tool ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -97,6 +97,12 @@ asm volatile( "mftb %0" : "=r"(a) :: "memory" ); #elif ARCH_ARM // ARMv7 only asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); +#elif ARCH_AARCH64 + uint64_t b = 0; + asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" ); + a = b; +#elif ARCH_MIPS + asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" ); #endif return a; } @@ -167,12 +173,12 @@ continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX - b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : + b->cpu&X264_CPU_SSE42 ? "sse42" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : @@ -189,6 +195,8 @@ #elif ARCH_AARCH64 b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV8 ? "armv8" : +#elif ARCH_MIPS + b->cpu&X264_CPU_MSA ? "msa" : #endif "c", #if HAVE_MMX @@ -637,7 +645,7 @@ } \ predict_8x8[res_c>>16]( fdec1, edge ); \ int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \ - if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \ + if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ @@ -1409,6 +1417,32 @@ } } + if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap ) + { + set_func_name( "plane_copy_swap" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t src_stride = plane_specs[i].src_stride; + intptr_t dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h ); + call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) { set_func_name( "plane_copy_interleave" ); @@ -1496,7 +1530,7 @@ if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) { set_func_name( "plane_copy_deinterleave_v210" ); - used_asm = 1; + ok = 1; used_asm = 1; for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; @@ -1517,8 +1551,8 @@ break; } } + report( "v210 :" ); } - report( "v210 :" ); if( mc_a.hpel_filter != mc_ref.hpel_filter ) { @@ -2311,12 +2345,16 @@ {\ fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\ ok = 0;\ - for( int k = -1; k < 16; k++ )\ - printf( "%2x ", edge[16+k] );\ - printf( "\n" );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + {\ + for( int k = -1; k < 16; k++ )\ + printf( "%2x ", 
edge[16+k] );\ + printf( "\n" );\ + }\ for( int j = 0; j < h; j++ )\ {\ - printf( "%2x ", edge[14-j] );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( "%2x ", edge[14-j] );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2324,7 +2362,8 @@ printf( "\n" );\ for( int j = 0; j < h; j++ )\ {\ - printf( " " );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( " " );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2428,6 +2467,8 @@ DECL_CABAC(c) #if HAVE_MMX DECL_CABAC(asm) +#elif defined(ARCH_AARCH64) +DECL_CABAC(asm) #else #define run_cabac_decision_asm run_cabac_decision_c #define run_cabac_bypass_asm run_cabac_bypass_c @@ -2646,7 +2687,7 @@ #endif if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); @@ -2664,11 +2705,11 @@ cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - } - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2688,9 +2729,16 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + if( cpu_detect & X264_CPU_SSE42 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" ); if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( cpu_detect & X264_CPU_XOP ) @@ -2700,30 +2748,30 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( cpu_detect & X264_CPU_BMI1 ) + if( cpu_detect & X264_CPU_FMA3 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); + cpu1 &= ~X264_CPU_FMA3; } if( cpu_detect & X264_CPU_AVX2 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } } + if( cpu_detect & X264_CPU_BMI1 ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); + cpu1 &= ~X264_CPU_BMI1; + } if( cpu_detect & X264_CPU_BMI2 ) { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); } - if( cpu_detect & X264_CPU_FMA3 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { @@ -2742,6 +2790,9 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" ); if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); +#elif ARCH_MIPS + if( cpu_detect & X264_CPU_MSA ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, 
"MSA" ); #endif return ret; } @@ -2752,7 +2803,7 @@ if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) ) { -#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM +#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" ); return 1; #endif
View file
x264-snapshot-20150804-2245.tar.bz2/tools/gas-preprocessor.pl
Added
@@ -0,0 +1,1033 @@ +#!/usr/bin/env perl +# by David Conrad +# This code is licensed under GPLv2 or later; go to gnu.org to read it +# (not that it much matters for an asm preprocessor) +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" +use strict; + +# Apple's gas is ancient and doesn't support modern preprocessing features like +# .rept and has ugly macro syntax, among other things. Thus, this script +# implements the subset of the gas preprocessor used by x264 and ffmpeg +# that isn't supported by Apple's gas. + +my %canonical_arch = ("aarch64" => "aarch64", "arm64" => "aarch64", + "arm" => "arm", + "powerpc" => "powerpc", "ppc" => "powerpc"); + +my %comments = ("aarch64" => '//', + "arm" => '@', + "powerpc" => '#'); + +my @gcc_cmd; +my @preprocess_c_cmd; + +my $comm; +my $arch; +my $as_type = "apple-gas"; + +my $fix_unreq = $^O eq "darwin"; +my $force_thumb = 0; + +my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo"; + +my $usage_str = " +$0\n +Gas-preprocessor.pl converts assembler files using modern GNU as syntax for +Apple's ancient gas version or clang's incompatible integrated assembler. The +conversion is regularly tested for Libav, x264 and vlc. Other projects might +use different features which are not correctly handled. + +Options for this program needs to be separated with ' -- ' from the assembler +command. Following options are currently supported: + + -help - this usage text + -arch - target architecture + -as-type - one value out of {{,apple-}{gas,clang},armasm} + -fix-unreq + -no-fix-unreq + -force-thumb - assemble as thumb regardless of the input source + (note, this is incomplete and only works for sources + it explicitly was tested with) +"; + +sub usage() { + print $usage_str; +} + +while (@ARGV) { + my $opt = shift; + + if ($opt =~ /^-(no-)?fix-unreq$/) { + $fix_unreq = $1 ne "no-"; + } elsif ($opt eq "-force-thumb") { + $force_thumb = 1; + } elsif ($opt eq "-arch") { + $arch = shift; + die "unknown arch: '$arch'\n" if not exists $comments{$arch}; + } elsif ($opt eq "-as-type") { + $as_type = shift; + die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/; + } elsif ($opt eq "-help") { + usage(); + exit 0; + } elsif ($opt eq "--" ) { + @gcc_cmd = @ARGV; + } elsif ($opt =~ /^-/) { + die "option '$opt' is not known. See '$0 -help' for usage information\n"; + } else { + push @gcc_cmd, $opt, @ARGV; + } + last if (@gcc_cmd); +} + +if (grep /\.c$/, @gcc_cmd) { + # C file (inline asm?) - compile + @preprocess_c_cmd = (@gcc_cmd, "-S"); +} elsif (grep /\.[sS]$/, @gcc_cmd) { + # asm file, just do C preprocessor + @preprocess_c_cmd = (@gcc_cmd, "-E"); +} elsif (grep /-(v|h|-version|dumpversion)/, @gcc_cmd) { + # pass -v/--version along, used during probing. Matching '-v' might have + # uninteded results but it doesn't matter much if gas-preprocessor or + # the compiler fails. + exec(@gcc_cmd); +} else { + die "Unrecognized input filetype"; +} +if ($as_type eq "armasm") { + + $preprocess_c_cmd[0] = "cpp"; + push(@preprocess_c_cmd, "-U__ELF__"); + push(@preprocess_c_cmd, "-U__MACH__"); + + @preprocess_c_cmd = grep ! 
/^-nologo$/, @preprocess_c_cmd; + # Remove -ignore XX parameter pairs from preprocess_c_cmd + my $index = 1; + while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-ignore" and $index + 1 < $#preprocess_c_cmd) { + splice(@preprocess_c_cmd, $index, 2); + next; + } + $index++; + } + if (grep /^-MM$/, @preprocess_c_cmd) { + system(@preprocess_c_cmd) == 0 or die "Error running preprocessor"; + exit 0; + } +} + +# if compiling, avoid creating an output file named '-.o' +if ((grep /^-c$/, @gcc_cmd) && !(grep /^-o/, @gcc_cmd)) { + foreach my $i (@gcc_cmd) { + if ($i =~ /\.[csS]$/) { + my $outputfile = $i; + $outputfile =~ s/\.[csS]$/.o/; + push(@gcc_cmd, "-o"); + push(@gcc_cmd, $outputfile); + last; + } + } +} +# replace only the '-o' argument with '-', avoids rewriting the make dependency +# target specified with -MT to '-' +my $index = 1; +while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-o") { + $index++; + $preprocess_c_cmd[$index] = "-"; + } + $index++; +} + +my $tempfile; +if ($as_type ne "armasm") { + @gcc_cmd = map { /\.[csS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; +} else { + @preprocess_c_cmd = grep ! /^-c$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-m/, @preprocess_c_cmd; + + @preprocess_c_cmd = grep ! /^-G/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-W/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-Z/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd; + + @gcc_cmd = grep ! /^-G/, @gcc_cmd; + @gcc_cmd = grep ! /^-W/, @gcc_cmd; + @gcc_cmd = grep ! /^-Z/, @gcc_cmd; + @gcc_cmd = grep ! /^-fp/, @gcc_cmd; + @gcc_cmd = grep ! /^-EHsc$/, @gcc_cmd; + @gcc_cmd = grep ! /^-O/, @gcc_cmd; + + my @outfiles = grep /\.(o|obj)$/, @gcc_cmd; + $tempfile = $outfiles[0].".asm"; + + # Remove most parameters from gcc_cmd, which actually is the armasm command, + # which doesn't support any of the common compiler/preprocessor options. + @gcc_cmd = grep ! /^-D/, @gcc_cmd; + @gcc_cmd = grep ! /^-U/, @gcc_cmd; + @gcc_cmd = grep ! /^-m/, @gcc_cmd; + @gcc_cmd = grep ! /^-M/, @gcc_cmd; + @gcc_cmd = grep ! /^-c$/, @gcc_cmd; + @gcc_cmd = grep ! /^-I/, @gcc_cmd; + @gcc_cmd = map { /\.S$/ ? $tempfile : $_ } @gcc_cmd; +} + +# detect architecture from gcc binary name +if (!$arch) { + if ($gcc_cmd[0] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } else { + # look for -arch flag + foreach my $i (1 .. 
$#gcc_cmd-1) { + if ($gcc_cmd[$i] eq "-arch" and + $gcc_cmd[$i+1] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } + } + } +} + +# assume we're not cross-compiling if no -arch or the binary doesn't have the arch name +$arch = qx/arch/ if (!$arch); + +die "Unknown target architecture '$arch'" if not exists $canonical_arch{$arch}; + +$arch = $canonical_arch{$arch}; +$comm = $comments{$arch}; +my $inputcomm = $comm; +$comm = ";" if $as_type =~ /armasm/; + +my %ppc_spr = (ctr => 9, + vrsave => 256); + +open(INPUT, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; + +if ($ENV{GASPP_DEBUG}) { + open(ASMFILE, ">&STDOUT"); +} else { + if ($as_type ne "armasm") { + open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; + } else { + open(ASMFILE, ">", $tempfile); + } +} + +my $current_macro = ''; +my $macro_level = 0; +my $rept_level = 0; +my %macro_lines; +my %macro_args; +my %macro_args_default; +my $macro_count = 0; +my $altmacro = 0; +my $in_irp = 0; + +my $num_repts; +my @rept_lines; + +my @irp_args; +my $irp_param; + +my @ifstack; + +my %symbols; + +my @sections; + +my %literal_labels; # for ldr <reg>, =<expr> +my $literal_num = 0; +my $literal_expr = ".word"; +$literal_expr = ".quad" if $arch eq "aarch64"; + +my $thumb = 0; + +my %thumb_labels; +my %call_targets; +my %mov32_targets; + +my %neon_alias_reg; +my %neon_alias_type; + +my $temp_label_next = 0; +my %last_temp_labels; +my %next_temp_labels; + +my %labels_seen; + +my %aarch64_req_alias; + +if ($force_thumb) { + parse_line(".thumb\n"); +} + +# pass 1: parse .macro +# note that the handling of arguments is probably overly permissive vs. gas +# but it should be the same for valid cases +while (<INPUT>) { + # remove lines starting with '#', preprocessing is done, '#' at start of + # the line indicates a comment for all supported archs (aarch64, arm, ppc + # and x86). Also strips line number comments but since they are off anyway + # it is no loss. + s/^#.*$//; + # remove all comments (to avoid interfering with evaluating directives) + s/(?<!\\)$inputcomm.*//x; + # Strip out windows linefeeds + s/\r$//; + + foreach my $subline (split(";", $_)) { + # Add newlines at the end of lines that don't already have one + chomp $subline; + $subline .= "\n"; + parse_line($subline); + } +} + +sub eval_expr { + my $expr = $_[0]; + while ($expr =~ /([A-Za-z._][A-Za-z0-9._]*)/g) { + my $sym = $1; + $expr =~ s/$sym/($symbols{$sym})/ if defined $symbols{$sym}; + } + eval $expr; +} + +sub handle_if { + my $line = $_[0]; + # handle .if directives; apple's assembler doesn't support important non-basic ones + # evaluating them is also needed to handle recursive macros + if ($line =~ /\.if(n?)([a-z]*)\s+(.*)/) { + my $result = $1 eq "n"; + my $type = $2; + my $expr = $3; + + if ($type eq "b") { + $expr =~ s/\s//g; + $result ^= $expr eq ""; + } elsif ($type eq "c") { + if ($expr =~ /(.*)\s*,\s*(.*)/) { + $result ^= $1 eq $2; + } else { + die "argument to .ifc not recognized"; + } + } elsif ($type eq "") { + $result ^= eval_expr($expr) != 0; + } elsif ($type eq "eq") { + $result = eval_expr($expr) == 0; + } elsif ($type eq "lt") { + $result = eval_expr($expr) < 0; + } else { + chomp($line); + die "unhandled .if varient. 
\"$line\""; + } + push (@ifstack, $result); + return 1; + } else { + return 0; + } +} + +sub parse_if_line { + my $line = $_[0]; + + # evaluate .if blocks + if (scalar(@ifstack)) { + # Don't evaluate any new if statements if we're within + # a repetition or macro - they will be evaluated once + # the repetition is unrolled or the macro is expanded. + if (scalar(@rept_lines) == 0 and $macro_level == 0) { + if ($line =~ /\.endif/) { + pop(@ifstack); + return 1; + } elsif ($line =~ /\.elseif\s+(.*)/) { + if ($ifstack[-1] == 0) { + $ifstack[-1] = !!eval_expr($1); + } elsif ($ifstack[-1] > 0) { + $ifstack[-1] = -$ifstack[-1]; + } + return 1; + } elsif ($line =~ /\.else/) { + $ifstack[-1] = !$ifstack[-1]; + return 1; + } elsif (handle_if($line)) { + return 1; + } + } + + # discard lines in false .if blocks + foreach my $i (0 .. $#ifstack) { + if ($ifstack[$i] <= 0) { + return 1; + } + } + } + return 0; +} + +sub parse_line { + my $line = $_[0]; + + return if (parse_if_line($line)); + + if (scalar(@rept_lines) == 0) { + if (/\.macro/) { + $macro_level++; + if ($macro_level > 1 && !$current_macro) { + die "nested macros but we don't have master macro"; + } + } elsif (/\.endm/) { + $macro_level--; + if ($macro_level < 0) { + die "unmatched .endm"; + } elsif ($macro_level == 0) { + $current_macro = ''; + return; + } + } + } + + if ($macro_level == 0) { + if ($line =~ /\.(rept|irp)/) { + $rept_level++; + } elsif ($line =~ /.endr/) { + $rept_level--; + } + } + + if ($macro_level > 1) { + push(@{$macro_lines{$current_macro}}, $line); + } elsif (scalar(@rept_lines) and $rept_level >= 1) { + push(@rept_lines, $line); + } elsif ($macro_level == 0) { + expand_macros($line); + } else { + if ($line =~ /\.macro\s+([\d\w\.]+)\s*,?\s*(.*)/) { + $current_macro = $1; + + # commas in the argument list are optional, so only use whitespace as the separator + my $arglist = $2; + $arglist =~ s/,/ /g; + + my @args = split(/\s+/, $arglist); + foreach my $i (0 .. 
$#args) { + my @argpair = split(/=/, $args[$i]); + $macro_args{$current_macro}[$i] = $argpair[0]; + $argpair[0] =~ s/:vararg$//; + $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; + } + # ensure %macro_lines has the macro name added as a key + $macro_lines{$current_macro} = []; + + } elsif ($current_macro) { + push(@{$macro_lines{$current_macro}}, $line); + } else { + die "macro level without a macro name"; + } + } +} + +sub handle_set { + my $line = $_[0]; + if ($line =~ /\.set\s+(.*),\s*(.*)/) { + $symbols{$1} = eval_expr($2); + return 1; + } + return 0; +} + +sub expand_macros { + my $line = $_[0]; + + # handle .if directives; apple's assembler doesn't support important non-basic ones + # evaluating them is also needed to handle recursive macros + if (handle_if($line)) { + return; + } + + if (/\.purgem\s+([\d\w\.]+)/) { + delete $macro_lines{$1}; + delete $macro_args{$1}; + delete $macro_args_default{$1}; + return; + } + + if ($line =~ /\.altmacro/) { + $altmacro = 1; + return; + } + + if ($line =~ /\.noaltmacro/) { + $altmacro = 0; + return; + } + + $line =~ s/\%([^,]*)/eval_expr($1)/eg if $altmacro; + + # Strip out the .set lines from the armasm output + return if (handle_set($line) and $as_type eq "armasm"); + + if ($line =~ /\.rept\s+(.*)/) { + $num_repts = $1; + @rept_lines = ("\n"); + + # handle the possibility of repeating another directive on the same line + # .endr on the same line is not valid, I don't know if a non-directive is + if ($num_repts =~ s/(\.\w+.*)//) { + push(@rept_lines, "$1\n"); + } + $num_repts = eval_expr($num_repts); + } elsif ($line =~ /\.irp\s+([\d\w\.]+)\s*(.*)/) { + $in_irp = 1; + $num_repts = 1; + @rept_lines = ("\n"); + $irp_param = $1; + + # only use whitespace as the separator + my $irp_arglist = $2; + $irp_arglist =~ s/,/ /g; + $irp_arglist =~ s/^\s+//; + @irp_args = split(/\s+/, $irp_arglist); + } elsif ($line =~ /\.irpc\s+([\d\w\.]+)\s*(.*)/) { + $in_irp = 1; + $num_repts = 1; + @rept_lines = ("\n"); + $irp_param = $1; + + my $irp_arglist = $2; + $irp_arglist =~ s/,/ /g; + $irp_arglist =~ s/^\s+//; + @irp_args = split(//, $irp_arglist); + } elsif ($line =~ /\.endr/) { + my @prev_rept_lines = @rept_lines; + my $prev_in_irp = $in_irp; + my @prev_irp_args = @irp_args; + my $prev_irp_param = $irp_param; + my $prev_num_repts = $num_repts; + @rept_lines = (); + $in_irp = 0; + @irp_args = ''; + + if ($prev_in_irp != 0) { + foreach my $i (@prev_irp_args) { + foreach my $origline (@prev_rept_lines) { + my $line = $origline; + $line =~ s/\\$prev_irp_param/$i/g; + $line =~ s/\\\(\)//g; # remove \() + parse_line($line); + } + } + } else { + for (1 .. $prev_num_repts) { + foreach my $origline (@prev_rept_lines) { + my $line = $origline; + parse_line($line); + } + } + } + } elsif ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { + handle_serialized_line($1); + my $macro = $2; + + # commas are optional here too, but are syntactically important because + # parameters can be blank + my @arglist = split(/,/, $3); + my @args; + my @args_seperator; + + my $comma_sep_required = 0; + foreach (@arglist) { + # allow arithmetic/shift operators in macro arguments + $_ =~ s/\s*(\+|-|\*|\/|<<|>>|<|>)\s*/$1/g; + + my @whitespace_split = split(/\s+/, $_); + if (!@whitespace_split) { + push(@args, ''); + push(@args_seperator, ''); + } else { + foreach (@whitespace_split) { + #print ("arglist = \"$_\"\n"); + if (length($_)) { + push(@args, $_); + my $sep = $comma_sep_required ? 
"," : " "; + push(@args_seperator, $sep); + #print ("sep = \"$sep\", arg = \"$_\"\n"); + $comma_sep_required = 0; + } + } + } + + $comma_sep_required = 1; + } + + my %replacements; + if ($macro_args_default{$macro}){ + %replacements = %{$macro_args_default{$macro}}; + } + + # construct hashtable of text to replace + foreach my $i (0 .. $#args) { + my $argname = $macro_args{$macro}[$i]; + my @macro_args = @{ $macro_args{$macro} }; + if ($args[$i] =~ m/=/) { + # arg=val references the argument name + # XXX: I'm not sure what the expected behaviour if a lot of + # these are mixed with unnamed args + my @named_arg = split(/=/, $args[$i]); + $replacements{$named_arg[0]} = $named_arg[1]; + } elsif ($i > $#{$macro_args{$macro}}) { + # more args given than the macro has named args + # XXX: is vararg allowed on arguments before the last? + $argname = $macro_args{$macro}[-1]; + if ($argname =~ s/:vararg$//) { + #print "macro = $macro, args[$i] = $args[$i], args_seperator=@args_seperator, argname = $argname, arglist[$i] = $arglist[$i], arglist = @arglist, args=@args, macro_args=@macro_args\n"; + #$replacements{$argname} .= ", $args[$i]"; + $replacements{$argname} .= "$args_seperator[$i] $args[$i]"; + } else { + die "Too many arguments to macro $macro"; + } + } else { + $argname =~ s/:vararg$//; + $replacements{$argname} = $args[$i]; + } + } + + my $count = $macro_count++; + + # apply replacements as regex + foreach (@{$macro_lines{$macro}}) { + my $macro_line = $_; + # do replacements by longest first, this avoids wrong replacement + # when argument names are subsets of each other + foreach (reverse sort {length $a <=> length $b} keys %replacements) { + $macro_line =~ s/\\$_/$replacements{$_}/g; + } + if ($altmacro) { + foreach (reverse sort {length $a <=> length $b} keys %replacements) { + $macro_line =~ s/\b$_\b/$replacements{$_}/g; + } + } + $macro_line =~ s/\\\@/$count/g; + $macro_line =~ s/\\\(\)//g; # remove \() + parse_line($macro_line); + } + } else { + handle_serialized_line($line); + } +} + +sub is_arm_register { + my $name = $_[0]; + if ($name eq "lr" or + $name eq "ip" or + $name =~ /^[rav]\d+$/) { + return 1; + } + return 0; +} + +sub handle_local_label { + my $line = $_[0]; + my $num = $_[1]; + my $dir = $_[2]; + my $target = "$num$dir"; + if ($dir eq "b") { + $line =~ s/$target/$last_temp_labels{$num}/g; + } else { + my $name = "temp_label_$temp_label_next"; + $temp_label_next++; + push(@{$next_temp_labels{$num}}, $name); + $line =~ s/$target/$name/g; + } + return $line; +} + +sub handle_serialized_line { + my $line = $_[0]; + + # handle .previous (only with regard to .section not .subsection) + if ($line =~ /\.(section|text|const_data)/) { + push(@sections, $line); + } elsif ($line =~ /\.previous/) { + if (!$sections[-2]) { + die ".previous without a previous section"; + } + $line = $sections[-2]; + push(@sections, $line); + } + + $thumb = 1 if $line =~ /\.code\s+16|\.thumb/; + $thumb = 0 if $line =~ /\.code\s+32|\.arm/; + + # handle ldr <reg>, =<expr> + if ($line =~ /(.*)\s*ldr([\w\s\d]+)\s*,\s*=(.*)/ and $as_type ne "armasm") { + my $label = $literal_labels{$3}; + if (!$label) { + $label = "Literal_$literal_num"; + $literal_num++; + $literal_labels{$3} = $label; + } + $line = "$1 ldr$2, $label\n"; + } elsif ($line =~ /\.ltorg/ and $as_type ne "armasm") { + $line .= ".align 2\n"; + foreach my $literal (keys %literal_labels) { + $line .= "$literal_labels{$literal}:\n $literal_expr $literal\n"; + } + %literal_labels = (); + } + + # handle GNU as pc-relative relocations for adrp/add + 
if ($line =~ /(.*)\s*adrp([\w\s\d]+)\s*,\s*#?:pg_hi21:([^\s]+)/) { + $line = "$1 adrp$2, ${3}\@PAGE\n"; + } elsif ($line =~ /(.*)\s*add([\w\s\d]+)\s*,([\w\s\d]+)\s*,\s*#?:lo12:([^\s]+)/) { + $line = "$1 add$2, $3, ${4}\@PAGEOFF\n"; + } + + # thumb add with large immediate needs explicit add.w + if ($thumb and $line =~ /add\s+.*#([^@]+)/) { + $line =~ s/add/add.w/ if eval_expr($1) > 255; + } + + # mach-o local symbol names start with L (no dot) + $line =~ s/(?<!\w)\.(L\w+)/$1/g; + + # recycle the '.func' directive for '.thumb_func' + if ($thumb and $as_type =~ /^apple-/) { + $line =~ s/\.func/.thumb_func/x; + } + + if ($thumb and $line =~ /^\s*(\w+)\s*:/) { + $thumb_labels{$1}++; + } + + if ($as_type =~ /^apple-/ and + $line =~ /^\s*((\w+\s*:\s*)?bl?x?(..)?(?:\.w)?|\.global)\s+(\w+)/) { + my $cond = $3; + my $label = $4; + # Don't interpret e.g. bic as b<cc> with ic as conditional code + if ($cond =~ /|$arm_cond_codes/) { + if (exists $thumb_labels{$label}) { + print ASMFILE ".thumb_func $label\n"; + } else { + $call_targets{$label}++; + } + } + } + + # @l -> lo16() @ha -> ha16() + $line =~ s/,\s+([^,]+)\@l\b/, lo16($1)/g; + $line =~ s/,\s+([^,]+)\@ha\b/, ha16($1)/g; + + # move to/from SPR + if ($line =~ /(\s+)(m[ft])([a-z]+)\s+(\w+)/ and exists $ppc_spr{$3}) { + if ($2 eq 'mt') { + $line = "$1${2}spr $ppc_spr{$3}, $4\n"; + } else { + $line = "$1${2}spr $4, $ppc_spr{$3}\n"; + } + } + + if ($line =~ /\.unreq\s+(.*)/) { + if (defined $neon_alias_reg{$1}) { + delete $neon_alias_reg{$1}; + delete $neon_alias_type{$1}; + return; + } elsif (defined $aarch64_req_alias{$1}) { + delete $aarch64_req_alias{$1}; + return; + } + } + # old gas versions store upper and lower case names on .req, + # but they remove only one on .unreq + if ($fix_unreq) { + if ($line =~ /\.unreq\s+(.*)/) { + $line = ".unreq " . lc($1) . "\n"; + $line .= ".unreq " . uc($1) . "\n"; + } + } + + if ($line =~ /(\w+)\s+\.(dn|qn)\s+(\w+)(?:\.(\w+))?(\[\d+\])?/) { + $neon_alias_reg{$1} = "$3$5"; + $neon_alias_type{$1} = $4; + return; + } + if (scalar keys %neon_alias_reg > 0 && $line =~ /^\s+v\w+/) { + # This line seems to possibly have a neon instruction + foreach (keys %neon_alias_reg) { + my $alias = $_; + # Require the register alias to match as an invididual word, not as a substring + # of a larger word-token. + if ($line =~ /\b$alias\b/) { + $line =~ s/\b$alias\b/$neon_alias_reg{$alias}/g; + # Add the type suffix. If multiple aliases match on the same line, + # only do this replacement the first time (a vfoo.bar string won't match v\w+). 
+ $line =~ s/^(\s+)(v\w+)(\s+)/$1$2.$neon_alias_type{$alias}$3/; + } + } + } + + if ($arch eq "aarch64" or $as_type eq "armasm") { + # clang's integrated aarch64 assembler in Xcode 5 does not support .req/.unreq + if ($line =~ /\b(\w+)\s+\.req\s+(\w+)\b/) { + $aarch64_req_alias{$1} = $2; + return; + } + foreach (keys %aarch64_req_alias) { + my $alias = $_; + # recursively resolve aliases + my $resolved = $aarch64_req_alias{$alias}; + while (defined $aarch64_req_alias{$resolved}) { + $resolved = $aarch64_req_alias{$resolved}; + } + $line =~ s/\b$alias\b/$resolved/g; + } + } + if ($arch eq "aarch64") { + # fix missing aarch64 instructions in Xcode 5.1 (beta3) + # mov with vector arguments is not supported, use alias orr instead + if ($line =~ /^\s*mov\s+(v\d[\.{}\[\]\w]+),\s*(v\d[\.{}\[\]\w]+)\b\s*$/) { + $line = " orr $1, $2, $2\n"; + } + # movi 16, 32 bit shifted variant, shift is optional + if ($line =~ /^\s*movi\s+(v[0-3]?\d\.(?:2|4|8)[hsHS])\s*,\s*(#\w+)\b\s*$/) { + $line = " movi $1, $2, lsl #0\n"; + } + # Xcode 5 misses the alias uxtl. Replace it with the more general ushll. + # Clang 3.4 misses the alias sxtl too. Replace it with the more general sshll. + if ($line =~ /^\s*(s|u)xtl(2)?\s+(v[0-3]?\d\.[248][hsdHSD])\s*,\s*(v[0-3]?\d\.(?:2|4|8|16)[bhsBHS])\b\s*$/) { + $line = " $1shll$2 $3, $4, #0\n"; + } + # clang 3.4 does not automatically use shifted immediates in add/sub + if ($as_type eq "clang" and + $line =~ /^(\s*(?:add|sub)s?) ([^#l]+)#([\d\+\-\*\/ <>]+)\s*$/) { + my $imm = eval $3; + if ($imm > 4095 and not ($imm & 4095)) { + $line = "$1 $2#" . ($imm >> 12) . ", lsl #12\n"; + } + } + if ($ENV{GASPP_FIX_XCODE5}) { + if ($line =~ /^\s*bsl\b/) { + $line =~ s/\b(bsl)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; + $line =~ s/\b(v[0-3]?\d)\.$3\b/$1/g; + } + if ($line =~ /^\s*saddl2?\b/) { + $line =~ s/\b(saddl2?)(\s+v[0-3]?\d\.(\w+))\b/$1.$3$2/; + $line =~ s/\b(v[0-3]?\d)\.\w+\b/$1/g; + } + if ($line =~ /^\s*dup\b.*\]$/) { + $line =~ s/\bdup(\s+v[0-3]?\d)\.(\w+)\b/dup.$2$1/g; + $line =~ s/\b(v[0-3]?\d)\.[bhsdBHSD](\[\d\])$/$1$2/g; + } + } + } + + if ($as_type eq "armasm") { + # Also replace variables set by .set + foreach (keys %symbols) { + my $sym = $_; + $line =~ s/\b$sym\b/$symbols{$sym}/g; + } + + # Handle function declarations and keep track of the declared labels + if ($line =~ s/^\s*\.func\s+(\w+)/$1 PROC/) { + $labels_seen{$1} = 1; + } + + if ($line =~ s/^\s*(\d+)://) { + # Convert local labels into unique labels. armasm (at least in + # RVCT) has something similar, but still different enough. + # By converting to unique labels we avoid any possible + # incompatibilities. + + my $num = $1; + foreach (@{$next_temp_labels{$num}}) { + $line = "$_\n" . $line; + } + @next_temp_labels{$num} = (); + my $name = "temp_label_$temp_label_next"; + $temp_label_next++; + # The matching regexp above removes the label from the start of + # the line (which might contain an instruction as well), readd + # it on a separate line above it. + $line = "$name:\n" . $line; + $last_temp_labels{$num} = $name; + } + + if ($line =~ s/^(\w+):/$1/) { + # Skip labels that have already been declared with a PROC, + # labels must not be declared multiple times. + return if (defined $labels_seen{$1}); + $labels_seen{$1} = 1; + } elsif ($line !~ /(\w+) PROC/) { + # If not a label, make sure the line starts with whitespace, + # otherwise ms armasm interprets it incorrectly. 
+ $line =~ s/^[\.\w]/\t$&/; + } + + + # Check branch instructions + if ($line =~ /(?:^|\n)\s*(\w+\s*:\s*)?(bl?x?(..)?(\.w)?)\s+(\w+)/) { + my $instr = $2; + my $cond = $3; + my $width = $4; + my $target = $5; + # Don't interpret e.g. bic as b<cc> with ic as conditional code + if ($cond !~ /|$arm_cond_codes/) { + # Not actually a branch + } elsif ($target =~ /(\d+)([bf])/) { + # The target is a local label + $line = handle_local_label($line, $1, $2); + $line =~ s/\b$instr\b/$&.w/ if $width eq ""; + } elsif (!is_arm_register($target)) { + $call_targets{$target}++; + } + } elsif ($line =~ /^\s*.h?word.*\b\d+[bf]\b/) { + while ($line =~ /\b(\d+)([bf])\b/g) { + $line = handle_local_label($line, $1, $2); + } + } + + # ALIGN in armasm syntax is the actual number of bytes + if ($line =~ /\.align\s+(\d+)/) { + my $align = 1 << $1; + $line =~ s/\.align\s(\d+)/ALIGN $align/; + } + # Convert gas style [r0, :128] into armasm [r0@128] alignment specification + $line =~ s/\[([^\[]+),\s*:(\d+)\]/[$1\@$2]/g; + + # armasm treats logical values {TRUE} and {FALSE} separately from + # numeric values - logical operators and values can't be intermixed + # with numerical values. Evaluate !<number> and (a <> b) into numbers, + # let the assembler evaluate the rest of the expressions. This current + # only works for cases when ! and <> are used with actual constant numbers, + # we don't evaluate subexpressions here. + + # Evaluate !<number> + while ($line =~ /!\s*(\d+)/g) { + my $val = ($1 != 0) ? 0 : 1; + $line =~ s/!(\d+)/$val/; + } + # Evaluate (a > b) + while ($line =~ /\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/) { + my $val; + if ($2 eq "<") { + $val = ($1 < $3) ? 1 : 0; + } else { + $val = ($1 > $3) ? 1 : 0; + } + $line =~ s/\(\s*(\d+)\s*([<>])\s*(\d+)\s*\)/$val/; + } + + # Change a movw... #:lower16: into a mov32 pseudoinstruction + $line =~ s/^(\s*)movw(\s+\w+\s*,\s*)\#:lower16:(.*)$/$1mov32$2$3/; + # and remove the following, matching movt completely + $line =~ s/^\s*movt\s+\w+\s*,\s*\#:upper16:.*$//; + + if ($line =~ /^\s*mov32\s+\w+,\s*([a-zA-Z]\w*)/) { + $mov32_targets{$1}++; + } + + # Misc bugs/deficiencies: + # armasm seems unable to parse e.g. "vmov s0, s1" without a type + # qualifier, thus add .f32. + $line =~ s/^(\s+(?:vmov|vadd))(\s+s)/$1.f32$2/; + # armasm is unable to parse &0x - add spacing + $line =~ s/&0x/& 0x/g; + } + + if ($force_thumb) { + # Convert register post indexing to a separate add instruction. + # This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]", + # "add r1, r1, r2". + $line =~ s/(ldr|str)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g; + + # Convert "mov pc, lr" into "bx lr", since the former only works + # for switching from arm to thumb (and only in armv7), but not + # from thumb to arm. 
+ s/mov\s*pc\s*,\s*lr/bx lr/g; + + # Convert stmdb/ldmia with only one register into a plain str/ldr with post-increment/decrement + $line =~ s/stmdb\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g; + $line =~ s/ldmia\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g; + + $line =~ s/\.arm/.thumb/x; + } + + # comment out unsupported directives + $line =~ s/\.type/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.func/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.endfunc/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.endfunc/ENDP/x if $as_type =~ /armasm/; + $line =~ s/\.ltorg/$comm$&/x if $as_type =~ /^(apple-|clang)/; + $line =~ s/\.ltorg/LTORG/x if $as_type eq "armasm"; + $line =~ s/\.size/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.fpu/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/\.arch/$comm$&/x if $as_type =~ /^(apple-|clang|armasm)/; + $line =~ s/\.object_arch/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + $line =~ s/.section\s+.note.GNU-stack.*/$comm$&/x if $as_type =~ /^(apple-|armasm)/; + + $line =~ s/\.syntax/$comm$&/x if $as_type =~ /armasm/; + + $line =~ s/\.hword/.short/x; + + if ($as_type =~ /^apple-/) { + # the syntax for these is a little different + $line =~ s/\.global/.globl/x; + # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const + $line =~ s/(.*)\.rodata/.const_data/x; + $line =~ s/\.int/.long/x; + $line =~ s/\.float/.single/x; + } + if ($as_type eq "armasm") { + $line =~ s/\.global/EXPORT/x; + $line =~ s/\.int/dcd/x; + $line =~ s/\.long/dcd/x; + $line =~ s/\.float/dcfs/x; + $line =~ s/\.word/dcd/x; + $line =~ s/\.short/dcw/x; + $line =~ s/\.byte/dcb/x; + $line =~ s/\.thumb/THUMB/x; + $line =~ s/\.arm/ARM/x; + # The alignment in AREA is the power of two, just as .align in gas + $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=2, CODEALIGN/; + $line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, ALIGN=5/; + + $line =~ s/fmxr/vmsr/; + $line =~ s/fmrx/vmrs/; + $line =~ s/fadds/vadd.f32/; + } + + # catch unknown section names that aren't mach-o style (with a comma) + if ($as_type =~ /apple-/ and $line =~ /.section ([^,]*)$/) { + die ".section $1 unsupported; figure out the mach-o section name and add it"; + } + + print ASMFILE $line; +} + +if ($as_type ne "armasm") { + print ASMFILE ".text\n"; + print ASMFILE ".align 2\n"; + foreach my $literal (keys %literal_labels) { + print ASMFILE "$literal_labels{$literal}:\n $literal_expr $literal\n"; + } + + map print(ASMFILE ".thumb_func $_\n"), + grep exists $thumb_labels{$_}, keys %call_targets; +} else { + map print(ASMFILE "\tIMPORT $_\n"), + grep ! exists $labels_seen{$_}, (keys %call_targets, keys %mov32_targets); + + print ASMFILE "\tEND\n"; +} + +close(INPUT) or exit 1; +close(ASMFILE) or exit 1; +if ($as_type eq "armasm" and ! defined $ENV{GASPP_DEBUG}) { + system(@gcc_cmd) == 0 or die "Error running assembler"; +} + +END { + unlink($tempfile) if defined $tempfile; +} +#exit 1
View file
x264-snapshot-20141218-2245.tar.bz2/x264.c -> x264-snapshot-20150804-2245.tar.bz2/x264.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264: top-level x264cli functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -209,6 +209,13 @@ #endif 0 }; +static const char * const chroma_format_names[] = +{ + [0] = "all", + [X264_CSP_I420] = "i420", + [X264_CSP_I422] = "i422", + [X264_CSP_I444] = "i444" +}; static const char * const range_names[] = { "auto", "tv", "pc", 0 }; @@ -325,7 +332,8 @@ #else printf( "using an unknown compiler\n" ); #endif - printf( "configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, X264_CHROMA_FORMAT ? (output_csp_names[0]+1) : "all" ); + printf( "x264 configuration: --bit-depth=%d --chroma-format=%s\n", X264_BIT_DEPTH, chroma_format_names[X264_CHROMA_FORMAT] ); + printf( "libx264 configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, chroma_format_names[x264_chroma_format] ); printf( "x264 license: " ); #if HAVE_GPL printf( "GPL version 2 or later\n" ); @@ -533,7 +541,7 @@ " Overrides all settings.\n" ); H2( #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 " - baseline:\n" " --no-8x8dct --bframes 0 --no-cabac\n" " --cqm flat --weightp 0\n" @@ -561,7 +569,7 @@ else H0( " - " #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 "baseline,main,high," #endif "high10," @@ -703,7 +711,9 @@ " - 2: row alternation - L and R are interlaced by row\n" " - 3: side by side - L is on the left, R on the right\n" " - 4: top bottom - L is on top, R on bottom\n" - " - 5: frame alternation - one view per frame\n" ); + " - 5: frame alternation - one view per frame\n" + " - 6: mono - 2D frame without any frame packing\n" + " - 7: tile format - L is on top-left, R split across\n" ); H0( "\n" ); H0( "Ratecontrol:\n" ); H0( "\n" ); @@ -726,7 +736,8 @@ H2( " --aq-mode <integer> AQ method [%d]\n" " - 0: Disabled\n" " - 1: Variance AQ (complexity mask)\n" - " - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode ); + " - 2: Auto-variance AQ\n" + " - 3: Auto-variance AQ with bias to dark scenes\n", defaults->rc.i_aq_mode ); H1( " --aq-strength <float> Reduces blocking and blurring in flat and\n" " textured areas. [%.1f]\n", defaults->rc.f_aq_strength ); H1( "\n" ); @@ -1286,11 +1297,11 @@ /* force the output csp to what the user specified (or the default) */ param->i_csp = info->csp; int csp = info->csp & X264_CSP_MASK; - if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) ) + if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp >= X264_CSP_I422) ) param->i_csp = X264_CSP_I420; - else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) ) + else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp >= X264_CSP_I444) ) param->i_csp = X264_CSP_I422; - else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) ) + else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp >= X264_CSP_BGR) ) param->i_csp = X264_CSP_I444; else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) ) param->i_csp = X264_CSP_RGB;
View file
x264-snapshot-20141218-2245.tar.bz2/x264.h -> x264-snapshot-20150804-2245.tar.bz2/x264.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264.h: x264 public header ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -29,7 +29,7 @@ #define X264_X264_H #if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\ - !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) + !defined(_SYS_STDINT_H_) && !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") # else @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 142 +#define X264_BUILD 148 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -129,8 +129,8 @@ #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */ #define X264_CPU_XOP 0x0000800 /* AMD XOP */ #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x0002000 /* AVX2 */ -#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */ +#define X264_CPU_FMA3 0x0002000 /* FMA3 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ #define X264_CPU_BMI1 0x0008000 /* BMI1 */ #define X264_CPU_BMI2 0x0010000 /* BMI2 */ /* x86 modifiers */ @@ -158,6 +158,9 @@ #define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ #define X264_CPU_ARMV8 0x0000008 +/* MIPS */ +#define X264_CPU_MSA 0x0000001 /* MIPS MSA */ + /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ @@ -183,6 +186,7 @@ #define X264_AQ_NONE 0 #define X264_AQ_VARIANCE 1 #define X264_AQ_AUTOVARIANCE 2 +#define X264_AQ_AUTOVARIANCE_BIASED 3 #define X264_B_ADAPT_NONE 0 #define X264_B_ADAPT_FAST 1 #define X264_B_ADAPT_TRELLIS 2 @@ -213,16 +217,17 @@ #define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */ #define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */ #define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */ -#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */ -#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */ -#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000a /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000c /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000d /* end of list */ +#define X264_CSP_NV21 0x0004 /* yuv 4:2:0, with one y plane and one packed v+u */ +#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ +#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ +#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */ +#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */ +#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ +#define X264_CSP_MAX 0x000e /* end of list 
*/ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ @@ -234,7 +239,7 @@ #define X264_TYPE_BREF 0x0004 /* Non-disposable B-frame */ #define X264_TYPE_B 0x0005 #define X264_TYPE_KEYFRAME 0x0006 /* IDR or I depending on b_open_gop option */ -#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR) +#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR || (x)==X264_TYPE_KEYFRAME) #define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF) /* Log level */ @@ -789,8 +794,6 @@ /* In: force picture type (if not auto) * If x264 encoding parameters are violated in the forcing of picture types, * x264 will correct the input picture type and log a warning. - * The quality of frametype decisions may suffer if a great deal of fine-grained - * mixing of auto and forced frametypes is done. * Out: type of the picture encoded */ int i_type; /* In: force quantizer for != X264_QP_AUTO */
View file
x264-snapshot-20141218-2245.tar.bz2/x264cli.h -> x264-snapshot-20150804-2245.tar.bz2/x264cli.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264cli.h: x264cli common ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/x264dll.c -> x264-snapshot-20150804-2245.tar.bz2/x264dll.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264dll: x264 DLLMain for win32 ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> *
View file
x264-snapshot-20141218-2245.tar.bz2/x264res.rc -> x264-snapshot-20150804-2245.tar.bz2/x264res.rc
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264res.rc: windows resource file ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Henrik Gramner <henrik@gramner.com> * @@ -60,7 +60,7 @@ #endif VALUE "FileVersion", X264_POINTVER VALUE "InternalName", "x264" - VALUE "LegalCopyright", "Copyright (C) 2003-2014 x264 project" + VALUE "LegalCopyright", "Copyright (C) 2003-2015 x264 project" #ifdef DLL VALUE "OriginalFilename", "libx264-" xstr(X264_BUILD) ".dll" #else