Projects
Essentials
libx264
Sign Up
Log In
Username
Password
We truncated the diff of some files because they were too big. If you want to see the full diff for every file, click here.
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
Expand all
Collapse all
Changes of Revision 10
View file
libx264.changes
Changed
@@ -1,4 +1,9 @@ ------------------------------------------------------------------- +Wed Aug 5 13:04:18 UTC 2015 - idonmez@suse.com + +- update to 20150804 snapshot + +------------------------------------------------------------------- Sun Mar 1 09:33:42 UTC 2015 - i@margueirte.su - update version 20141218
View file
libx264.spec
Changed
@@ -16,8 +16,8 @@ # -%define soname 142 -%define svn 20141218 +%define soname 148 +%define svn 20150804 Name: libx264 Version: 0.%{soname}svn%{svn} Release: 0
View file
x264-snapshot-20141218-2245.tar.bz2/extras/gas-preprocessor.pl
Deleted
@@ -1,253 +0,0 @@ -#!/usr/bin/env perl -# by David Conrad -# This code is licensed under GPLv2 or later; go to gnu.org to read it -# (not that it much matters for an asm preprocessor) -# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" -use strict; - -# Apple's gas is ancient and doesn't support modern preprocessing features like -# .rept and has ugly macro syntax, among other things. Thus, this script -# implements the subset of the gas preprocessor used by x264 and ffmpeg -# that isn't supported by Apple's gas. - -my @gcc_cmd = @ARGV; -my @preprocess_c_cmd; - -if (grep /\.c$/, @gcc_cmd) { - # C file (inline asm?) - compile - @preprocess_c_cmd = (@gcc_cmd, "-S"); -} elsif (grep /\.S$/, @gcc_cmd) { - # asm file, just do C preprocessor - @preprocess_c_cmd = (@gcc_cmd, "-E"); -} else { - die "Unrecognized input filetype"; -} -@gcc_cmd = map { /\.[cS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; -@preprocess_c_cmd = map { /\.o$/ ? "-" : $_ } @preprocess_c_cmd; - -open(ASMFILE, "-|", @preprocess_c_cmd) || die "Error running preprocessor"; - -my $current_macro = ''; -my %macro_lines; -my %macro_args; -my %macro_args_default; - -my @pass1_lines; - -# pass 1: parse .macro -# note that the handling of arguments is probably overly permissive vs. 
gas -# but it should be the same for valid cases -while (<ASMFILE>) { - # comment out unsupported directives - s/\.type/@.type/x; - s/\.func/@.func/x; - s/\.endfunc/@.endfunc/x; - s/\.ltorg/@.ltorg/x; - s/\.size/@.size/x; - s/\.fpu/@.fpu/x; - - # the syntax for these is a little different - s/\.global/.globl/x; - # also catch .section .rodata since the equivalent to .const_data is .section __DATA,__const - s/(.*)\.rodata/.const_data/x; - s/\.int/.long/x; - s/\.float/.single/x; - - # catch unknown section names that aren't mach-o style (with a comma) - if (/.section ([^,]*)$/) { - die ".section $1 unsupported; figure out the mach-o section name and add it"; - } - - # macros creating macros is not handled (is that valid?) - if (/\.macro\s+([\d\w\.]+)\s*(.*)/) { - $current_macro = $1; - - # commas in the argument list are optional, so only use whitespace as the separator - my $arglist = $2; - $arglist =~ s/,/ /g; - - my @args = split(/\s+/, $arglist); - foreach my $i (0 .. $#args) { - my @argpair = split(/=/, $args[$i]); - $macro_args{$current_macro}[$i] = $argpair[0]; - $argpair[0] =~ s/:vararg$//; - $macro_args_default{$current_macro}{$argpair[0]} = $argpair[1]; - } - # ensure %macro_lines has the macro name added as a key - $macro_lines{$current_macro} = []; - } elsif (/\.endm/) { - if (!$current_macro) { - die "ERROR: .endm without .macro"; - } - $current_macro = ''; - } elsif ($current_macro) { - push(@{$macro_lines{$current_macro}}, $_); - } else { - expand_macros($_); - } -} - -sub expand_macros { - my $line = @_[0]; - if ($line =~ /(\S+:|)\s*([\w\d\.]+)\s*(.*)/ && exists $macro_lines{$2}) { - push(@pass1_lines, $1); - my $macro = $2; - - # commas are optional here too, but are syntactically important because - # parameters can be blank - my @arglist = split(/,/, $3); - my @args; - foreach (@arglist) { - my @whitespace_split = split(/\s+/, $_); - if (!@whitespace_split) { - push(@args, ''); - } else { - foreach (@whitespace_split) { - if (length($_)) { - 
push(@args, $_); - } - } - } - } - - my %replacements; - if ($macro_args_default{$macro}){ - %replacements = %{$macro_args_default{$macro}}; - } - - # construct hashtable of text to replace - foreach my $i (0 .. $#args) { - my $argname = $macro_args{$macro}[$i]; - - if ($args[$i] =~ m/=/) { - # arg=val references the argument name - # XXX: I'm not sure what the expected behaviour if a lot of - # these are mixed with unnamed args - my @named_arg = split(/=/, $args[$i]); - $replacements{$named_arg[0]} = $named_arg[1]; - } elsif ($i > $#{$macro_args{$macro}}) { - # more args given than the macro has named args - # XXX: is vararg allowed on arguments before the last? - $argname = $macro_args{$macro}[-1]; - if ($argname =~ s/:vararg$//) { - $replacements{$argname} .= ", $args[$i]"; - } else { - die "Too many arguments to macro $macro"; - } - } else { - $argname =~ s/:vararg$//; - $replacements{$argname} = $args[$i]; - } - } - - # apply replacements as regex - foreach (@{$macro_lines{$macro}}) { - my $macro_line = $_; - # do replacements by longest first, this avoids wrong replacement - # when argument names are subsets of each other - foreach (reverse sort {length $a <=> length $b} keys %replacements) { - $macro_line =~ s/\\$_/$replacements{$_}/g; - } - $macro_line =~ s/\\\(\)//g; # remove \() - expand_macros($macro_line); - } - } else { - push(@pass1_lines, $line); - } -} - -close(ASMFILE) or exit 1; -open(ASMFILE, "|-", @gcc_cmd) or die "Error running assembler"; - -my @sections; -my $num_repts; -my $rept_lines; - -my %literal_labels; # for ldr <reg>, =<expr> -my $literal_num = 0; - -# pass 2: parse .rept and .if variants -# NOTE: since we don't implement a proper parser, using .rept with a -# variable assigned from .set is not supported -foreach my $line (@pass1_lines) { - # textual comparison .if - # this assumes nothing else on the same line - if ($line =~ /\.ifnb\s+(.*)/) { - if ($1) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } elsif ($line =~ 
/\.ifb\s+(.*)/) { - if ($1) { - $line = ".if 0\n"; - } else { - $line = ".if 1\n"; - } - } elsif ($line =~ /\.ifc\s+(.*)\s*,\s*(.*)/) { - if ($1 eq $2) { - $line = ".if 1\n"; - } else { - $line = ".if 0\n"; - } - } - - # handle .previous (only with regard to .section not .subsection) - if ($line =~ /\.(section|text|const_data)/) { - push(@sections, $line); - } elsif ($line =~ /\.previous/) { - if (!$sections[-2]) {
View file
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts
Deleted
-(directory)
View file
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/basicDataTypeConversions.h
Deleted
@@ -1,85 +0,0 @@ -#ifndef __DATA_TYPE_CONVERSIONS_H__ -#define __DATA_TYPE_CONVERSIONS_H__ - -#include <stdint.h> -#include <wchar.h> - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus - -typedef int64_t __int64; -typedef int32_t __int32; -#ifdef __cplusplus -typedef bool BOOL; -#else -typedef uint32_t BOOL; -#endif // __cplusplus -typedef void* HMODULE; -typedef void* LPVOID; -typedef void* PVOID; -typedef PVOID HANDLE; -typedef HANDLE HWND; -typedef HANDLE HINSTANCE; -typedef void* HDC; -typedef void* HBITMAP; -typedef void* HICON; -typedef void* HFONT; -typedef void* HGDIOBJ; -typedef void* HBRUSH; -typedef void* HMMIO; -typedef void* HACMSTREAM; -typedef void* HACMDRIVER; -typedef void* HIC; -typedef void* HACMOBJ; -typedef HACMSTREAM* LPHACMSTREAM; -typedef void* HACMDRIVERID; -typedef void* LPHACMDRIVER; -typedef unsigned char BYTE; -typedef BYTE* LPBYTE; -typedef char TCHAR; -typedef TCHAR* LPTSTR; -typedef const TCHAR* LPCTSTR; -typedef char* LPSTR; -typedef LPSTR LPOLESTR; -typedef const char* LPCSTR; -typedef LPCSTR LPCOLESTR; -typedef wchar_t WCHAR; -typedef unsigned short WORD; -typedef unsigned int UINT; -typedef UINT MMRESULT; -typedef uint32_t DWORD; -typedef DWORD COLORREF; -typedef DWORD FOURCC; -typedef DWORD HRESULT; -typedef DWORD* LPDWORD; -typedef DWORD* DWORD_PTR; -typedef int32_t LONG; -typedef int32_t* LONG_PTR; -typedef LONG_PTR LRESULT; -typedef uint32_t ULONG; -typedef uint32_t* ULONG_PTR; -//typedef __int64_t intptr_t; -typedef uint64_t _fsize_t; - - -// -// Structures -// - -typedef struct _GUID { - DWORD Data1; - WORD Data2; - WORD Data3; - BYTE Data4[8]; -} GUID; - -typedef GUID REFIID; -typedef GUID CLSID; -typedef CLSID* LPCLSID; -typedef GUID IID; - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus -#endif // __DATA_TYPE_CONVERSIONS_H__
View file
x264-snapshot-20141218-2245.tar.bz2/extras/windowsPorts/windows2linux.h
Deleted
@@ -1,77 +0,0 @@ -#ifndef __WINDOWS2LINUX_H__ -#define __WINDOWS2LINUX_H__ - -/* - * LINUX SPECIFIC DEFINITIONS -*/ -// -// Data types conversions -// -#include <stdlib.h> -#include <string.h> -#include "basicDataTypeConversions.h" - -#ifdef __cplusplus -namespace avxsynth { -#endif // __cplusplus -// -// purposefully define the following MSFT definitions -// to mean nothing (as they do not mean anything on Linux) -// -#define __stdcall -#define __cdecl -#define noreturn -#define __declspec(x) -#define STDAPI extern "C" HRESULT -#define STDMETHODIMP HRESULT __stdcall -#define STDMETHODIMP_(x) x __stdcall - -#define STDMETHOD(x) virtual HRESULT x -#define STDMETHOD_(a, x) virtual a x - -#ifndef TRUE -#define TRUE true -#endif - -#ifndef FALSE -#define FALSE false -#endif - -#define S_OK (0x00000000) -#define S_FALSE (0x00000001) -#define E_NOINTERFACE (0X80004002) -#define E_POINTER (0x80004003) -#define E_FAIL (0x80004005) -#define E_OUTOFMEMORY (0x8007000E) - -#define INVALID_HANDLE_VALUE ((HANDLE)((LONG_PTR)-1)) -#define FAILED(hr) ((hr) & 0x80000000) -#define SUCCEEDED(hr) (!FAILED(hr)) - - -// -// Functions -// -#define MAKEDWORD(a,b,c,d) ((a << 24) | (b << 16) | (c << 8) | (d)) -#define MAKEWORD(a,b) ((a << 8) | (b)) - -#define lstrlen strlen -#define lstrcpy strcpy -#define lstrcmpi strcasecmp -#define _stricmp strcasecmp -#define InterlockedIncrement(x) __sync_fetch_and_add((x), 1) -#define InterlockedDecrement(x) __sync_fetch_and_sub((x), 1) -// Windows uses (new, old) ordering but GCC has (old, new) -#define InterlockedCompareExchange(x,y,z) __sync_val_compare_and_swap(x,z,y) - -#define UInt32x32To64(a, b) ( (uint64_t) ( ((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)) ) ) -#define Int64ShrlMod32(a, b) ( (uint64_t) ( (uint64_t)(a) >> (b) ) ) -#define Int32x32To64(a, b) ((__int64)(((__int64)((long)(a))) * ((long)(b)))) - -#define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / 
(int64_t) (nDenominator)) - -#ifdef __cplusplus -}; // namespace avxsynth -#endif // __cplusplus - -#endif // __WINDOWS2LINUX_H__
View file
x264-snapshot-20141218-2245.tar.bz2/AUTHORS -> x264-snapshot-20150804-2245.tar.bz2/AUTHORS
Changed
@@ -1,8 +1,8 @@ # Contributors to x264 -# +# # The format of this file was inspired by the Linux kernel CREDITS file. # Authors are listed alphabetically. -# +# # The fields are: name (N), email (E), web-address (W), CVS account login (C), # PGP key ID and fingerprint (P), description (D), and snail-mail address (S).
View file
x264-snapshot-20141218-2245.tar.bz2/Makefile -> x264-snapshot-20150804-2245.tar.bz2/Makefile
Changed
@@ -87,12 +87,12 @@ endif X86SRC = $(X86SRC0:%=common/x86/%) -ifeq ($(ARCH),X86) +ifeq ($(SYS_ARCH),X86) ARCH_X86 = yes ASMSRC = $(X86SRC) common/x86/pixel-32.asm endif -ifeq ($(ARCH),X86_64) +ifeq ($(SYS_ARCH),X86_64) ARCH_X86 = yes ASMSRC = $(X86SRC:-32.asm=-64.asm) common/x86/trellis-64.asm endif @@ -106,7 +106,7 @@ endif # AltiVec optims -ifeq ($(ARCH),PPC) +ifeq ($(SYS_ARCH),PPC) ifneq ($(AS),) SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \ common/ppc/quant.c common/ppc/deblock.c \ @@ -115,7 +115,7 @@ endif # NEON optims -ifeq ($(ARCH),ARM) +ifeq ($(SYS_ARCH),ARM) ifneq ($(AS),) ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \ common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \ @@ -126,20 +126,32 @@ endif # AArch64 NEON optims -ifeq ($(ARCH),AARCH64) +ifeq ($(SYS_ARCH),AARCH64) ifneq ($(AS),) -ASMSRC += common/aarch64/dct-a.S \ +ASMSRC += common/aarch64/bitstream-a.S \ + common/aarch64/cabac-a.S \ + common/aarch64/dct-a.S \ common/aarch64/deblock-a.S \ common/aarch64/mc-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S -SRCS += common/aarch64/mc-c.c \ +SRCS += common/aarch64/asm-offsets.c \ + common/aarch64/mc-c.c \ common/aarch64/predict-c.c OBJASM = $(ASMSRC:%.S=%.o) endif endif +# MSA optims +ifeq ($(SYS_ARCH),MIPS) +ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),) +SRCS += common/mips/mc-c.c common/mips/dct-c.c \ + common/mips/deblock-c.c common/mips/pixel-c.c \ + common/mips/predict-c.c common/mips/quant-c.c +endif +endif + ifneq ($(HAVE_GETOPT_LONG),1) SRCCLI += extras/getopt.c endif @@ -264,7 +276,7 @@ rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean - rm -f config.mak x264_config.h config.h config.log x264.pc x264.def + rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir)
View file
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.c
Added
@@ -0,0 +1,42 @@ +/***************************************************************************** + * asm-offsets.c: check asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "asm-offsets.h" + +#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \ +{ \ + int m_##m[2 * (offsetof(s, m) == o) - 1]; \ +} + +X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW); +X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE); +X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE); +X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING); +X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START); +X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P); +X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END); +X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED); +X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
View file
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm-offsets.h
Added
@@ -0,0 +1,39 @@ +/***************************************************************************** + * asm-offsets.h: asm offsets for aarch64 + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_AARCH64_ASM_OFFSETS_H +#define X264_AARCH64_ASM_OFFSETS_H + +#define CABAC_I_LOW 0x00 +#define CABAC_I_RANGE 0x04 +#define CABAC_I_QUEUE 0x08 +#define CABAC_I_BYTES_OUTSTANDING 0x0c +#define CABAC_P_START 0x10 +#define CABAC_P 0x18 +#define CABAC_P_END 0x20 +#define CABAC_F8_BITS_ENCODED 0x30 +#define CABAC_STATE 0x34 + +#endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/asm.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: AArch64 utility macros ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * David Conrad <lessen42@gmail.com>
View file
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/bitstream-a.S
Added
@@ -0,0 +1,82 @@ +/***************************************************************************** + * bitstream-a.S: aarch64 bitstream functions + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" + +function x264_nal_escape_neon, export=1 + movi v0.16b, #0xff + movi v4.16b, #4 + mov w3, #3 + subs x6, x1, x2 + cbz x6, 99f +0: + cmn x6, #15 + b.lt 16f + mov x1, x2 + b 100f +16: + ld1 {v1.16b}, [x1], #16 + ext v2.16b, v0.16b, v1.16b, #14 + ext v3.16b, v0.16b, v1.16b, #15 + cmhi v7.16b, v4.16b, v1.16b + cmeq v5.16b, v2.16b, #0 + cmeq v6.16b, v3.16b, #0 + and v5.16b, v5.16b, v7.16b + and v5.16b, v5.16b, v6.16b + shrn v7.8b, v5.8h, #4 + mov x7, v7.d[0] + cbz x7, 16f + mov x6, #-16 +100: + umov w5, v0.b[14] + umov w4, v0.b[15] + orr w5, w4, w5, lsl #8 +101: + ldrb w4, [x1, x6] + orr w9, w4, w5, lsl #16 + cmp w9, #3 + b.hi 102f + strb w3, [x0], #1 + orr w5, w3, w5, lsl #8 +102: + adds x6, x6, #1 + strb w4, [x0], #1 + orr w5, w4, w5, lsl #8 + b.lt 101b + subs x6, x1, x2 + lsr w9, w5, #8 + mov v0.b[14], w9 + mov v0.b[15], w5 + b.lt 0b + + ret +16: + subs x6, x1, x2 + st1 {v1.16b}, [x0], #16 + mov v0.16b, v1.16b + b.lt 0b +99: + ret +endfunc
View file
x264-snapshot-20150804-2245.tar.bz2/common/aarch64/cabac-a.S
Added
@@ -0,0 +1,122 @@ +/***************************************************************************** + * cabac-a.S: aarch64 cabac + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" +#include "asm-offsets.h" + +// w11 holds x264_cabac_t.i_low +// w12 holds x264_cabac_t.i_range + +function x264_cabac_encode_decision_asm, export=1 + movrel x8, X(x264_cabac_range_lps) + movrel x9, X(x264_cabac_transition) + add w10, w1, #CABAC_STATE + ldrb w3, [x0, x10] // i_state + ldr w12, [x0, #CABAC_I_RANGE] + and x4, x3, #~1 + asr w5, w12, #6 + add x8, x8, x4, lsl #1 + sub w5, w5, #4 + eor w6, w2, w3 // b ^ i_state + ldrb w4, [x8, x5] // i_range_lps + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, w4 + tbz w6, #0, 1f // (b ^ i_state) & 1 + add w11, w11, w12 + mov w12, w4 +1: + orr w4, w2, w3, lsl #1 + ldrb w9, [x9, x4] + strb w9, [x0, x10] // i_state + +cabac_encode_renorm: + clz w5, w12 + ldr w2, [x0, #CABAC_I_QUEUE] + sub w5, w5, #23 + lsl w12, w12, w5 + lsl w11, w11, w5 +2: + adds w2, w2, w5 + str w12, [x0, #CABAC_I_RANGE] + b.lt 0f +cabac_putbyte: + mov w13, #0x400 + add w12, w2, #10 + lsl w13, w13, w2 + asr w4, w11, w12 // out + sub w2, w2, #8 + sub w13, w13, #1 + subs w5, w4, #0xff + and w11, w11, w13 + ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + str w2, [x0, #CABAC_I_QUEUE] + b.ne 1f + + add w6, w6, #1 + str w11, [x0, #CABAC_I_LOW] + str w6, [x0, #CABAC_I_BYTES_OUTSTANDING] + ret + +1: + ldr x7, [x0, #CABAC_P] + asr w5, w4, #8 // carry + ldrb w8, [x7, #-1] + add w8, w8, w5 + sub w5, w5, #1 + strb w8, [x7, #-1] + cbz w6, 3f +2: + subs w6, w6, #1 + strb w5, [x7], #1 + b.gt 2b +3: + strb w4, [x7], #1 + str wzr, [x0, #CABAC_I_BYTES_OUTSTANDING] + str x7, [x0, #CABAC_P] +0: + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function x264_cabac_encode_bypass_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + ldr w2, [x0, #CABAC_I_QUEUE] + and w1, w1, w12 + add w11, w1, w11, lsl #1 + adds w2, w2, #1 + b.ge cabac_putbyte + str w11, [x0, #CABAC_I_LOW] + str w2, [x0, #CABAC_I_QUEUE] + ret +endfunc + +function 
x264_cabac_encode_terminal_asm, export=1 + ldr w12, [x0, #CABAC_I_RANGE] + ldr w11, [x0, #CABAC_I_LOW] + sub w12, w12, #2 + b cabac_encode_renorm +endfunc
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** - * dct-a.S: AArch6464 transform and zigzag + * dct-a.S: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,25 @@ .byte 26,27, 28,29, 22,23, 30,31 endconst +const scan4x4_field, align=4 +.byte 0,1, 2,3, 8,9, 4,5 +.byte 6,7, 10,11, 12,13, 14,15 +endconst + +const sub4x4_frame, align=4 +.byte 0, 1, 4, 8 +.byte 5, 2, 3, 6 +.byte 9, 12, 13, 10 +.byte 7, 11, 14, 15 +endconst + +const sub4x4_field, align=4 +.byte 0, 4, 1, 8 +.byte 12, 5, 9, 13 +.byte 2, 6, 10, 14 +.byte 3, 7, 11, 15 +endconst + // sum = a + (b>>shift) sub = (a>>shift) - b .macro SUMSUB_SHR shift sum sub a b t0 t1 sshr \t0, \b, #\shift @@ -602,56 +622,99 @@ ret endfunc +.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 + ld1 {\t0\().8b}, [x1], x3 + ld1 {\t1\().8b}, [x2], x4 + ld1 {\t2\().8b}, [x1], x3 + ld1 {\t3\().8b}, [x2], x4 + usubl \t0\().8h, \t0\().8b, \t1\().8b + ld1 {\t4\().8b}, [x1], x3 + ld1 {\t5\().8b}, [x2], x4 + usubl \t1\().8h, \t2\().8b, \t3\().8b + ld1 {\t6\().8b}, [x1], x3 + ld1 {\t7\().8b}, [x2], x4 + add \dst\().8h, \t0\().8h, \t1\().8h + usubl \t2\().8h, \t4\().8b, \t5\().8b + usubl \t3\().8h, \t6\().8b, \t7\().8b + add \dst\().8h, \dst\().8h, \t2\().8h + add \dst\().8h, \dst\().8h, \t3\().8h +.endm + function x264_sub8x8_dct_dc_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE - ld1 {v16.8b}, [x1], x3 - ld1 {v17.8b}, [x2], x4 - usubl v16.8h, v16.8b, v17.8b - ld1 {v18.8b}, [x1], x3 - ld1 {v19.8b}, [x2], x4 - usubl v17.8h, v18.8b, v19.8b - ld1 {v20.8b}, [x1], x3 - ld1 {v21.8b}, [x2], x4 - usubl 
v18.8h, v20.8b, v21.8b - ld1 {v22.8b}, [x1], x3 - add v0.8h, v16.8h, v17.8h - ld1 {v23.8b}, [x2], x4 - usubl v19.8h, v22.8b, v23.8b - ld1 {v24.8b}, [x1], x3 - add v0.8h, v0.8h, v18.8h - ld1 {v25.8b}, [x2], x4 - usubl v20.8h, v24.8b, v25.8b - ld1 {v26.8b}, [x1], x3 - add v0.8h, v0.8h, v19.8h - ld1 {v27.8b}, [x2], x4 - usubl v21.8h, v26.8b, v27.8b - ld1 {v28.8b}, [x1], x3 - ld1 {v29.8b}, [x2], x4 - usubl v22.8h, v28.8b, v29.8b - ld1 {v30.8b}, [x1], x3 - add v1.8h, v20.8h, v21.8h - ld1 {v31.8b}, [x2], x4 - usubl v23.8h, v30.8b, v31.8b - add v1.8h, v1.8h, v22.8h - add v1.8h, v1.8h, v23.8h + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v0.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v0.8h - transpose v2.2d, v3.2d, v0.2d, v1.2d + st1 {v0.4h}, [x0] + ret +endfunc + +function x264_sub8x16_dct_dc_neon, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 - add v0.8h, v2.8h, v3.8h - sub v1.8h, v2.8h, v3.8h + addp v4.8h, v0.8h, v2.8h + addp v5.8h, v1.8h, v3.8h + + transpose v2.4s, v3.4s, v4.4s, v5.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + transpose v2.4s, v3.4s, v0.4s, v1.4s + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h transpose v2.2d, v3.2d, v0.2d, v1.2d + SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h + + trn1 v2.2d, v0.2d, v1.2d + trn2 v3.2d, v1.2d, v0.2d addp v0.8h, v2.8h, v3.8h - addp v0.8h, v0.8h, v0.8h - st1 {v0.4h}, [x0] + st1 {v0.8h}, [x0] + ret +endfunc + +function x264_zigzag_interleave_8x8_cavlc_neon, export=1 + mov 
x3, #7 + movi v31.4s, #1 + ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 + ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 + umax v16.8h, v0.8h, v4.8h + umax v17.8h, v1.8h, v5.8h + umax v18.8h, v2.8h, v6.8h + umax v19.8h, v3.8h, v7.8h + st1 {v0.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v17.8h + umaxp v18.8h, v18.8h, v19.8h + st1 {v1.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v18.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + cmhi v16.4s, v16.4s, v31.4s + st1 {v3.8h}, [x0], #16 + and v16.16b, v16.16b, v31.16b + st1 {v7.8h}, [x0], #16 + st1 {v16.b}[0], [x2], #1 + st1 {v16.b}[4], [x2], x3 + st1 {v16.b}[8], [x2], #1 + st1 {v16.b}[12], [x2] ret endfunc @@ -664,3 +727,282 @@ st1 {v2.16b,v3.16b}, [x0] ret endfunc + +.macro zigzag_sub_4x4 f ac +function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1 + mov x9, #FENC_STRIDE + mov x4, #FDEC_STRIDE + movrel x5, sub4x4_\f + mov x6, x2 + ld1 {v0.s}[0], [x1], x9 + ld1 {v0.s}[1], [x1], x9 + ld1 {v0.s}[2], [x1], x9 + ld1 {v0.s}[3], [x1], x9 + ld1 {v16.16b}, [x5] + ld1 {v1.s}[0], [x2], x4 + ld1 {v1.s}[1], [x2], x4 + ld1 {v1.s}[2], [x2], x4 + ld1 {v1.s}[3], [x2], x4 + tbl v2.16b, {v0.16b}, v16.16b + tbl v3.16b, {v1.16b}, v16.16b + st1 {v0.s}[0], [x6], x4 + usubl v4.8h, v2.8b, v3.8b
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/dct.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** - * dct.h: AArch64 transform and zigzag + * dct.h: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,7 @@ void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] ); void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] ); void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 ); @@ -48,5 +50,18 @@ void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] ); void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] ); + +int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); +int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc ); + +int x264_zigzag_sub_8x8_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst ); +int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel 
*p_src, pixel *p_dst ); + +void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: aarch64 deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * Janne Grunau <janne-x264@jannau.net> @@ -180,6 +180,202 @@ ret endfunc +.macro h264_loop_filter_start_intra + orr w4, w2, w3 + cmp w4, #0 + b.ne 1f + ret +1: + dup v30.16b, w2 // alpha + dup v31.16b, w3 // beta +.endm + +.macro h264_loop_filter_luma_intra + uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0) + uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0) + uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0) + cmhi v19.16b, v30.16b, v16.16b // < alpha + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + movi v29.16b, #2 + ushr v30.16b, v30.16b, #2 // alpha >> 2 + add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2 + cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2 + + and v19.16b, v19.16b, v17.16b + and v19.16b, v19.16b, v18.16b + shrn v20.8b, v19.8h, #4 + mov x4, v20.d[0] + cbz x4, 9f + + ushll v20.8h, v6.8b, #1 + ushll v22.8h, v1.8b, #1 + ushll2 v21.8h, v6.16b, #1 + ushll2 v23.8h, v1.16b, #1 + uaddw v20.8h, v20.8h, v7.8b + uaddw v22.8h, v22.8h, v0.8b + uaddw2 v21.8h, v21.8h, v7.16b + uaddw2 v23.8h, v23.8h, v0.16b + uaddw v20.8h, v20.8h, v1.8b + uaddw v22.8h, v22.8h, v6.8b + uaddw2 v21.8h, v21.8h, v1.16b + uaddw2 v23.8h, v23.8h, v6.16b + + rshrn v24.8b, v20.8h, #2 // p0'_1 + rshrn v25.8b, v22.8h, #2 // q0'_1 + rshrn2 v24.16b, v21.8h, #2 // p0'_1 + rshrn2 v25.16b, v23.8h, #2 // q0'_1 + + uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0) + uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0) + cmhi v17.16b, v31.16b, v17.16b // < beta + cmhi v18.16b, v31.16b, v18.16b // < beta + + and v17.16b, v16.16b, v17.16b // if_2 && if_3 + and v18.16b, v16.16b, v18.16b // if_2 && if_4 + + not v30.16b, v17.16b + not v31.16b, v18.16b + + and 
v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3) + and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4) + + and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3 + and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4 + + //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4 + uaddl v26.8h, v5.8b, v7.8b + uaddl2 v27.8h, v5.16b, v7.16b + uaddw v26.8h, v26.8h, v0.8b + uaddw2 v27.8h, v27.8h, v0.16b + add v20.8h, v20.8h, v26.8h + add v21.8h, v21.8h, v27.8h + uaddw v20.8h, v20.8h, v0.8b + uaddw2 v21.8h, v21.8h, v0.16b + rshrn v20.8b, v20.8h, #3 // p0'_2 + rshrn2 v20.16b, v21.8h, #3 // p0'_2 + uaddw v26.8h, v26.8h, v6.8b + uaddw2 v27.8h, v27.8h, v6.16b + rshrn v21.8b, v26.8h, #2 // p1'_2 + rshrn2 v21.16b, v27.8h, #2 // p1'_2 + uaddl v28.8h, v4.8b, v5.8b + uaddl2 v29.8h, v4.16b, v5.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v19.8b, v28.8h, #3 // p2'_2 + rshrn2 v19.16b, v29.8h, #3 // p2'_2 + + //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3 + uaddl v26.8h, v2.8b, v0.8b + uaddl2 v27.8h, v2.16b, v0.16b + uaddw v26.8h, v26.8h, v7.8b + uaddw2 v27.8h, v27.8h, v7.16b + add v22.8h, v22.8h, v26.8h + add v23.8h, v23.8h, v27.8h + uaddw v22.8h, v22.8h, v7.8b + uaddw2 v23.8h, v23.8h, v7.16b + rshrn v22.8b, v22.8h, #3 // q0'_2 + rshrn2 v22.16b, v23.8h, #3 // q0'_2 + uaddw v26.8h, v26.8h, v1.8b + uaddw2 v27.8h, v27.8h, v1.16b + rshrn v23.8b, v26.8h, #2 // q1'_2 + rshrn2 v23.16b, v27.8h, #2 // q1'_2 + uaddl v28.8h, v2.8b, v3.8b + uaddl2 v29.8h, v2.16b, v3.16b + shl v28.8h, v28.8h, #1 + shl v29.8h, v29.8h, #1 + add v28.8h, v28.8h, v26.8h + add v29.8h, v29.8h, v27.8h + rshrn v26.8b, v28.8h, #3 // q2'_2 + rshrn2 v26.16b, v29.8h, #3 // q2'_2 + + bit v7.16b, v24.16b, v30.16b // p0'_1 + bit v0.16b, v25.16b, v31.16b // q0'_1 + bit v7.16b, v20.16b, v17.16b // p0'_2 + bit v6.16b, v21.16b, v17.16b // p1'_2 + bit v5.16b, v19.16b, v17.16b // p2'_2 + bit v0.16b, v22.16b, v18.16b // q0'_2 + bit v1.16b, v23.16b, v18.16b // q1'_2 
+ bit v2.16b, v26.16b, v18.16b // q2'_2 +.endm + +function x264_deblock_v_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + ld1 {v0.16b}, [x0], x1 // q0 + ld1 {v1.16b}, [x0], x1 // q1 + ld1 {v2.16b}, [x0], x1 // q2 + ld1 {v3.16b}, [x0], x1 // q3 + sub x0, x0, x1, lsl #3 + ld1 {v4.16b}, [x0], x1 // p3 + ld1 {v5.16b}, [x0], x1 // p2 + ld1 {v6.16b}, [x0], x1 // p1 + ld1 {v7.16b}, [x0] // p0 + + h264_loop_filter_luma_intra + + sub x0, x0, x1, lsl #1 + st1 {v5.16b}, [x0], x1 // p2 + st1 {v6.16b}, [x0], x1 // p1 + st1 {v7.16b}, [x0], x1 // p0 + st1 {v0.16b}, [x0], x1 // q0 + st1 {v1.16b}, [x0], x1 // q1 + st1 {v2.16b}, [x0] // q2 +9: + ret +endfunc + +function x264_deblock_h_luma_intra_neon, export=1 + h264_loop_filter_start_intra + + sub x0, x0, #4 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x0], x1 + ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x0], x1 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x0], x1 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + h264_loop_filter_luma_intra + + transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23 + + sub x0, x0, x1, lsl #4 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + st1 {v7.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -1253,6 +1253,34 @@ ret endfunc +function x264_plane_copy_neon, export=1 + add x8, x4, #15 + and x4, x8, #~15 + sub x1, x1, x4 + sub x3, x3, x4 +1: + mov w8, w4 +16: + tst w8, #16 + b.eq 32f + subs w8, w8, #16 + ldr q0, [x2], #16 + str q0, [x0], #16 + b.eq 0f +32: + subs w8, w8, #32 + ldp q0, q1, [x2], #32 + stp q0, q1, [x0], #32 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + + ret +endfunc + function x264_plane_copy_deinterleave_neon, export=1 add w9, w6, #15 and w9, w9, #0xfffffff0 @@ -1363,3 +1391,279 @@ ret endfunc + +.macro integral4h p1, p2 + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v4.8h, v2.8b, v3.8b + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, v5.8h +.endm + +function integral_init4h_neon, export=1 + sub x3, x0, x2 + ld1 {v6.8b,v7.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v5.8h}, [x3], #16 + integral4h v6, v7 + ld1 {v6.8b}, [x1], #8 + ld1 {v5.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral4h v7, v6 + ld1 {v7.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +.macro integral8h p1, p2, s + ext v1.8b, \p1\().8b, \p2\().8b, #1 + ext v2.8b, \p1\().8b, \p2\().8b, #2 + ext v3.8b, \p1\().8b, \p2\().8b, #3 + ext v4.8b, \p1\().8b, \p2\().8b, #4 + ext v5.8b, \p1\().8b, \p2\().8b, #5 + ext v6.8b, \p1\().8b, \p2\().8b, #6 + ext v7.8b, \p1\().8b, \p2\().8b, #7 + uaddl v0.8h, \p1\().8b, v1.8b + uaddl v2.8h, v2.8b, v3.8b + uaddl v4.8h, v4.8b, v5.8b + uaddl v6.8h, v6.8b, v7.8b + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + add 
v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, \s\().8h +.endm + +function integral_init8h_neon, export=1 + sub x3, x0, x2 + ld1 {v16.8b,v17.8b}, [x1], #16 +1: + subs x2, x2, #16 + ld1 {v18.8h}, [x3], #16 + integral8h v16, v17, v18 + ld1 {v16.8b}, [x1], #8 + ld1 {v18.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral8h v17, v16, v18 + ld1 {v17.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +function integral_init4v_neon, export=1 + mov x3, x0 + add x4, x0, x2, lsl #3 + add x8, x0, x2, lsl #4 + sub x2, x2, #8 + ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 + ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 +1: + subs x2, x2, #16 + ld1 {v24.8h,v25.8h}, [x4], #32 + ext v0.16b, v20.16b, v21.16b, #8 + ext v1.16b, v21.16b, v22.16b, #8 + ext v2.16b, v16.16b, v17.16b, #8 + ext v3.16b, v17.16b, v18.16b, #8 + sub v24.8h, v24.8h, v20.8h + sub v25.8h, v25.8h, v21.8h + add v0.8h, v0.8h, v20.8h + add v1.8h, v1.8h, v21.8h + add v2.8h, v2.8h, v16.8h + add v3.8h, v3.8h, v17.8h + st1 {v24.8h}, [x1], #16 + st1 {v25.8h}, [x1], #16 + mov v20.16b, v22.16b + mov v16.16b, v18.16b + sub v0.8h, v2.8h, v0.8h + sub v1.8h, v3.8h, v1.8h + ld1 {v21.8h,v22.8h}, [x3], #32 + ld1 {v17.8h,v18.8h}, [x8], #32 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function integral_init8v_neon, export=1 + add x2, x0, x1, lsl #4 + sub x1, x1, #8 + ands x3, x1, #16 - 1 + b.eq 1f + subs x1, x1, #8 + ld1 {v0.8h}, [x0] + ld1 {v2.8h}, [x2], #16 + sub v4.8h, v2.8h, v0.8h + st1 {v4.8h}, [x0], #16 + b.le 2f +1: + subs x1, x1, #16 + ld1 {v0.8h,v1.8h}, [x0] + ld1 {v2.8h,v3.8h}, [x2], #32 + sub v4.8h, v2.8h, v0.8h + sub v5.8h, v3.8h, v1.8h + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function x264_mbtree_propagate_cost_neon, export=1 + ld1r {v5.4s}, [x5] +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 + ld1 {v2.8h}, [x2], #16 + ld1 {v3.8h}, [x3], #16 + ld1 {v4.8h}, [x4], #16 + bic v3.8h, #0xc0, lsl #8 + umin v3.8h, v2.8h, v3.8h + umull v20.4s, v2.4h, v4.4h // 
propagate_intra + umull2 v21.4s, v2.8h, v4.8h // propagate_intra + usubl v22.4s, v2.4h, v3.4h // propagate_num + usubl2 v23.4s, v2.8h, v3.8h // propagate_num + uxtl v26.4s, v2.4h // propagate_denom + uxtl2 v27.4s, v2.8h // propagate_denom + uxtl v24.4s, v1.4h + uxtl2 v25.4s, v1.8h + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v26.4s, v26.4s + ucvtf v27.4s, v27.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + frecpe v28.4s, v26.4s + frecpe v29.4s, v27.4s + ucvtf v24.4s, v24.4s + ucvtf v25.4s, v25.4s + frecps v30.4s, v28.4s, v26.4s + frecps v31.4s, v29.4s, v27.4s
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * mc-c.c: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -48,6 +49,8 @@ void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_plane_copy_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); @@ -89,8 +92,14 @@ void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t ); +void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t ); +void integral_init8v_neon( uint16_t *, intptr_t ); void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); +void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int ); + #if !HIGH_BIT_DEPTH static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w ) { @@ -132,9 +141,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, 
uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -142,13 +148,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -168,13 +174,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -199,6 +205,89 @@ int height, int16_t *buf ); #endif // !HIGH_BIT_DEPTH +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) +#define CLIP_ADD2(s,x)\ +do\ +{\ + CLIP_ADD((s)[0], (x)[0]);\ + CLIP_ADD((s)[1], (x)[1]);\ +} while(0) + +void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int16_t *output, + int bipred_weight, int mb_y, + int len ); + +static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs, + int16_t (*mvs)[2], + int16_t *propagate_amount, + uint16_t *lowres_costs, + int bipred_weight, int mb_y, + int len, int list ) +{ + int16_t *current = h->scratch_buffer2; + + x264_mbtree_propagate_list_internal_neon( mvs, 
propagate_amount, + lowres_costs, current, + bipred_weight, mb_y, len ); + + unsigned stride = h->mb.i_mb_stride; + unsigned width = h->mb.i_mb_width; + unsigned height = h->mb.i_mb_height; + + for( unsigned i = 0; i < len; current += 32 ) + { + int end = X264_MIN( i+8, len ); + for( ; i < end; i++, current += 2 ) + { + if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) ) + continue; + + unsigned mbx = current[0]; + unsigned mby = current[1]; + unsigned idx0 = mbx + mby * stride; + unsigned idx2 = idx0 + stride; + + /* Shortcut for the simple/common case of zero MV */ + if( !M32( mvs[i] ) ) + { + CLIP_ADD( ref_costs[idx0], current[16] ); + continue; + } + + if( mbx < width-1 && mby < height-1 ) + { + CLIP_ADD2( ref_costs+idx0, current+16 ); + CLIP_ADD2( ref_costs+idx2, current+32 ); + } + else + { + /* Note: this takes advantage of unsigned representation to + * catch negative mbx/mby. */ + if( mby < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx0+0], current[16] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx0+1], current[17] ); + } + if( mby+1 < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx2+0], current[32] ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx2+1], current[33] ); + } + } + } + } +} + +#undef CLIP_ADD +#undef CLIP_ADD2 + void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) { #if !HIGH_BIT_DEPTH @@ -217,6 +306,7 @@ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + pf->plane_copy = x264_plane_copy_neon; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; @@ -245,5 +335,16 @@ pf->get_ref = get_ref_neon; pf->hpel_filter = x264_hpel_filter_neon; pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; + + pf->integral_init4h = integral_init4h_neon; + pf->integral_init8h = integral_init8h_neon; + pf->integral_init4v = 
integral_init4v_neon; + pf->integral_init8v = integral_init8v_neon; + + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon; + + pf->memcpy_aligned = x264_memcpy_aligned_neon; + pf->memzero_aligned = x264_memzero_aligned_neon; #endif // !HIGH_BIT_DEPTH }
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/mc.h
Changed
@@ -1,7 +1,9 @@ /***************************************************************************** * mc.h: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project + * + * Authors: Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Janne Grunau <janne-x264@jannau.net> @@ -114,6 +114,7 @@ SAD_FUNC 4, 4 SAD_FUNC 4, 8 +SAD_FUNC 4, 16 SAD_FUNC 8, 4 SAD_FUNC 8, 8 SAD_FUNC 8, 16 @@ -148,7 +149,7 @@ \first v17.8h, v2.8b, v0.8b ld1 {v3.8b}, [x3], x5 ld1 {v1.8b}, [x1], x5 - \first v18.8h, v3.8b, v0.8b + \first v18.8h, v3.8b, v0.8b uabal v16.8h, v1.8b, v5.8b ld1 {v2.8b}, [x2], x5 ld1 {v3.8b}, [x3], x5 @@ -248,6 +249,56 @@ SAD_X_FUNC 4, 16, 16 +function x264_pixel_vsad_neon, export=1 + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 + uabdl v6.8h, v0.8b, v1.8b + uabdl2 v7.8h, v0.16b, v1.16b + b.le 2f +1: + subs w2, w2, #2 + ld1 {v0.16b}, [x0], x1 + uabal v6.8h, v1.8b, v0.8b + uabal2 v7.8h, v1.16b, v0.16b + ld1 {v1.16b}, [x0], x1 + b.lt 2f + uabal v6.8h, v0.8b, v1.8b + uabal2 v7.8h, v0.16b, v1.16b + b.gt 1b +2: + add v5.8h, v6.8h, v7.8h + uaddlv s0, v5.8h + fmov w0, s0 + ret +endfunc + +function x264_pixel_asd8_neon, export=1 + sub w4, w4, #2 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + usubl v16.8h, v0.8b, v1.8b +1: + subs w4, w4, #2 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + add v16.8h, v16.8h, v17.8h + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + add v16.8h, v16.8h, v18.8h + b.gt 1b + usubl v17.8h, v2.8b, v3.8b + add v16.8h, v16.8h, v17.8h + saddlv s0, v16.8h + abs v0.2s, v0.2s + fmov w0, s0 + ret +endfunc + .macro SSD_START_4 ld1 {v16.s}[0], [x0], x1 ld1 {v17.s}[0], [x2], x3 @@ -343,12 +394,84 @@ SSD_FUNC 4, 4 SSD_FUNC 4, 8 +SSD_FUNC 4, 16 SSD_FUNC 8, 4 SSD_FUNC 8, 8 SSD_FUNC 8, 16 SSD_FUNC 16, 8 SSD_FUNC 16, 16 + +function 
x264_pixel_ssd_nv12_core_neon, export=1 + sxtw x8, w4 + add x8, x8, #8 + and x8, x8, #~15 + movi v6.2d, #0 + movi v7.2d, #0 + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +1: + subs w8, w4, #16 + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smull v20.4s, v16.4h, v16.4h + smull v21.4s, v17.4h, v17.4h + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + + b.lt 4f + b.eq 3f +2: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h + + subs w8, w8, #16 + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smlal v20.4s, v16.4h, v16.4h + smlal v21.4s, v17.4h, v17.4h + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + b.lt 4f + + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + b.gt 2b +3: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h +4: + subs w5, w5, #1 + uaddw v6.2d, v6.2d, v20.2s + uaddw v7.2d, v7.2d, v21.2s + add x0, x0, x1 + add x2, x2, x3 + uaddw2 v6.2d, v6.2d, v20.4s + uaddw2 v7.2d, v7.2d, v21.4s + b.gt 1b + + addp v6.2d, v6.2d, v7.2d + st1 {v6.d}[0], [x6] + st1 {v6.d}[1], [x7] + + ret +endfunc + .macro pixel_var_8 h function x264_pixel_var_8x\h\()_neon, export=1 ld1 {v16.8b}, [x0], x1 @@ -800,10 +923,65 @@ b x264_satd_8x4v_8x8h_neon endfunc +function x264_pixel_satd_4x16_neon, export=1 + mov x4, x30 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 
{v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + usubl v19.8h, v6.8b, v7.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/pixel.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * pixel.h: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +33,7 @@ ret x264_pixel_##name##_8x16_##suffix args;\ ret x264_pixel_##name##_8x8_##suffix args;\ ret x264_pixel_##name##_8x4_##suffix args;\ + ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ ret x264_pixel_##name##_4x4_##suffix args;\ @@ -47,8 +49,14 @@ DECL_X1( satd, neon ) DECL_X1( ssd, neon ) + +void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); + +int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); + int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); +uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); @@ -66,4 +74,6 @@ int sums[2][4] ); float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); +int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + #endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -436,14 +436,25 @@ endfunc function x264_predict_8x8c_dc_left_neon, export=1 - sub x2, x0, #1 + ldrb w2, [x0, #0 * FDEC_STRIDE - 1] + ldrb w3, [x0, #1 * FDEC_STRIDE - 1] + ldrb w4, [x0, #2 * FDEC_STRIDE - 1] + ldrb w5, [x0, #3 * FDEC_STRIDE - 1] mov x1, #FDEC_STRIDE - ldcol.8 v0, x2, x1 - uaddlp v0.4h, v0.8b - addp v0.4h, v0.4h, v0.4h + add w2, w2, w3 + add w3, w4, w5 + ldrb w6, [x0, #4 * FDEC_STRIDE - 1] + ldrb w7, [x0, #5 * FDEC_STRIDE - 1] + ldrb w8, [x0, #6 * FDEC_STRIDE - 1] + ldrb w9, [x0, #7 * FDEC_STRIDE - 1] + add w6, w6, w7 + add w7, w8, w9 + add w2, w2, w3 + add w6, w6, w7 + dup v0.8h, w2 + dup v1.8h, w6 rshrn v0.8b, v0.8h, #2 - dup v1.8b, v0.b[1] - dup v0.8b, v0.b[0] + rshrn v1.8b, v1.8h, #2 b pred8x8c_dc_end endfunc @@ -546,6 +557,223 @@ endfunc +.macro loadsum4 wd, t1, t2, t3, x, idx + ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1] + ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1] + ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1] + ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1] + add \wd, \wd, \t1 + add \t1, \t2, \t3 + add \wd, \wd, \t1 +.endm + +function x264_predict_8x16c_h_neon, export=1 + sub x2, x0, #1 + add x3, x0, #FDEC_STRIDE - 1 + mov x7, #2 * FDEC_STRIDE + add x1, x0, #FDEC_STRIDE +.rept 4 + ld1r {v0.8b}, [x2], x7 + ld1r {v1.8b}, [x3], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x3], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x1], x7 + st1 {v2.8b}, [x0], x7 + st1 {v3.8b}, [x1], x7 +.endr + ret +endfunc + +function x264_predict_8x16c_v_neon, export=1 + sub x1, x0, #FDEC_STRIDE + mov x2, #2 * FDEC_STRIDE + ld1 {v0.8b}, [x1], x2 +.rept 8 + st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, 
[x1], x2 +.endr + ret +endfunc + +function x264_predict_8x16c_p_neon, export=1 + movrel x4, p16weight + ld1 {v17.8h}, [x4] + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + add x2, x3, #4 + sub x3, x3, #1 + + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 + ldcol.8 v1, x3, x1 + add x3, x3, x1 + ldcol.8 v3, x3, x1 + ext v4.8b, v2.8b, v2.8b, #3 + ext v5.8b, v3.8b, v3.8b, #7 + rev32 v0.8b, v0.8b + rev64 v1.8b, v1.8b + + uaddl v4.8h, v5.8b, v4.8b // a * 1/16 + + usubl v2.8h, v2.8b, v0.8b + mul v2.8h, v2.8h, v17.8h + saddlp v2.4s, v2.8h + addp v2.4s, v2.4s, v2.4s // H + + usubl v3.8h, v3.8b, v1.8b + mul v3.8h, v3.8h, v17.8h + saddlp v3.4s, v3.8h + addp v3.4s, v3.4s, v3.4s + addp v3.4s, v3.4s, v3.4s // V + + ext v17.16b, v17.16b, v17.16b, #14 + + shl v4.4h, v4.4h, #4 // a + shl v6.2s, v2.2s, #4 // 16 * H + shl v7.2s, v3.2s, #2 // 4 * V + add v2.2s, v2.2s, v6.2s // 17 * H + add v3.2s, v3.2s, v7.2s // 5 * V + rshrn v2.4h, v2.4s, #5 // b + rshrn v3.4h, v3.4s, #6 // c + + mov v17.h[0], wzr + + sub v4.4h, v4.4h, v2.4h // a - b + shl v6.4h, v2.4h, #1 // 2 * b + add v4.4h, v4.4h, v3.4h // a - b + c + shl v7.4h, v3.4h, #3 // 8 * c + sub v4.4h, v4.4h, v6.4h // a - 3b + c + sub v4.4h, v4.4h, v7.4h // a - 3b - 7c + + mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b + dup v1.8h, v4.h[0] // i00 + dup v2.8h, v3.h[0] // c + add v1.8h, v1.8h, v0.8h // pix + {0..7}*b + mov x3, #16 +1: + subs x3, x3, #2 + sqrshrun v4.8b, v1.8h, #5 + add v1.8h, v1.8h, v2.8h + sqrshrun v5.8b, v1.8h, #5 + st1 {v4.8b}, [x0], x1 + add v1.8h, v1.8h, v2.8h + st1 {v5.8b}, [x0], x1 + b.ne 1b + ret +endfunc + +function x264_predict_8x16c_dc_neon, export=1 + sub x3, x0, #FDEC_STRIDE + mov x1, #FDEC_STRIDE + ld1 {v6.8b}, [x3] + loadsum4 w2, w3, w4, w5, x0, 0 + uaddlp v6.4h, v6.8b + dup v22.8h, w2 // s2 + loadsum4 w6, w7, w8, w9, x0, 4 + addp v6.4h, v6.4h, v6.4h // s0, s1 + dup v23.8h, w6 // s3 + loadsum4 w2, w3, w4, w5, x0, 8 + dup v20.8h, v6.h[0] // s0 + dup v24.8h, w2 // s4 + loadsum4 w6, w7, w8, w9, x0, 12 + dup 
v21.8h, v6.h[1] // s1 + dup v25.8h, w6 // s5 + + ext v16.16b, v20.16b, v21.16b, #8 + ext v17.16b, v22.16b, v21.16b, #8 + ext v1.16b, v23.16b, v21.16b, #8 + ext v2.16b, v24.16b, v21.16b, #8 + ext v3.16b, v25.16b, v21.16b, #8 + + add v0.8h, v16.8h, v17.8h + add v1.8h, v1.8h, v23.8h + add v2.8h, v2.8h, v24.8h + add v3.8h, v3.8h, v25.8h + + rshrn v0.8b, v0.8h, #3 + rshrn v1.8b, v1.8h, #3 + rshrn v2.8b, v2.8h, #3 + rshrn v3.8b, v3.8h, #3 +.irp idx, 0, 1, 2, 3 +.rept 4 + st1 {v\idx\().8b}, [x0], x1 +.endr +.endr + ret +endfunc + +function x264_predict_8x16c_dc_left_neon, export=1 + mov x1, #FDEC_STRIDE + ldrb w2, [x0, # 0 * FDEC_STRIDE - 1] + ldrb w3, [x0, # 1 * FDEC_STRIDE - 1] + ldrb w4, [x0, # 2 * FDEC_STRIDE - 1] + ldrb w5, [x0, # 3 * FDEC_STRIDE - 1] + add w2, w2, w3 + + ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict-c.c
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.c: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +36,10 @@ void x264_predict_8x8c_dc_left_neon( uint8_t *src ); void x264_predict_8x8c_p_neon( uint8_t *src ); +void x264_predict_8x16c_dc_left_neon( uint8_t *src ); +void x264_predict_8x16c_dc_top_neon( uint8_t *src ); +void x264_predict_8x16c_p_neon( uint8_t *src ); + void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); @@ -80,6 +85,22 @@ #endif // !HIGH_BIT_DEPTH } + +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ) +{ + if (!(cpu&X264_CPU_NEON)) + return; + +#if !HIGH_BIT_DEPTH + pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_neon; + pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_neon; + pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon; + pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_neon; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_neon; + pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_neon; +#endif // !HIGH_BIT_DEPTH +} + void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) { if (!(cpu&X264_CPU_NEON))
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/predict.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * predict.h: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,6 +41,9 @@ void x264_predict_8x8c_dc_neon( uint8_t *src ); void x264_predict_8x8c_h_neon( uint8_t *src ); void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x16c_v_neon( uint8_t *src ); +void x264_predict_8x16c_h_neon( uint8_t *src ); +void x264_predict_8x16c_dc_neon( uint8_t *src ); void x264_predict_16x16_v_neon( uint8_t *src ); void x264_predict_16x16_h_neon( uint8_t *src ); void x264_predict_16x16_dc_neon( uint8_t *src ); @@ -47,6 +51,7 @@ void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] ); +void x264_predict_8x16c_init_aarch64( int cpu, x264_predict_t pf[7] ); void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] ); #endif /* X264_AARCH64_PREDICT_H */
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant-a.S
Changed
@@ -1,9 +1,10 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -300,6 +301,118 @@ ret endfunc +.macro decimate_score_1x size +function x264_decimate_score\size\()_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0] + movrel x5, X(x264_decimate_table4) + movi v3.16b, #0x01 + sqxtn v0.8b, v0.8h + sqxtn2 v0.16b, v1.8h + abs v2.16b, v0.16b + cmeq v1.16b, v0.16b, #0 + cmhi v2.16b, v2.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v2.8b, v2.8h, #4 + fmov x2, d2 + fmov x1, d1 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f +.ifc \size, 15 + lsr x1, x1, #1 +.endif + rbit x1, x1 +1: + clz x3, x1 + lsr x6, x3, #2 + lsl x1, x1, x3 + ldrb w7, [x5, x6] + cbz x1, 2f + lsl x1, x1, #4 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc +.endm + +decimate_score_1x 15 +decimate_score_1x 16 + +const mask64, align=6 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 +endconst + +function x264_decimate_score64_neon, export=1 + ld1 {v0.8h,v1.8h}, [x0], #32 + ld1 {v2.8h,v3.8h}, [x0], #32 + ld1 {v4.8h,v5.8h}, [x0], #32 + ld1 {v6.8h,v7.8h}, [x0] + movrel x6, mask64 + movi v31.16b, #0x01 + sqxtn v16.8b, v1.8h + sqxtn2 v16.16b, v0.8h + sqxtn v17.8b, v3.8h + sqxtn2 v17.16b, v2.8h + sqxtn v18.8b, v5.8h + sqxtn2 v18.16b, v4.8h + sqxtn v19.8b, v7.8h + sqxtn2 v19.16b, v6.8h + abs v4.16b, v16.16b + abs v5.16b, v17.16b + abs v6.16b, v18.16b + abs v7.16b, v19.16b + ld1 {v30.16b}, [x6] + cmeq v0.16b, v16.16b, #0 + cmeq v1.16b, v17.16b, #0 + cmeq 
v2.16b, v18.16b, #0 + cmeq v3.16b, v19.16b, #0 + umax v4.16b, v4.16b, v5.16b + umax v6.16b, v6.16b, v7.16b + and v0.16b, v0.16b, v30.16b + and v1.16b, v1.16b, v30.16b + and v2.16b, v2.16b, v30.16b + and v3.16b, v3.16b, v30.16b + umax v4.16b, v4.16b, v6.16b + addp v0.16b, v1.16b, v0.16b + addp v2.16b, v3.16b, v2.16b + cmhi v4.16b, v4.16b, v31.16b + addp v0.16b, v2.16b, v0.16b + shrn v4.8b, v4.8h, #4 + addp v0.16b, v0.16b, v0.16b + fmov x2, d4 + fmov x1, d0 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f + movrel x5, X(x264_decimate_table8) +1: + clz x3, x1 + lsl x1, x1, x3 + ldrb w7, [x5, x3] + cbz x1, 2f + lsl x1, x1, #1 + add w0, w0, w7 + cbnz x1, 1b + ret +2: + add w0, w0, w7 +0: + ret +9: + mov w0, #9 + ret +endfunc + // int coeff_last( int16_t *l ) function x264_coeff_last4_aarch64, export=1 ldr x2, [x0] @@ -384,3 +497,105 @@ sub w0, w3, w2 ret endfunc + +.macro coeff_level_run_start size + add x6, x1, #23 // runlevel->mask + mov w7, #0 + mov w8, #0 + mov w9, #1 + and x6, x6, #~15 + mov w4, #\size - 1 +.endm + +.macro coeff_level_run shift + clz x3, x2 + subs w4, w4, w3, lsr #\shift + str w4, [x1], #4 +1: + ldrh w5, [x0, x4, lsl #1] + strh w5, [x6], #2 + add w7, w7, #1 + lsl w10, w9, w4 + orr w8, w8, w10 + b.le 2f + add w3, w3, #1 << \shift + sub w4, w4, #1 + and x3, x3, #~((1 << \shift) - 1) + lsl x2, x2, x3 + clz x3, x2 + subs w4, w4, w3, lsr #\shift + b.ge 1b +2: + str w8, [x1] + mov w0, w7 +.endm + +function x264_coeff_level_run4_aarch64, export=1 + ldr x2, [x0] + + coeff_level_run_start 4 + + coeff_level_run 4 + + ret +endfunc + +.macro X264_COEFF_LEVEL_RUN size +function x264_coeff_level_run\size\()_neon, export=1 +.if \size == 15 + sub x0, x0, #2 +.endif +.if \size < 15 + .equ shiftw, 3 + ld1 {v0.8h}, [x0] + uqxtn v0.8b, v0.8h + cmtst v0.8b, v0.8b, v0.8b +.else + .equ shiftw, 2 + ld1 {v0.8h,v1.8h}, [x0] + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 +.endif + fmov x2, d0 +.if \size == 15 + add x0, 
x0, #2 +.endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/aarch64/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/aarch64/quant.h
Changed
@@ -1,9 +1,10 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> + * Janne Grunau <janne-x264@jannau.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,10 +39,21 @@ void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +int x264_decimate_score15_neon( int16_t * ); +int x264_decimate_score16_neon( int16_t * ); +int x264_decimate_score64_neon( int16_t * ); + int x264_coeff_last4_aarch64( int16_t * ); int x264_coeff_last8_aarch64( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * ); +int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * ); + +void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); + #endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/asm.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/asm.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: arm utility macros ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> * David Conrad <lessen42@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/cpu-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/cpu-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu-a.S: arm cpu detection ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * dct-a.S: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/deblock-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/deblock-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: arm deblocking ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mans Rullgard <mans@mansr.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com> @@ -1566,6 +1566,30 @@ pop {r4-r7, pc} endfunc +function x264_plane_copy_swap_neon + push {r4-r5, lr} + ldrd r4, r5, [sp, #12] + add lr, r4, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr, lsl #1 +1: + vld1.8 {q0, q1}, [r2]! + subs lr, lr, #16 + vrev16.8 q0, q0 + vrev16.8 q1, q1 + vst1.8 {q0, q1}, [r0]! + bgt 1b + + subs r5, r5, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r4 + bgt 1b + + pop {r4-r5, pc} +endfunc + function x264_store_interleave_chroma_neon push {lr} ldr lr, [sp, #4]
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * @@ -57,6 +57,8 @@ void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); +void x264_plane_copy_swap_neon( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); @@ -136,9 +138,6 @@ x264_mc_copy_w16_neon, }; -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, uint8_t *src[4], intptr_t i_src_stride, int mvx, int mvy, @@ -146,13 +145,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -172,13 +171,13 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if ( (mvy&3) == 3 ) // explict if() to 
force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); x264_pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -243,6 +242,7 @@ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + pf->plane_copy_swap = x264_plane_copy_swap_neon; pf->store_interleave_chroma = x264_store_interleave_chroma_neon; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-a.S
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> * Mans Rullgard <mans@mansr.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant-a.S -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant-a.S
Changed
@@ -1,7 +1,7 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/arm/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/arm/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: David Conrad <lessen42@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.c -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.c: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com> @@ -54,6 +54,8 @@ void x264_cabac_block_residual_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); + /**************************************************************************** * x264_nal_encode: ****************************************************************************/ @@ -142,4 +144,8 @@ } #endif #endif +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + pf->nal_escape = x264_nal_escape_neon; +#endif }
View file
x264-snapshot-20141218-2245.tar.bz2/common/bitstream.h -> x264-snapshot-20150804-2245.tar.bz2/common/bitstream.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/cabac.h -> x264-snapshot-20150804-2245.tar.bz2/common/cabac.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.h: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -72,6 +72,10 @@ #define x264_cabac_encode_decision x264_cabac_encode_decision_asm #define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm #define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm +#elif defined(ARCH_AARCH64) +#define x264_cabac_encode_decision x264_cabac_encode_decision_asm +#define x264_cabac_encode_bypass x264_cabac_encode_bypass_asm +#define x264_cabac_encode_terminal x264_cabac_encode_terminal_asm #else #define x264_cabac_encode_decision x264_cabac_encode_decision_c #define x264_cabac_encode_bypass x264_cabac_encode_bypass_c
View file
x264-snapshot-20141218-2245.tar.bz2/common/common.c -> x264-snapshot-20150804-2245.tar.bz2/common/common.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.c: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -579,6 +579,7 @@ { char *name_buf = NULL; int b_error = 0; + int errortype = X264_PARAM_BAD_VALUE; int name_was_bool; int value_was_null = !value; int i; @@ -595,6 +596,8 @@ { char *c; name_buf = strdup(name); + if( !name_buf ) + return X264_PARAM_BAD_NAME; while( (c = strchr( name_buf, '_' )) ) *c = '-'; name = name_buf; @@ -617,20 +620,23 @@ !strcasecmp(value, "auto") || atobool(value) ? x264_cpu_detect() : 0; if( b_error ) { - char *buf = strdup(value); - char *tok, UNUSED *saveptr=NULL, *init; - b_error = 0; - p->cpu = 0; - for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + char *buf = strdup( value ); + if( buf ) { - for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); - p->cpu |= x264_cpu_names[i].flags; - if( !x264_cpu_names[i].flags ) - b_error = 1; + char *tok, UNUSED *saveptr=NULL, *init; + b_error = 0; + p->cpu = 0; + for( init=buf; (tok=strtok_r(init, ",", &saveptr)); init=NULL ) + { + for( i=0; x264_cpu_names[i].flags && strcasecmp(tok, x264_cpu_names[i].name); i++ ); + p->cpu |= x264_cpu_names[i].flags; + if( !x264_cpu_names[i].flags ) + b_error = 1; + } + free( buf ); + if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) + p->cpu |= X264_CPU_SSE2_IS_FAST; } - free( buf ); - if( (p->cpu&X264_CPU_SSSE3) && !(p->cpu&X264_CPU_SSE2_IS_SLOW) ) - p->cpu |= X264_CPU_SSE2_IS_FAST; } } OPT("threads") @@ -1049,7 +1055,10 @@ OPT("opencl-device") p->i_opencl_device = atoi( value ); else - return X264_PARAM_BAD_NAME; + { + b_error = 1; + errortype = X264_PARAM_BAD_NAME; + } #undef OPT #undef OPT2 #undef atobool @@ -1060,7 +1069,7 
@@ free( name_buf ); b_error |= value_was_null && !name_was_bool; - return b_error ? X264_PARAM_BAD_VALUE : 0; + return b_error ? errortype : 0; } /**************************************************************************** @@ -1133,6 +1142,7 @@ [X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } }, [X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, + [X264_CSP_NV21] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, }, [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, @@ -1265,29 +1275,36 @@ char *x264_slurp_file( const char *filename ) { int b_error = 0; - size_t i_size; + int64_t i_size; char *buf; FILE *fh = x264_fopen( filename, "rb" ); if( !fh ) return NULL; + b_error |= fseek( fh, 0, SEEK_END ) < 0; b_error |= ( i_size = ftell( fh ) ) <= 0; + if( WORD_SIZE == 4 ) + b_error |= i_size > INT32_MAX; b_error |= fseek( fh, 0, SEEK_SET ) < 0; if( b_error ) goto error; + buf = x264_malloc( i_size+2 ); if( !buf ) goto error; + b_error |= fread( buf, 1, i_size, fh ) != i_size; - if( buf[i_size-1] != '\n' ) - buf[i_size++] = '\n'; - buf[i_size] = 0; fclose( fh ); if( b_error ) { x264_free( buf ); return NULL; } + + if( buf[i_size-1] != '\n' ) + buf[i_size++] = '\n'; + buf[i_size] = '\0'; + return buf; error: fclose( fh );
View file
x264-snapshot-20141218-2245.tar.bz2/common/common.h -> x264-snapshot-20150804-2245.tar.bz2/common/common.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * common.h: misc common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/cpu.c -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.c: cpu detection ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -67,8 +67,8 @@ {"AVX", AVX}, {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, - {"AVX2", AVX|X264_CPU_AVX2}, {"FMA3", AVX|X264_CPU_FMA3}, + {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, #undef AVX #undef SSE2 #undef MMX2 @@ -92,6 +92,8 @@ #elif ARCH_AARCH64 {"ARMv8", X264_CPU_ARMV8}, {"NEON", X264_CPU_NEON}, +#elif ARCH_MIPS + {"MSA", X264_CPU_MSA}, #endif {"", 0}, }; @@ -419,6 +421,17 @@ return X264_CPU_ARMV8 | X264_CPU_NEON; } +#elif ARCH_MIPS + +uint32_t x264_cpu_detect( void ) +{ + uint32_t flags = 0; +#if HAVE_MSA + flags |= X264_CPU_MSA; +#endif + return flags; +} + #else uint32_t x264_cpu_detect( void )
View file
x264-snapshot-20141218-2245.tar.bz2/common/cpu.h -> x264-snapshot-20150804-2245.tar.bz2/common/cpu.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cpu.h: cpu detection ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -45,7 +45,6 @@ #define x264_emms() #endif #define x264_sfence x264_cpu_sfence -void x264_safe_intel_cpu_indicator_init( void ); /* kludge: * gcc can't give variables any greater alignment than the stack frame has.
View file
x264-snapshot-20141218-2245.tar.bz2/common/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 # include "aarch64/dct.h" #endif +#if ARCH_MIPS +# include "mips/dct.h" +#endif /* the inverse of the scaling factors introduced by 8x8 fdct */ /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */ @@ -747,8 +750,32 @@ dctf->add8x8_idct8 = x264_add8x8_idct8_neon; dctf->add16x16_idct8= x264_add16x16_idct8_neon; +#if ARCH_AARCH64 + dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon; +#endif + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_msa; + dctf->sub8x8_dct = x264_sub8x8_dct_msa; + dctf->sub16x16_dct = x264_sub16x16_dct_msa; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa; + dctf->dct4x4dc = x264_dct4x4dc_msa; + dctf->idct4x4dc = x264_idct4x4dc_msa; + dctf->add4x4_idct = x264_add4x4_idct_msa; + dctf->add8x8_idct = x264_add8x8_idct_msa; + dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa; + dctf->add16x16_idct = x264_add16x16_idct_msa; + dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa; + dctf->add8x8_idct8 = x264_add8x8_idct8_msa; + dctf->add16x16_idct8 = x264_add16x16_idct8_msa; } #endif + #endif // HIGH_BIT_DEPTH } @@ -1004,7 +1031,20 @@ #endif #if HAVE_ARMV6 || ARCH_AARCH64 if( cpu&X264_CPU_NEON ) - pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon; +#if ARCH_AARCH64 + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon; + pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon; + 
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon; + pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon; + pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon; + pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon; + pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon; +#endif // ARCH_AARCH64 + } #endif // HAVE_ARMV6 || ARCH_AARCH64 #endif // HIGH_BIT_DEPTH @@ -1047,4 +1087,21 @@ } #endif // HIGH_BIT_DEPTH #endif +#if !HIGH_BIT_DEPTH +#if ARCH_AARCH64 + if( cpu&X264_CPU_NEON ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; + } +#endif // ARCH_AARCH64 +#endif // !HIGH_BIT_DEPTH +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; + } +#endif +#endif }
View file
x264-snapshot-20141218-2245.tar.bz2/common/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: transform and zigzag ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/deblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: deblocking ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -612,8 +612,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( intra_cur ) { - memset( &bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( &bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; } else h->loopf.deblock_strength( h->mb.cache.non_zero_count, h->mb.cache.ref, h->mb.cache.mv, @@ -737,6 +739,32 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, int bframe ); +#if ARCH_AARCH64 +void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#endif +#endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA +void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_luma_msa( uint8_t 
*pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta ); +void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit, + int bframe ); +#endif #endif void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -835,18 +863,43 @@ { pf->deblock_luma[1] = x264_deblock_v_luma_altivec; pf->deblock_luma[0] = x264_deblock_h_luma_altivec; - } + } #endif // HAVE_ALTIVEC #if HAVE_ARMV6 || ARCH_AARCH64 - if( cpu&X264_CPU_NEON ) - { + if( cpu&X264_CPU_NEON ) + { pf->deblock_luma[1] = x264_deblock_v_luma_neon; pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; +#if ARCH_AARCH64 + pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon; + pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon; + pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon; + pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; +#endif pf->deblock_strength = x264_deblock_strength_neon; - } + } +#endif + 
+#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_msa; + pf->deblock_luma[0] = x264_deblock_h_luma_msa; + pf->deblock_chroma[1] = x264_deblock_v_chroma_msa; + pf->deblock_h_chroma_420 = x264_deblock_h_chroma_msa; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_msa; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_msa; + pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_msa; + pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_msa; + pf->deblock_strength = x264_deblock_strength_msa; + } #endif #endif // !HIGH_BIT_DEPTH
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.c -> x264-snapshot-20150804-2245.tar.bz2/common/frame.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.c: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -47,6 +47,7 @@ switch( external_csp & X264_CSP_MASK ) { case X264_CSP_NV12: + case X264_CSP_NV21: case X264_CSP_I420: case X264_CSP_YV12: return X264_CSP_NV12; @@ -77,7 +78,7 @@ #if ARCH_X86 || ARCH_X86_64 if( h->param.cpu&X264_CPU_CACHELINE_64 ) align = 64; - else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX2 ) + else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; #endif #if ARCH_PPC @@ -387,7 +388,15 @@ return -1; } - dst->i_type = src->i_type; + if( src->i_type < X264_TYPE_AUTO || src->i_type > X264_TYPE_KEYFRAME ) + { + x264_log( h, X264_LOG_WARNING, "forced frame type (%d) at %d is unknown\n", src->i_type, h->frames.i_input ); + dst->i_forced_type = X264_TYPE_AUTO; + } + else + dst->i_forced_type = src->i_type; + + dst->i_type = dst->i_forced_type; dst->i_qpplus1 = src->i_qpplus1; dst->i_pts = dst->i_reordered_pts = src->i_pts; dst->param = src->param; @@ -435,6 +444,12 @@ h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift ); } + else if( i_csp == X264_CSP_NV21 ) + { + get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift ); + h->mc.plane_copy_swap( dst->plane[1], dst->i_stride[1], (pixel*)pix[1], + stride[1]/sizeof(pixel), h->param.i_width>>1, h->param.i_height>>v_shift ); + } else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 ) { int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
View file
x264-snapshot-20141218-2245.tar.bz2/common/frame.h -> x264-snapshot-20150804-2245.tar.bz2/common/frame.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * frame.h: frame handling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -39,6 +39,7 @@ int i_poc; int i_delta_poc[2]; int i_type; + int i_forced_type; int i_qpplus1; int64_t i_pts; int64_t i_dts;
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock common functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -1158,7 +1158,7 @@ { // Looking at the bottom field so always take the bottom macroblock of the pair. h->mb.cache.topright_ref[l][0] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; - h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[0]]; + h->mb.cache.topright_ref[l][1] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[1]]; h->mb.cache.topright_ref[l][2] = ref[h->mb.left_b8[0] + 1 + s8x8*2 + s8x8*left_index_table->ref[2]]; CP32( h->mb.cache.topright_mv[l][0], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[0]] ); CP32( h->mb.cache.topright_mv[l][1], mv[h->mb.left_b4[0] + 3 + s4x4*4 + s4x4*left_index_table->mv[1]] ); @@ -1436,8 +1436,10 @@ uint8_t (*bs)[8][4] = h->mb.cache.deblock_strength; if( IS_INTRA( h->mb.i_type ) ) { - memset( bs[0][1], 3, 3*4*sizeof(uint8_t) ); - memset( bs[1][1], 3, 3*4*sizeof(uint8_t) ); + M32( bs[0][1] ) = 0x03030303; + M64( bs[0][2] ) = 0x0303030303030303ULL; + M32( bs[1][1] ) = 0x03030303; + M64( bs[1][2] ) = 0x0303030303030303ULL; return; } @@ -1450,7 +1452,9 @@ M32( bs[0][0] ) = 0x02020202; M32( bs[0][2] ) = 0x02020202; M32( bs[0][4] ) = 0x02020202; - memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][0] ) = 0x0202020202020202ULL; /* [1][1] and [1][3] has to be set for 4:2:2 */ + M64( bs[1][2] ) = 0x0202020202020202ULL; + M32( bs[1][4] ) = 0x02020202; return; } }
View file
x264-snapshot-20141218-2245.tar.bz2/common/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/common/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock common functions ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -38,6 +38,9 @@ #if ARCH_AARCH64 #include "aarch64/mc.h" #endif +#if ARCH_MIPS +#include "mips/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -189,8 +192,8 @@ } } -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; +const uint8_t x264_hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; +const uint8_t x264_hpel_ref1[16] = {0,0,1,0,2,2,3,2,2,2,3,2,2,2,3,2}; static void mc_luma( pixel *dst, intptr_t i_dst_stride, pixel *src[4], intptr_t i_src_stride, @@ -199,11 +202,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg( dst, i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -222,11 +225,11 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); int offset = (mvy>>2)*i_src_stride + (mvx>>2); - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset 
+ ((mvx&3) == 3); pixel_avg( dst, *i_dst_stride, src1, i_src_stride, src2, i_src_stride, i_width, i_height ); if( weight->weightfn ) @@ -299,6 +302,17 @@ } } +void x264_plane_copy_swap_c( pixel *dst, intptr_t i_dst, + pixel *src, intptr_t i_src, int w, int h ) +{ + for( int y=0; y<h; y++, dst+=i_dst, src+=i_src ) + for( int x=0; x<2*w; x+=2 ) + { + dst[x] = src[x+1]; + dst[x+1] = src[x]; + } +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ) @@ -612,6 +626,7 @@ pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec; pf->plane_copy = x264_plane_copy_c; + pf->plane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; @@ -647,6 +662,10 @@ #if ARCH_AARCH64 x264_mc_init_aarch64( cpu, pf ); #endif +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + x264_mc_init_mips( cpu, pf ); +#endif if( cpu_independent ) {
View file
x264-snapshot-20141218-2245.tar.bz2/common/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: motion compensation ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * @@ -41,6 +41,8 @@ } ALIGNED_16( x264_weight_t ); extern const x264_weight_t x264_weight_none[3]; +extern const uint8_t x264_hpel_ref0[16]; +extern const uint8_t x264_hpel_ref1[16]; #define SET_WEIGHT( w, b, s, d, o )\ {\ @@ -86,6 +88,7 @@ void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, intptr_t i_src, int height ); void (*plane_copy)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_swap)( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_interleave)( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); /* may write up to 15 pixels off the end of each plane */
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips
Added
+(directory)
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct-c.c
Added
@@ -0,0 +1,525 @@ +/***************************************************************************** + * dct-c.c: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 ) \ +{ \ + v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + \ + tmp0_m = in0 + in2; \ + tmp1_m = in0 - in2; \ + tmp2_m = in1 >> 1; \ + tmp2_m = tmp2_m - in3; \ + tmp3_m = in3 >> 1; \ + tmp3_m = in1 + tmp3_m; \ + \ + BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 ); \ +} + +static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst, + int32_t i_src_stride ) +{ + v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3; + v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3; + v4i32 hor_res0, hor_res1, hor_res2, hor_res3; + v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r; + + LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 ); + UNPCK_R_SH_SW( src0, src0_r ); + UNPCK_R_SH_SW( src1, src1_r ); + UNPCK_R_SH_SW( src2, src2_r ); + UNPCK_R_SH_SW( src3, src3_r ); + BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + hor_res0, hor_res3, hor_res2, hor_res1 ); + TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3, + hor_res0, hor_res1, hor_res2, hor_res3 ); + BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1, + tmp0, tmp3, tmp2, tmp1 ); + BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3, + ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r ); + SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 ); + PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r, + ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r, + ver_res0, ver_res1, ver_res2, ver_res3 ); + PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 ); + ST_SH2( ver_res0, ver_res2, p_dst, 8 ); +} + +static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_dst_stride, + int16_t *p_dst ) +{ + uint32_t i_src0, i_src1, i_src2, i_src3; + 
uint32_t i_ref0, i_ref1, i_ref2, i_ref3; + v16i8 src = { 0 }; + v16i8 ref = { 0 }; + v16u8 inp0, inp1; + v8i16 diff0, diff1, diff2, diff3; + v8i16 temp0, temp1, temp2, temp3; + + LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); + LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); + + INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); + INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); + + ILVRL_B2_UB( src, ref, inp0, inp1 ); + + HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); + + diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); + diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); + + BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); + + diff0 = temp0 + temp1; + diff1 = ( temp3 << 1 ) + temp2; + diff2 = temp0 - temp1; + diff3 = temp3 - ( temp2 << 1 ); + + TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, + temp0, temp1, temp2, temp3 ); + BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); + + temp0 = diff0 + diff1; + temp1 = ( diff3 << 1 ) + diff2; + temp2 = diff0 - diff1; + temp3 = diff3 - ( diff2 << 1 ); + + ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); + ST_UB2( inp0, inp1, p_dst, 8 ); +} + +static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16], + int16_t pi_level[16] ) +{ + v8i16 src0, src1; + v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 }; + v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 }; + + LD_SH2( pi_dct, 8, src0, src1 ); + VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 ); + ST_SH2( mask0, mask1, pi_level, 8 ); +} + +static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3; + v8i16 hres0, hres1, hres2, hres3; + v8i16 vres0, vres1, vres2, vres3; + v8i16 zeros = { 0 }; + + LD4x4_SH( p_src, src0, src1, src2, src3 ); + AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 ); + TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3, + hres0, hres1, hres2, hres3 ); + 
AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 ); + SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 ); + ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride ); + ST_SH2( zeros, zeros, p_src, 8 ); +} + +static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + int16_t i_dc; + uint32_t i_src0, i_src1, i_src2, i_src3; + v16u8 pred = { 0 }; + v16i8 out; + v8i16 input_dc, pred_r, pred_l; + + i_dc = ( p_src[0] + 32 ) >> 6; + input_dc = __msa_fill_h( i_dc ); + p_src[ 0 ] = 0; + + LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 ); + INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred ); + UNPCK_UB_SH( pred, pred_r, pred_l ); + + pred_r += input_dc; + pred_l += input_dc; + + CLIP_SH2_0_255( pred_r, pred_l ); + out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r ); + ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride ); +} + +static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, + int32_t i_dst_stride ) +{ + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 vec0, vec1, vec2, vec3; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; + v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; + v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; + v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; + v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; + v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16i8 zeros = { 0 }; + + p_src[ 0 ] += 32; + + LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); + + vec0 = src0 + src4; + vec1 = src0 - src4; + vec2 = src2 >> 1; + vec2 = vec2 - src6; + vec3 = src6 >> 1; + vec3 = src2 + vec3;
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/dct.h
Added
@@ -0,0 +1,49 @@ +/***************************************************************************** + * dct.h: msa transform and zigzag + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_MIPS_DCT_H +#define X264_MIPS_DCT_H + +void x264_dct4x4dc_msa( int16_t d[16] ); +void x264_idct4x4dc_msa( int16_t d[16] ); +void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] ); +void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] ); +void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] ); +void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] ); +void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] ); +void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] ); +void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref ); +void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src, + uint8_t *p_ref ); +void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, + uint8_t *p_pix2 ); +void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/deblock-c.c
Added
@@ -0,0 +1,2010 @@ +/***************************************************************************** + * deblock-c.c: msa deblocking + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \ + q3_or_p3_org_in, p1_or_q1_org_in, \ + p2_or_q2_org_in, q1_or_p1_org_in, \ + p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \ +{ \ + v8i16 threshold; \ + v8i16 const3 = __msa_ldi_h( 3 ); \ + \ + threshold = p0_or_q0_org_in + q3_or_p3_org_in; \ + threshold += p1_or_q1_org_in; \ + \ + p0_or_q0_out = threshold << 1; \ + p0_or_q0_out += p2_or_q2_org_in; \ + p0_or_q0_out += q1_or_p1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \ + \ + p1_or_q1_out = p2_or_q2_org_in + threshold; \ + p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \ + \ + p2_or_q2_out = p2_or_q2_org_in * const3; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += p3_or_q3_org_in; \ + p2_or_q2_out += threshold; \ + p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \ +} + +/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */ +#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \ + p1_or_q1_org_in, p0_or_q0_out ) \ +{ \ + p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out += p1_or_q1_org_in; \ + p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \ +} + +#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \ + p1_or_q1_org_in, p2_or_q2_org_in, \ + negate_tc_in, tc_in, p1_or_q1_out ) \ +{ \ + v8i16 clip3, temp; \ + \ + clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \ + ( v8u16 ) q0_or_p0_org_in ); \ + temp = p1_or_q1_org_in << 1; \ + clip3 -= temp; \ + clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \ + clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \ + p1_or_q1_out = p1_or_q1_org_in + clip3; \ +} + +#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \ + p1_or_q1_org_in, q1_or_p1_org_in, \ + negate_threshold_in, threshold_in, \ + p0_or_q0_out, q0_or_p0_out ) \ +{ 
\ + v8i16 q0_sub_p0, p1_sub_q1, delta; \ + \ + q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \ + p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \ + q0_sub_p0 <<= 2; \ + p1_sub_q1 += 4; \ + delta = q0_sub_p0 + p1_sub_q1; \ + delta >>= 3; \ + \ + delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \ + \ + p0_or_q0_out = p0_or_q0_org_in + delta; \ + q0_or_p0_out = q0_or_p0_org_in - delta; \ + \ + CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \ +} + +static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data, + uint8_t u_alpha_in, + uint8_t u_beta_in, + uint32_t u_img_width ) +{ + v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0; + v16u8 alpha, beta; + v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta; + v16u8 p2, p1, p0, q0, q1, q2; + v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org; + v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r; + v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l; + v8i16 p2_r = { 0 }; + v8i16 p1_r = { 0 }; + v8i16 p0_r = { 0 }; + v8i16 q0_r = { 0 }; + v8i16 q1_r = { 0 }; + v8i16 q2_r = { 0 }; + v8i16 p2_l = { 0 }; + v8i16 p1_l = { 0 }; + v8i16 p0_l = { 0 }; + v8i16 q0_l = { 0 }; + v8i16 q1_l = { 0 }; + v8i16 q2_l = { 0 }; + v16u8 tmp_flag; + v16i8 zero = { 0 }; + + alpha = ( v16u8 ) __msa_fill_b( u_alpha_in ); + beta = ( v16u8 ) __msa_fill_b( u_beta_in ); + + LD_UB4( p_data - ( u_img_width << 1 ), u_img_width, + p1_org, p0_org, q0_org, q1_org ); + + { + v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha; + + p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org ); + p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org ); + q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org ); + + is_less_than_alpha = ( p0_asub_q0 < alpha ); + is_less_than_beta = ( p1_asub_p0 < beta ); + is_less_than = is_less_than_beta & is_less_than_alpha; + is_less_than_beta = ( q1_asub_q0 < beta ); + is_less_than = is_less_than_beta & is_less_than; + } + + if( !__msa_test_bz_v( is_less_than ) ) + { + q2_org = LD_UB( p_data + ( 2 * u_img_width ) ); + p3_org = LD_UB( 
p_data - ( u_img_width << 2 ) ); + p2_org = LD_UB( p_data - ( 3 * u_img_width ) ); + + UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l ); + UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l ); + UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l ); + + tmp_flag = alpha >> 2; + tmp_flag = tmp_flag + 2; + tmp_flag = ( p0_asub_q0 < tmp_flag ); + + p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org ); + is_less_than_beta = ( p2_asub_p0 < beta ); + is_less_than_beta = is_less_than_beta & tmp_flag; + negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff ); + is_less_than_beta = is_less_than_beta & is_less_than; + negate_is_less_than_beta = negate_is_less_than_beta & is_less_than; + { + v8u16 is_less_than_beta_l, is_less_than_beta_r; + + q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_r = + ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 ); + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) ) + { + v8i16 p3_org_r; + + ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r, + q0_org_r, p1_org_r, + p2_r, q1_org_r, p0_r, p1_r, p2_r ); + } + + q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org ); + + is_less_than_beta_l = + ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 ); + + if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) ) + { + v8i16 p3_org_l; + + ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l ); + AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l, + q0_org_l, p1_org_l, + p2_l, q1_org_l, p0_l, p1_l, p2_l );
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/macros.h
Added
@@ -0,0 +1,1952 @@ +/***************************************************************************** + * macros.h: msa macros + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MACROS_H +#define X264_MIPS_MACROS_H + +#include <stdint.h> +#include <msa.h> + +#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ ) +#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ ) + +#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ ) + +#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) ) +#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ ) + +#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ ) +#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ ) + +#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in ) +#define ST_UH( ... 
) ST_H( v8u16, __VA_ARGS__ ) +#define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ ) + +#if ( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "lh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "lw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "ld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } ) + #else // !( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val0_m, u_val1_m; \ + uint64_t u_val_d_m = 0; \ + \ + u_val0_m = LW( p_src_m ); \ + u_val1_m = LW( p_src_m + 4 ); \ + \ + u_val_d_m = ( uint64_t ) ( u_val1_m ); \ + u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \ + 0xFFFFFFFF00000000 ); \ + u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \ + \ + u_val_d_m; \ + } ) + #endif // ( __mips == 64 ) + + #define SH( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint16_t u_val_h_m = ( u_val ); \ + \ + asm volatile ( \ + "sh %[u_val_h_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_h_m] "r" ( u_val_h_m ) \ + ); \ + } + + #define SW( u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint32_t u_val_w_m = ( u_val ); \ + \ + asm volatile ( \ + "sw %[u_val_w_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_w_m] "r" ( u_val_w_m ) \ + ); \ + } + + #define SD( 
u_val, p_dst ) \ + { \ + uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \ + uint64_t u_val_d_m = ( u_val ); \ + \ + asm volatile ( \ + "sd %[u_val_d_m], %[p_dst_m] \n\t" \ + \ + : [p_dst_m] "=m" ( *p_dst_m ) \ + : [u_val_d_m] "r" ( u_val_d_m ) \ + ); \ + } + +#else // !( __mips_isa_rev >= 6 ) + #define LH( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint16_t u_val_h_m; \ + \ + asm volatile ( \ + "ulh %[u_val_h_m], %[p_src_m] \n\t" \ + \ + : [u_val_h_m] "=r" ( u_val_h_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_h_m; \ + } ) + + #define LW( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint32_t u_val_w_m; \ + \ + asm volatile ( \ + "ulw %[u_val_w_m], %[p_src_m] \n\t" \ + \ + : [u_val_w_m] "=r" ( u_val_w_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_w_m; \ + } ) + + #if ( __mips == 64 ) + #define LD( p_src ) \ + ( { \ + uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \ + uint64_t u_val_d_m = 0; \ + \ + asm volatile ( \ + "uld %[u_val_d_m], %[p_src_m] \n\t" \ + \ + : [u_val_d_m] "=r" ( u_val_d_m ) \ + : [p_src_m] "m" ( *p_src_m ) \ + ); \ + \ + u_val_d_m; \ + } )
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc-c.c
Added
@@ -0,0 +1,3807 @@ +/***************************************************************************** + * mc-c.c: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "mc.h" + +#if !HIGH_BIT_DEPTH +static const uint8_t pu_luma_mask_arr[16 * 8] = +{ + /* 8 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12, + 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11, + 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + /* 4 width cases */ + 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24, + 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23, + 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26 +}; + +static const uint8_t pu_chroma_mask_arr[16 * 5] = +{ + 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20, + 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24, + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8, + 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20 +}; + +void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_height ); +void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src, + intptr_t i_src_stride, int32_t i_height ); +void x264_memzero_aligned_msa( void *p_dst, size_t n ); + +void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t 
*p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride, + uint8_t *p_pix2, intptr_t pix2_stride, + uint8_t *p_pix3, intptr_t pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); +void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride, + uint8_t *p_pix2, intptr_t i_pix2_stride, + uint8_t *p_pix3, intptr_t i_pix3_stride, + int32_t i_weight ); + +void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); +void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + const x264_weight_t *pWeight, int32_t i_height ); + +weight_fn_t x264_mc_weight_wtab_msa[6] = +{ + x264_mc_weight_w4_msa, + x264_mc_weight_w4_msa, + x264_mc_weight_w8_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w16_msa, + x264_mc_weight_w20_msa, +}; + +void 
x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ); +void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ); +void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v, + uint8_t *p_dstc, uint8_t *p_src, + intptr_t i_stride, int32_t i_width, + int32_t i_height, int16_t *p_buf ); + +void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, intptr_t i_src_stride0, + uint8_t *p_src1, intptr_t i_src_stride1, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0, + uint8_t *p_dst1, intptr_t i_dst_stride1, + uint8_t *p_src, intptr_t i_src_stride, + int32_t i_width, int32_t i_height ); +void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0, + intptr_t i_dst_stride0, + uint8_t *p_dst1, + intptr_t i_dst_stride1, + uint8_t *p_dst2, + intptr_t i_dst_stride2, + uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_src_width, int32_t i_width, + int32_t i_height ); +void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src0, uint8_t *p_src1, + int32_t i_height ); +void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src, + intptr_t i_src_stride, + int32_t i_height ); +void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0, + uint8_t *p_dst1, uint8_t *p_dst2, + uint8_t *p_dst3, intptr_t 
i_src_stride, + intptr_t i_dst_stride, int32_t i_width, + int32_t i_height ); + +static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride, + int32_t i_height ) +{ + uint32_t u_loop_cnt, u_h4w; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + v16i8 mask0, mask1, mask2; + v16i8 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 vec6, vec7, vec8, vec9, vec10, vec11; + v16i8 minus5b = __msa_ldi_b( -5 ); + v16i8 plus20b = __msa_ldi_b( 20 ); + + u_h4w = i_height % 4; + LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 ); + + for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; ) + { + LD_SB2( p_src, 8, src0, src1 ); + p_src += i_src_stride; + LD_SB2( p_src, 8, src2, src3 ); + p_src += i_src_stride;
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/mc.h
Added
@@ -0,0 +1,31 @@ +/***************************************************************************** + * mc.h: msa motion compensation + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Neha Rana <neha.rana@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_MIPS_MC_H +#define X264_MIPS_MC_H + +void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel-c.c
Added
@@ -0,0 +1,1491 @@ +/***************************************************************************** + * pixel-c.c: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" +#include "pixel.h" +#include "predict.h" + +#if !HIGH_BIT_DEPTH +#define CALC_MSE_B( src, ref, var ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ +} + +#define CALC_MSE_AVG_B( src, ref, var, sub ) \ +{ \ + v16u8 src_l0_m, src_l1_m; \ + v8i16 res_l0_m, res_l1_m; \ + \ + ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m ); \ + HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m ); \ + DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var ); \ + \ + sub += res_l0_m + res_l1_m; \ +} + +#define VARIANCE_WxH( sse, diff, shift ) \ + ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) ) + +static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3; + v16u8 src = { 0 }; + v16u8 ref = { 0 }; + v16u8 diff; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 ); + p_src += ( 4 * i_src_stride ); + LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 ); + p_ref += ( 4 * i_ref_stride ); + + INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src ); + INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref ); + + diff = __msa_asub_u_b( src, ref ); + sad += __msa_hadd_u_h( diff, diff ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); 
i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 ); + p_ref += ( 4 * i_ref_stride ); + + PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2, + src0, src1, ref0, ref1 ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref, int32_t i_ref_stride, + int32_t i_height ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + + LD_UB2( p_src, i_src_stride, src0, src1 ); + p_src += ( 2 * i_src_stride ); + LD_UB2( p_ref, i_ref_stride, ref0, ref1 ); + p_ref += ( 2 * i_ref_stride ); + sad += SAD_UB2_UH( src0, src1, ref0, ref1 ); + } + + return ( HADD_UH_U32( sad ) ); +} + +static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src = { 0 }; + uint32_t src0, src1, src2, src3, load0, load1, load2, load3; + v16u8 ref0 = { 0 }; + v16u8 ref1 = { 0 }; + v16u8 ref2 = { 0 }; + v16u8 diff; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LW4( p_src, i_src_stride, src0, src1, src2, src3 ); + INSERT_W4_UB( src0, src1, src2, src3, src ); + p_src += ( 4 * i_src_stride ); + + LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref0 ); + p_ref0 += ( 4 * i_ref_stride ); + + LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref1 ); 
+ p_ref1 += ( 4 * i_ref_stride ); + + LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 ); + INSERT_W4_UB( load0, load1, load2, load3, ref2 ); + p_ref2 += ( 4 * i_ref_stride ); + + diff = __msa_asub_u_b( src, ref0 ); + sad0 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref1 ); + sad1 += __msa_hadd_u_h( diff, diff ); + + diff = __msa_asub_u_b( src, ref2 ); + sad2 += __msa_hadd_u_h( diff, diff ); + } + + pu_sad_array[0] = HADD_UH_U32( sad0 ); + pu_sad_array[1] = HADD_UH_U32( sad1 ); + pu_sad_array[2] = HADD_UH_U32( sad2 ); +} + +static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_ref0, uint8_t *p_ref1, + uint8_t *p_ref2, int32_t i_ref_stride, + int32_t i_height, uint32_t *pu_sad_array ) +{ + int32_t i_ht_cnt; + v16u8 src0, src1, src2, src3; + v16u8 ref0, ref1, ref00, ref11, ref22, ref33; + v8u16 sad0 = { 0 }; + v8u16 sad1 = { 0 }; + v8u16 sad2 = { 0 }; + + for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; ) + { + LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 ); + p_src += ( 4 * i_src_stride ); + LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 );
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/pixel.h
Added
@@ -0,0 +1,170 @@ +/***************************************************************************** + * pixel.h: msa pixel metrics + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_MIPS_SAD_H +#define X264_MIPS_SAD_H + +int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t 
*p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t 
i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1, + const uint8_t *p_pix2, intptr_t i_stride2, + int32_t i_sums[2][4] ); +uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t 
*p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride ); +uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride ); +int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); +int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1, + uint8_t *p_pix2, intptr_t i_stride2, + int32_t *p_ssd ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict-c.c
Added
@@ -0,0 +1,607 @@ +/***************************************************************************** + * predict-c.c: msa intra prediction + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_src_data; + + u_src_data = LW( p_src ); + + SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint64_t u_out; + + u_out = LD( p_src ); + + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst, + int32_t i_dst_stride ) +{ + v16u8 src0 = LD_UB( p_src ); + + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); + p_dst += ( 8 * i_dst_stride ); + ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst, + i_dst_stride ); +} + +static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint32_t u_out0, u_out1, u_out2, u_out3; + + u_out0 = p_src[0 * i_src_stride] * 0x01010101; + u_out1 = p_src[1 * i_src_stride] * 0x01010101; + u_out2 = p_src[2 * i_src_stride] * 0x01010101; + u_out3 = p_src[3 * i_src_stride] * 0x01010101; + + SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7; + + u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull; + u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull; + u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull; + u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull; + u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull; + u_out5 = p_src[5 * 
i_src_stride] * 0x0101010101010101ull; + u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull; + u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull; + + SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride ); +} + +static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride, + uint8_t *p_dst, + int32_t i_dst_stride ) +{ + uint32_t u_row; + uint8_t u_inp0, u_inp1, u_inp2, u_inp3; + v16u8 src0, src1, src2, src3; + + for ( u_row = 4; u_row--; ) + { + u_inp0 = p_src[0]; + p_src += i_src_stride; + u_inp1 = p_src[0]; + p_src += i_src_stride; + u_inp2 = p_src[0]; + p_src += i_src_stride; + u_inp3 = p_src[0]; + p_src += i_src_stride; + + src0 = ( v16u8 ) __msa_fill_b( u_inp0 ); + src1 = ( v16u8 ) __msa_fill_b( u_inp1 ); + src2 = ( v16u8 ) __msa_fill_b( u_inp2 ); + src3 = ( v16u8 ) __msa_fill_b( u_inp3 ); + + ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride ); + p_dst += ( 4 * i_dst_stride ); + } +} + +static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left, + int32_t i_src_stride_left, + uint8_t *p_dst, int32_t i_dst_stride, + uint8_t is_above, uint8_t is_left ) +{ + uint32_t u_row; + uint32_t u_out, u_addition = 0; + v16u8 src_above, store; + v8u16 sum_above; + v4u32 sum; + + if ( is_left && is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 ); + + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 4 ) >> 3; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( is_left ) + { + for ( u_row = 0; u_row < 4; u_row++ ) + { + u_addition += p_src_left[u_row * i_src_stride_left]; + } + + u_addition = ( u_addition + 2 ) >> 2; + store = ( v16u8 ) __msa_fill_b( u_addition ); + } + else if ( 
is_above ) + { + src_above = LD_UB( p_src_top ); + + sum_above = __msa_hadd_u_h( src_above, src_above ); + sum = __msa_hadd_u_w( sum_above, sum_above ); + sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 ); + store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 ); + } + else + { + store = ( v16u8 ) __msa_ldi_b( 128 ); + } + + u_out = __msa_copy_u_w( ( v4i32 ) store, 0 ); + + SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride ); +} + +static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left, + uint8_t *p_dst, int32_t i_dst_stride ) +{ + uint64_t u_val0, u_val1; + v16i8 store; + v16u8 src = { 0 }; + v8u16 sum_h; + v4u32 sum_w; + v2u64 sum_d; + + u_val0 = LD( p_src_top ); + u_val1 = LD( p_src_left ); + INSERT_D2_UB( u_val0, u_val1, src ); + sum_h = __msa_hadd_u_h( src, src ); + sum_w = __msa_hadd_u_w( sum_h, sum_h ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d ); + sum_d = __msa_hadd_u_d( sum_w, sum_w ); + sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/predict.h
Added
@@ -0,0 +1,48 @@ +/***************************************************************************** + * predict.h: msa intra prediction + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_MIPS_PREDICT_H +#define X264_MIPS_PREDICT_H + +void x264_intra_predict_dc_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_hor_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_vert_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_plane_16x16_msa( uint8_t *p_src ); +void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src ); +void x264_intra_predict_hor_8x8_msa( uint8_t *p_src ); +void x264_intra_predict_vert_8x8_msa( uint8_t *p_src ); +void x264_intra_predict_plane_8x8_msa( uint8_t *p_src ); +void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); +void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); +void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); +void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] ); +void x264_intra_predict_dc_4x4_msa( uint8_t *p_src ); +void x264_intra_predict_hor_4x4_msa( uint8_t *p_src ); +void x264_intra_predict_vert_4x4_msa( uint8_t *p_src ); + +#endif
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant-c.c
Added
@@ -0,0 +1,630 @@ +/***************************************************************************** + * quant-c.c: msa quantization and level-run + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "macros.h" + +#if !HIGH_BIT_DEPTH +static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 4; + v8i16 dct0, dct1; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + + LD_SH2( p_dct, 8, dct0, dct1 ); + + LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + + if ( q_bits >= 0 ) + { + v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_mf_h0, dequant_mf_h1 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct0 <<= q_bits_vec; + dct1 <<= q_bits_vec; + ST_SH2( dct0, dct1, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3, + q_bits_vec ); + PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2, + dct0, dct1 ); + ST_SH2( dct0, dct1, p_dct, 8 ); + } +} + +static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ) +{ + const int32_t i_mf = i_qp % 6; + const int32_t q_bits = i_qp / 6 - 6; + v8i16 dct0, dct1, dct2, 
dct3, dct4, dct5, dct6, dct7; + v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3; + v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7; + v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11; + v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15; + + LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 ); + + LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 ); + LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 ); + LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 ); + LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 ); + LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 ); + LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 ); + LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 ); + LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 ); + + if ( q_bits >= 0 ) + { + v8i16 q_bits_vec; + v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3; + v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7; + + q_bits_vec = __msa_fill_h( q_bits ); + + PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2, + dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6, + dequant_mf_h0, dequant_mf_h1, + dequant_mf_h2, dequant_mf_h3 ); + PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10, + dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14, + dequant_mf_h4, dequant_mf_h5, + dequant_mf_h6, dequant_mf_h7 ); + + dct0 *= dequant_mf_h0; + dct1 *= dequant_mf_h1; + dct2 *= dequant_mf_h2; + dct3 *= dequant_mf_h3; + dct4 *= dequant_mf_h4; + dct5 *= dequant_mf_h5; + dct6 *= dequant_mf_h6; + dct7 *= dequant_mf_h7; + + SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec ); + SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec ); + + ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 ); + } + else + { + const int32_t q_bits_add = 1 << ( -q_bits - 1 ); + v4i32 
dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; + v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; + v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11; + v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15; + v4i32 q_bits_vec, q_bits_vec_add; + + q_bits_vec_add = __msa_fill_w( q_bits_add ); + q_bits_vec = __msa_fill_w( -q_bits ); + + UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); + UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); + UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); + UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); + UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 ); + UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 ); + UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 ); + UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 ); + + dct_signed_w0 *= dequant_m_f0; + dct_signed_w1 *= dequant_m_f1; + dct_signed_w2 *= dequant_m_f2; + dct_signed_w3 *= dequant_m_f3; + dct_signed_w4 *= dequant_m_f4; + dct_signed_w5 *= dequant_m_f5; + dct_signed_w6 *= dequant_m_f6; + dct_signed_w7 *= dequant_m_f7; + dct_signed_w8 *= dequant_m_f8; + dct_signed_w9 *= dequant_m_f9; + dct_signed_w10 *= dequant_m_f10; + dct_signed_w11 *= dequant_m_f11; + dct_signed_w12 *= dequant_m_f12; + dct_signed_w13 *= dequant_m_f13; + dct_signed_w14 *= dequant_m_f14; + dct_signed_w15 *= dequant_m_f15; + + dct_signed_w0 += q_bits_vec_add; + dct_signed_w1 += q_bits_vec_add; + dct_signed_w2 += q_bits_vec_add; + dct_signed_w3 += q_bits_vec_add; + dct_signed_w4 += q_bits_vec_add; + dct_signed_w5 += q_bits_vec_add; + dct_signed_w6 += q_bits_vec_add; + dct_signed_w7 += q_bits_vec_add; + dct_signed_w8 += q_bits_vec_add; + dct_signed_w9 += q_bits_vec_add; + dct_signed_w10 += q_bits_vec_add; + dct_signed_w11 += q_bits_vec_add; + dct_signed_w12 += q_bits_vec_add; + dct_signed_w13 += q_bits_vec_add; + dct_signed_w14 += q_bits_vec_add; + dct_signed_w15 += q_bits_vec_add; + + SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, 
dct_signed_w3, + q_bits_vec ); + SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7, + q_bits_vec ); + SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
View file
x264-snapshot-20150804-2245.tar.bz2/common/mips/quant.h
Added
@@ -0,0 +1,43 @@ +/***************************************************************************** + * quant.h: msa quantization and level-run + ***************************************************************************** + * Copyright (C) 2015 x264 project + * + * Authors: Rishikesh More <rishikesh.more@imgtec.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_MIPS_QUANT_H +#define X264_MIPS_QUANT_H + +void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ); +void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64], + int32_t i_qp ); +void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16], + int32_t i_qp ); +int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); +int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16], + uint16_t pu_mf[16], uint16_t pu_bias[16] ); +int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ); +int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias ); +int32_t x264_coeff_last64_msa( int16_t *p_src ); +int32_t x264_coeff_last16_msa( int16_t *p_src ); + +#endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/mvpred.c -> x264-snapshot-20150804-2245.tar.bz2/common/mvpred.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mvpred.c: motion vector prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.c -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * opencl.c: OpenCL initialization and kernel compilation ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Steve Borho <sborho@multicorewareinc.com> * Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/opencl.h -> x264-snapshot-20150804-2245.tar.bz2/common/opencl.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * opencl.h: OpenCL structures and defines ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Steve Borho <sborho@multicorewareinc.com> * Anton Mitrofanov <BugMaster@narod.ru>
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.c -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * osdep.c: platform-specific code ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * Laurent Aimar <fenrir@via.ecp.fr> @@ -94,51 +94,6 @@ } #endif -#if HAVE_MMX -#ifdef __INTEL_COMPILER -/* Agner's patch to Intel's CPU dispatcher from pages 131-132 of - * http://agner.org/optimize/optimizing_cpp.pdf (2011-01-30) - * adapted to x264's cpu schema. */ - -// Global variable indicating cpu -int __intel_cpu_indicator = 0; -// CPU dispatcher function -void x264_intel_cpu_indicator_init( void ) -{ - unsigned int cpu = x264_cpu_detect(); - if( cpu&X264_CPU_AVX ) - __intel_cpu_indicator = 0x20000; - else if( cpu&X264_CPU_SSE42 ) - __intel_cpu_indicator = 0x8000; - else if( cpu&X264_CPU_SSE4 ) - __intel_cpu_indicator = 0x2000; - else if( cpu&X264_CPU_SSSE3 ) - __intel_cpu_indicator = 0x1000; - else if( cpu&X264_CPU_SSE3 ) - __intel_cpu_indicator = 0x800; - else if( cpu&X264_CPU_SSE2 && !(cpu&X264_CPU_SSE2_IS_SLOW) ) - __intel_cpu_indicator = 0x200; - else if( cpu&X264_CPU_SSE ) - __intel_cpu_indicator = 0x80; - else if( cpu&X264_CPU_MMX2 ) - __intel_cpu_indicator = 8; - else - __intel_cpu_indicator = 1; -} - -/* __intel_cpu_indicator_init appears to have a non-standard calling convention that - * assumes certain registers aren't preserved, so we'll route it through a function - * that backs up all the registers. */ -void __intel_cpu_indicator_init( void ) -{ - x264_safe_intel_cpu_indicator_init(); -} -#else -void x264_intel_cpu_indicator_init( void ) -{} -#endif -#endif - #ifdef _WIN32 /* Functions for dealing with Unicode on Windows. */ FILE *x264_fopen( const char *filename, const char *mode )
View file
x264-snapshot-20141218-2245.tar.bz2/common/osdep.h -> x264-snapshot-20150804-2245.tar.bz2/common/osdep.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * osdep.h: platform-specific code ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -42,6 +42,9 @@ # include "aarch64/pixel.h" # include "aarch64/predict.h" #endif +#if ARCH_MIPS +# include "mips/pixel.h" +#endif /**************************************************************************** @@ -598,8 +601,8 @@ INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon ) INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) -INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) -INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _neon ) INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) #endif @@ -1409,25 +1412,28 @@ #if ARCH_AARCH64 if( cpu&X264_CPU_NEON ) { - INIT7( sad, _neon ); + INIT8( sad, _neon ); // AArch64 has no distinct instructions for aligned load/store - INIT7_NAME( sad_aligned, sad, _neon ); + INIT8_NAME( sad_aligned, sad, _neon ); INIT7( sad_x3, _neon ); INIT7( sad_x4, _neon ); - INIT7( ssd, _neon ); - INIT7( satd, _neon ); + INIT8( ssd, _neon ); + INIT8( satd, _neon ); INIT7( satd_x3, _neon ); INIT7( satd_x4, _neon ); INIT4( hadamard_ac, _neon ); pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->vsad = 
x264_pixel_vsad_neon; + pixf->asd8 = x264_pixel_asd8_neon; pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; @@ -1440,11 +1446,44 @@ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } #endif // ARCH_AARCH64 +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + INIT8( sad, _msa ); + INIT8_NAME( sad_aligned, sad, _msa ); + INIT8( ssd, _msa ); + INIT7( sad_x3, _msa ); + INIT7( sad_x4, _msa ); + INIT8( satd, _msa ); + INIT4( hadamard_ac, _msa ); + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_msa; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_msa; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_msa; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_msa; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_msa; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_msa; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_msa; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_msa; + + pixf->ssim_4x4x2_core = x264_ssim_4x4x2_core_msa; + + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; + } +#endif // HAVE_MSA + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC )
View file
x264-snapshot-20141218-2245.tar.bz2/common/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2004-2014 x264 project + * Copyright (C) 2004-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * Eric Petit <eric.petit@lapsus.org> @@ -264,7 +264,7 @@ vec_u8_t lv = vec_ld(0, dest); \ vec_u8_t dstv = vec_perm(lv, zero_u8v, (vec_u8_t)perm_ldv); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_s16_to_u8(idstsum); \ /* unaligned store */ \ @@ -384,7 +384,7 @@ vec_u8_t lv = vec_ld( 7, dest ); \ vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_u16_t dst16 = vec_u8_to_u16_h(dstv); \ vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ /* unaligned store */ \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/deblock.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/deblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: ppc deblocking ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * Guillaume Poirier <gpoirier@mplayerhq.hu> @@ -40,24 +40,19 @@ typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst, int i_height ); - -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - - static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) { return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + pix[ 3*i_pix_next]; } + static inline int x264_tapfilter1( uint8_t *pix ) { return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + pix[ 3]; } - static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) @@ -181,10 +176,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { @@ -229,10 +224,10 @@ { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; + uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = 
src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); switch( i_width ) { case 4: @@ -296,6 +291,12 @@ } } +#ifdef WORDS_BIGENDIAN +#define VSLD(a,b,n) vec_sld(a,b,n) +#else +#define VSLD(a,b,n) vec_sld(b,a,16-n) +#endif + static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride, uint8_t *src, intptr_t i_src_stride, int mvx, int mvy, int i_height ) @@ -321,8 +322,13 @@ vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16; vec_u16_t shiftv, k32v; +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13); static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12); + static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14); +#endif coeff0v = vec_ld( 0, coeff ); coeff3v = vec_splat( coeff0v, 3 ); @@ -334,7 +340,7 @@ VEC_LOAD( src, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); for( int y = 0; y < i_height; y += 2 ) { @@ -342,7 +348,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ -364,7 +370,7 @@ src1v_16 = src3v_16; VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src ); src2v_16 = vec_u8_to_u16( src2v_8 ); - src3v_16 = vec_u8_to_u16( vec_sld( src2v_8, src2v_8, 2 ) ); + src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) ); dstv16 = vec_mladd( coeff0v, src0v_16, k32v ); dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 ); @@ -420,12 +426,17 @@ k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); shiftv = 
vec_splat_u16( 6 ); +#ifdef WORDS_BIGENDIAN static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0); static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0); +#else + static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1); + static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1); +#endif VEC_LOAD( src, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); for( int y = 0; y < i_height; y += 2 ) { @@ -434,7 +445,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -472,7 +483,7 @@ VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src ); VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src ); - src3v_8 = vec_sld( src2v_8, src3v_8, 2 ); + src3v_8 = VSLD( src2v_8, src3v_8, 2 ); src0v_16h = vec_u8_to_u16_h( src0v_8 ); src0v_16l = vec_u8_to_u16_l( src0v_8 ); @@ -555,11 +566,11 @@ VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \ VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \ \ - src2v = vec_sld( src1v, src6v, 1 ); \ - src3v = vec_sld( src1v, src6v, 2 ); \ - src4v = vec_sld( src1v, src6v, 3 ); \ - src5v = vec_sld( src1v, src6v, 4 ); \ - src6v = vec_sld( src1v, src6v, 5 ); \ + src2v = VSLD( src1v, src6v, 1 ); \ + src3v = VSLD( src1v, src6v, 2 ); \ + src4v = VSLD( src1v, src6v, 3 ); \ + src5v = VSLD( src1v, src6v, 4 ); \ + src6v = VSLD( src1v, src6v, 5 ); \ \ temp1v = vec_u8_to_s16_h( src1v ); \ temp2v = vec_u8_to_s16_h( src2v ); \ @@ -634,12 +645,12 @@ #define HPEL_FILTER_CENTRAL() \ { \ - temp1v = vec_sld( tempav, tempbv, 12 ); \ - temp2v = vec_sld( tempav, tempbv, 14 ); \ + temp1v = VSLD( tempav, tempbv, 12 ); \ + temp2v = VSLD( tempav, tempbv, 14 ); \ temp3v = tempbv; \ - temp4v = 
vec_sld( tempbv, tempcv, 2 ); \ - temp5v = vec_sld( tempbv, tempcv, 4 ); \ - temp6v = vec_sld( tempbv, tempcv, 6 ); \ + temp4v = VSLD( tempbv, tempcv, 2 ); \ + temp5v = VSLD( tempbv, tempcv, 4 ); \ + temp6v = VSLD( tempbv, tempcv, 6 ); \ \ HPEL_FILTER_2( temp1v, temp2v, temp3v, \ temp4v, temp5v, temp6v ); \ @@ -647,12 +658,12 @@ dest1v = vec_add( temp1v, thirtytwov ); \ dest1v = vec_sra( dest1v, sixv ); \ \ - temp1v = vec_sld( tempbv, tempcv, 12 ); \ - temp2v = vec_sld( tempbv, tempcv, 14 ); \ + temp1v = VSLD( tempbv, tempcv, 12 ); \ + temp2v = VSLD( tempbv, tempcv, 14 ); \ temp3v = tempcv; \ - temp4v = vec_sld( tempcv, tempdv, 2 ); \ - temp5v = vec_sld( tempcv, tempdv, 4 ); \ - temp6v = vec_sld( tempcv, tempdv, 6 ); \ + temp4v = VSLD( tempcv, tempdv, 2 ); \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: ppc pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * Guillaume Poirier <gpoirier@mplayerhq.hu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: ppc pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/ppccommon.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/ppccommon.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ppccommon.h: ppc utility macros ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Eric Petit <eric.petit@lapsus.org> * @@ -81,10 +81,17 @@ /*********************************************************************** * 8 <-> 16 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v ) #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v ) +#else +#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v ) +#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v ) +#endif #define vec_u8_to_u16(v) vec_u8_to_u16_h(v) #define vec_u8_to_s16(v) vec_u8_to_s16_h(v) @@ -96,10 +103,17 @@ /*********************************************************************** * 16 <-> 32 bits conversions **********************************************************************/ +#ifdef WORDS_BIGENDIAN #define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v ) #define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v ) +#else +#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#define 
vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( (vec_u16_t) v, zero_u16v ) +#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( (vec_u16_t) v, zero_u16v ) +#endif #define vec_u16_to_u32(v) vec_u16_to_u32_h(v) #define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: ppc intra prediction ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: ppc intra prediction ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> * @@ -251,6 +251,14 @@ vec_st(dctv, 8*y, dct); \ } +#ifdef WORDS_BIGENDIAN +#define VEC_MULE vec_mule +#define VEC_MULO vec_mulo +#else +#define VEC_MULE vec_mulo +#define VEC_MULO vec_mule +#endif + #define DEQUANT_SHR() \ { \ dctv = vec_ld(8*y, dct); \ @@ -259,14 +267,14 @@ mf1v = vec_ld(16*y, dequant_mf[i_mf]); \ mf2v = vec_ld(16+16*y, dequant_mf[i_mf]); \ \ - multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \ - multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \ + multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v); \ + multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v); \ temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp1v = vec_add(temp1v, fv); \ temp1v = vec_sra(temp1v, i_qbitsv); \ \ - multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \ - multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \ + multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v); \ + multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v); \ temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \ temp2v = vec_add(temp2v, fv); \ temp2v = vec_sra(temp2v, i_qbitsv); \
View file
x264-snapshot-20141218-2245.tar.bz2/common/ppc/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/ppc/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2014 x264 project + * Copyright (C) 2007-2015 x264 project * * Authors: Guillaume Poirier <gpoirier@mplayerhq.hu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.c -> x264-snapshot-20150804-2245.tar.bz2/common/predict.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -43,6 +43,9 @@ #if ARCH_AARCH64 # include "aarch64/predict.h" #endif +#if ARCH_MIPS +# include "mips/predict.h" +#endif /**************************************************************************** * 16x16 prediction for intra luma block @@ -906,6 +909,21 @@ #if ARCH_AARCH64 x264_predict_16x16_init_aarch64( cpu, pf ); #endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf[I_PRED_16x16_V ] = x264_intra_predict_vert_16x16_msa; + pf[I_PRED_16x16_H ] = x264_intra_predict_hor_16x16_msa; + pf[I_PRED_16x16_DC] = x264_intra_predict_dc_16x16_msa; + pf[I_PRED_16x16_P ] = x264_intra_predict_plane_16x16_msa; + pf[I_PRED_16x16_DC_LEFT]= x264_intra_predict_dc_left_16x16_msa; + pf[I_PRED_16x16_DC_TOP ]= x264_intra_predict_dc_top_16x16_msa; + pf[I_PRED_16x16_DC_128 ]= x264_intra_predict_dc_128_16x16_msa; + } +#endif +#endif } void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] ) @@ -934,6 +952,15 @@ #if ARCH_AARCH64 x264_predict_8x8c_init_aarch64( cpu, pf ); #endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf[I_PRED_CHROMA_P ] = x264_intra_predict_plane_8x8_msa; + } +#endif +#endif } void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] ) @@ -949,6 +976,10 @@ #if HAVE_MMX x264_predict_8x16c_init_mmx( cpu, pf ); #endif + +#if ARCH_AARCH64 + x264_predict_8x16c_init_aarch64( cpu, pf ); +#endif } void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) @@ -978,6 +1009,15 @@ #if ARCH_AARCH64 x264_predict_8x8_init_aarch64( cpu, pf, predict_filter ); #endif + +#if !HIGH_BIT_DEPTH +#if HAVE_MSA + if( 
cpu&X264_CPU_MSA ) + { + pf[I_PRED_8x8_DDL] = x264_intra_predict_ddl_8x8_msa; + } +#endif +#endif } void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
View file
x264-snapshot-20141218-2245.tar.bz2/common/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.c -> x264-snapshot-20150804-2245.tar.bz2/common/quant.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com> @@ -40,6 +40,9 @@ #if ARCH_AARCH64 # include "aarch64/quant.h" #endif +#if ARCH_MIPS +# include "mips/quant.h" +#endif #define QUANT_ONE( coef, mf, f ) \ { \ @@ -714,7 +717,8 @@ #endif // HAVE_MMX #if HAVE_ALTIVEC - if( cpu&X264_CPU_ALTIVEC ) { + if( cpu&X264_CPU_ALTIVEC ) + { pf->quant_2x2_dc = x264_quant_2x2_dc_altivec; pf->quant_4x4_dc = x264_quant_4x4_dc_altivec; pf->quant_4x4 = x264_quant_4x4_altivec; @@ -753,6 +757,32 @@ { pf->coeff_last4 = x264_coeff_last4_aarch64; pf->coeff_last8 = x264_coeff_last8_aarch64; + pf->coeff_level_run4 = x264_coeff_level_run4_aarch64; + } + if( cpu&X264_CPU_NEON ) + { + pf->coeff_level_run8 = x264_coeff_level_run8_neon; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon; + pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon; + pf->decimate_score15 = x264_decimate_score15_neon; + pf->decimate_score16 = x264_decimate_score16_neon; + pf->decimate_score64 = x264_decimate_score64_neon; + pf->denoise_dct = x264_denoise_dct_neon; + } +#endif + +#if HAVE_MSA + if( cpu&X264_CPU_MSA ) + { + pf->quant_4x4 = x264_quant_4x4_msa; + pf->quant_4x4_dc = x264_quant_4x4_dc_msa; + pf->quant_4x4x4 = x264_quant_4x4x4_msa; + pf->quant_8x8 = x264_quant_8x8_msa; + pf->dequant_4x4 = x264_dequant_4x4_msa; + pf->dequant_4x4_dc = x264_dequant_4x4_dc_msa; + pf->dequant_8x8 = x264_dequant_8x8_msa; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_msa; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa; } #endif #endif // HIGH_BIT_DEPTH
View file
x264-snapshot-20141218-2245.tar.bz2/common/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.c -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.c: rectangle filling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/rectangle.h -> x264-snapshot-20150804-2245.tar.bz2/common/rectangle.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.h: rectangle filling ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.c -> x264-snapshot-20150804-2245.tar.bz2/common/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.c: quantization init ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/set.h -> x264-snapshot-20150804-2245.tar.bz2/common/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: quantization init ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.c -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.c: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/threadpool.h -> x264-snapshot-20150804-2245.tar.bz2/common/threadpool.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.h: thread pooling ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/vlc.c -> x264-snapshot-20150804-2245.tar.bz2/common/vlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * vlc.c : vlc tables ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.c -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.c: windows threading ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * Pegasys Inc. <http://www.pegasys-inc.com> @@ -138,7 +138,7 @@ if( !win32_cond ) return -1; cond->ptr = win32_cond; - win32_cond->semaphore = CreateSemaphore( NULL, 0, 0x7fffffff, NULL ); + win32_cond->semaphore = CreateSemaphoreW( NULL, 0, 0x7fffffff, NULL ); if( !win32_cond->semaphore ) return -1; @@ -147,7 +147,7 @@ if( x264_pthread_mutex_init( &win32_cond->mtx_broadcast, NULL ) ) return -1; - win32_cond->waiters_done = CreateEvent( NULL, FALSE, FALSE, NULL ); + win32_cond->waiters_done = CreateEventW( NULL, FALSE, FALSE, NULL ); if( !win32_cond->waiters_done ) return -1;
View file
x264-snapshot-20141218-2245.tar.bz2/common/win32thread.h -> x264-snapshot-20150804-2245.tar.bz2/common/win32thread.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.h: windows threading ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/bitstream-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/bitstream-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* bitstream-a.asm: x86 bitstream functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Fiona Glaser <fiona@x264.com> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cabac-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cabac-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* cabac-a.asm: x86 cabac ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/const-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/const-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/cpu-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/cpu-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* cpu-a.asm: x86 cpu utilities ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Laurent Aimar <fenrir@via.ecp.fr> ;* Loren Merritt <lorenm@u.washington.edu> @@ -145,53 +145,3 @@ cglobal cpu_sfence sfence ret - -cextern intel_cpu_indicator_init - -;----------------------------------------------------------------------------- -; void safe_intel_cpu_indicator_init( void ); -;----------------------------------------------------------------------------- -cglobal safe_intel_cpu_indicator_init - push r0 - push r1 - push r2 - push r3 - push r4 - push r5 - push r6 -%if ARCH_X86_64 - push r7 - push r8 - push r9 - push r10 - push r11 - push r12 - push r13 - push r14 -%endif - push rbp - mov rbp, rsp -%if WIN64 - sub rsp, 32 ; shadow space -%endif - and rsp, ~31 - call intel_cpu_indicator_init - leave -%if ARCH_X86_64 - pop r14 - pop r13 - pop r12 - pop r11 - pop r10 - pop r9 - pop r8 - pop r7 -%endif - pop r6 - pop r5 - pop r4 - pop r3 - pop r2 - pop r1 - pop r0 - ret
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-32.asm: x86_32 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-a.asm: x86 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/dct.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/dct.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: x86 transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/deblock-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/deblock-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* deblock-a.asm: x86 deblocking ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-a2.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-a2.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com> @@ -40,6 +40,7 @@ deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 %if HIGH_BIT_DEPTH +copy_swap_shuf: times 2 db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 v210_mask: times 4 dq 0xc00ffc003ff003ff v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 @@ -50,6 +51,7 @@ deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else +copy_swap_shuf: times 2 db 1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14 deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 @@ -913,64 +915,90 @@ %undef sfence %endif ; !HIGH_BIT_DEPTH +%macro PREFETCHNT_ITER 2 ; src, bytes/iteration + %assign %%i 4*(%2) ; prefetch 4 iterations ahead. is this optimal? 
+ %rep (%2+63) / 64 ; assume 64 byte cache lines + prefetchnta [%1+%%i] + %assign %%i %%i + 64 + %endrep +%endmacro + ;----------------------------------------------------------------------------- -; void plane_copy_core( pixel *dst, intptr_t i_dst, -; pixel *src, intptr_t i_src, int w, int h ) +; void plane_copy(_swap)_core( pixel *dst, intptr_t i_dst, +; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- -; assumes i_dst and w are multiples of 16, and i_dst>w -INIT_MMX -cglobal plane_copy_core_mmx2, 6,7 - FIX_STRIDES r1, r3, r4d -%if HIGH_BIT_DEPTH == 0 +; assumes i_dst and w are multiples of mmsize, and i_dst>w +%macro PLANE_COPY_CORE 1 ; swap +%if %1 +cglobal plane_copy_swap_core, 6,7 + mova m4, [copy_swap_shuf] +%else +cglobal plane_copy_core, 6,7 +%endif + FIX_STRIDES r1, r3 +%if %1 && HIGH_BIT_DEPTH + shl r4d, 2 +%elif %1 || HIGH_BIT_DEPTH + add r4d, r4d +%else movsxdifnidn r4, r4d %endif - sub r1, r4 - sub r3, r4 + add r0, r4 + add r2, r4 + neg r4 .loopy: - lea r6d, [r4-63] + lea r6, [r4+4*mmsize] +%if %1 + test r6d, r6d + jg .skip +%endif .loopx: - prefetchnta [r2+256] - movq m0, [r2 ] - movq m1, [r2+ 8] - movntq [r0 ], m0 - movntq [r0+ 8], m1 - movq m2, [r2+16] - movq m3, [r2+24] - movntq [r0+16], m2 - movntq [r0+24], m3 - movq m4, [r2+32] - movq m5, [r2+40] - movntq [r0+32], m4 - movntq [r0+40], m5 - movq m6, [r2+48] - movq m7, [r2+56] - movntq [r0+48], m6 - movntq [r0+56], m7 - add r2, 64 - add r0, 64 - sub r6d, 64 - jg .loopx - prefetchnta [r2+256] - add r6d, 63 - jle .end16 -.loop16: - movq m0, [r2 ] - movq m1, [r2+8] - movntq [r0 ], m0 - movntq [r0+8], m1 - add r2, 16 - add r0, 16 - sub r6d, 16 - jg .loop16 -.end16: + PREFETCHNT_ITER r2+r6, 4*mmsize + movu m0, [r2+r6-4*mmsize] + movu m1, [r2+r6-3*mmsize] + movu m2, [r2+r6-2*mmsize] + movu m3, [r2+r6-1*mmsize] +%if %1 + pshufb m0, m4 + pshufb m1, m4 + pshufb m2, m4 + pshufb m3, m4 +%endif + movnta [r0+r6-4*mmsize], m0 + movnta 
[r0+r6-3*mmsize], m1 + movnta [r0+r6-2*mmsize], m2 + movnta [r0+r6-1*mmsize], m3 + add r6, 4*mmsize + jle .loopx +.skip: + PREFETCHNT_ITER r2+r6, 4*mmsize + sub r6, 4*mmsize + jz .end +.loop_end: + movu m0, [r2+r6] +%if %1 + pshufb m0, m4 +%endif + movnta [r0+r6], m0 + add r6, mmsize + jl .loop_end +.end: add r0, r1 add r2, r3 - dec r5d + dec r5d jg .loopy sfence - emms RET +%endmacro +INIT_XMM sse +PLANE_COPY_CORE 0 +INIT_XMM ssse3 +PLANE_COPY_CORE 1 +INIT_YMM avx +PLANE_COPY_CORE 0 +INIT_YMM avx2 +PLANE_COPY_CORE 1 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint %if HIGH_BIT_DEPTH @@ -2136,7 +2164,7 @@ INIT_YMM avx MBTREE_AVX 8 -INIT_YMM avx2,fma3 +INIT_YMM avx2 MBTREE_AVX 7 %macro MBTREE_PROPAGATE_LIST 0
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -90,8 +90,12 @@ void x264_prefetch_fenc_420_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_fenc_422_mmx2( pixel *, intptr_t, pixel *, intptr_t, int ); void x264_prefetch_ref_mmx2( pixel *, intptr_t, int ); -void x264_plane_copy_core_mmx2( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_sse( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_core_avx( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_ssse3( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_core_avx2 ( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); +void x264_plane_copy_swap_c( pixel *, intptr_t, pixel *, intptr_t, int w, int h ); void x264_plane_copy_interleave_core_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -167,8 +171,8 @@ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); 
#define MC_CHROMA(cpu)\ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\ @@ -363,9 +367,6 @@ } #endif // !HIGH_BIT_DEPTH -static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1}; -static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2}; - #define MC_LUMA(name,instr1,instr2)\ static void mc_luma_##name( pixel *dst, intptr_t i_dst_stride,\ pixel *src[4], intptr_t i_src_stride,\ @@ -374,10 +375,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ x264_pixel_avg_wtab_##instr1[i_width>>2](\ dst, i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -412,10 +413,10 @@ {\ int qpel_idx = ((mvy&3)<<2) + (mvx&3);\ int offset = (mvy>>2)*i_src_stride + (mvx>>2);\ - pixel *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;\ if( qpel_idx & 5 ) /* qpel interpolation needed */\ {\ - pixel *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);\ x264_pixel_avg_wtab_##name[i_width>>2](\ dst, *i_dst_stride, src1, i_src_stride,\ src2, i_height );\ @@ -492,39 +493,94 @@ #endif #endif // HIGH_BIT_DEPTH -static void x264_plane_copy_mmx2( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h ) -{ - int c_w = 16/sizeof(pixel) - 1; - if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold. 
- x264_plane_copy_c( dst, i_dst, src, i_src, w, h ); - } else if( !(w&c_w) ) { - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, w, h ); - } else if( i_src > 0 ) { - // have to use plain memcpy on the last line (in memory order) to avoid overreading src - x264_plane_copy_core_mmx2( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 ); - memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) ); - } else { - memcpy( dst, src, w*sizeof(pixel) ); - x264_plane_copy_core_mmx2( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 ); - } +#define PLANE_COPY(align, cpu)\ +static void x264_plane_copy_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align) / sizeof(pixel) - 1;\ + if( w < 256 ) /* tiny resolutions don't want non-temporal hints. dunno the exact threshold. */\ + x264_plane_copy_c( dst, i_dst, src, i_src, w, h );\ + else if( !(w&c_w) )\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + /* use plain memcpy on the last line (in memory order) to avoid overreading src. 
*/\ + memcpy( dst, src, w*sizeof(pixel) );\ + }\ +} + +PLANE_COPY(16, sse) +PLANE_COPY(32, avx) + +#define PLANE_COPY_SWAP(align, cpu)\ +static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_swap_core_##cpu( dst, i_dst, src, i_src, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_swap_core_##cpu( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h );\ + }\ + x264_plane_copy_swap_core_##cpu( dst, 0, src, 0, w&~c_w, 1 );\ + for( int x = 2*(w&~c_w); x < 2*w; x += 2 )\ + {\ + dst[x] = src[x+1];\ + dst[x+1] = src[x];\ + }\ + }\ + else\ + x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } +PLANE_COPY_SWAP(16, ssse3) +PLANE_COPY_SWAP(32, avx2) + #define PLANE_INTERLEAVE(cpu) \ static void x264_plane_copy_interleave_##cpu( pixel *dst, intptr_t i_dst,\ pixel *srcu, intptr_t i_srcu,\ pixel *srcv, intptr_t i_srcv, int w, int h )\ {\ - if( !(w&15) ) {\ + int c_w = 16 / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( w < 16 || (i_srcu ^ i_srcv) ) {\ - x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ - } else if( i_srcu > 0 ) {\ - x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\ - x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\ - } else {\ + else if( w > c_w && (i_srcu ^ i_srcv) >= 0 ) /* only works correctly for strides with identical signs */\ + {\ + if( --h > 0 )\ + {\ + if( i_srcu > 0 )\ + {\ + x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+c_w)&~c_w, h );\ + dst += i_dst * h;\ + srcu += 
i_srcu * h;\ + srcv += i_srcv * h;\ + }\ + else\ + x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+c_w)&~c_w, h );\ + }\ x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\ - x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\ }\ + else\ + x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\ } PLANE_INTERLEAVE(mmx2) @@ -666,7 +722,6 @@ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_mmx2; pf->prefetch_ref = x264_prefetch_ref_mmx2; - pf->plane_copy = x264_plane_copy_mmx2; pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2; pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2; @@ -695,6 +750,7 @@ { pf->memcpy_aligned = x264_memcpy_aligned_sse; pf->memzero_aligned = x264_memzero_aligned_sse;
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/mc.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/mc.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-32.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-32.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/pixel.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/pixel.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: x86 pixel metrics ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Holger Lubitz <holger@lubitz.org>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict-c.c -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict-c.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict-c.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/predict.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/predict.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: x86 intra prediction ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* quant-a.asm: x86 quantization and level-run ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/quant.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/quant.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: x86 quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad-a.asm: x86 sad functions ;***************************************************************************** -;* Copyright (C) 2003-2014 x264 project +;* Copyright (C) 2003-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Fiona Glaser <fiona@x264.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/sad16-a.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/sad16-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad16-a.asm: x86 high depth sad functions ;***************************************************************************** -;* Copyright (C) 2010-2014 x264 project +;* Copyright (C) 2010-2015 x264 project ;* ;* Authors: Oskar Arvidsson <oskar@irock.se> ;* Henrik Gramner <henrik@gramner.com>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/trellis-64.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/trellis-64.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* trellis-64.asm: x86_64 trellis quantization ;***************************************************************************** -;* Copyright (C) 2012-2014 x264 project +;* Copyright (C) 2012-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;*
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/util.h -> x264-snapshot-20150804-2245.tar.bz2/common/x86/util.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * util.h: x86 inline asm ***************************************************************************** - * Copyright (C) 2008-2014 x264 project + * Copyright (C) 2008-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86inc.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86inc.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2014 x264 project +;* Copyright (C) 2005-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Anton Mitrofanov <BugMaster@narod.ru> @@ -64,6 +64,15 @@ %endif %endif +%define FORMAT_ELF 0 +%ifidn __OUTPUT_FORMAT__,elf + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf32 + %define FORMAT_ELF 1 +%elifidn __OUTPUT_FORMAT__,elf64 + %define FORMAT_ELF 1 +%endif + %ifdef PREFIX %define mangle(x) _ %+ x %else @@ -74,10 +83,6 @@ SECTION .rodata align=%1 %endmacro -%macro SECTION_TEXT 0-1 16 - SECTION .text align=%1 -%endmacro - %if WIN64 %define PIC %elif ARCH_X86_64 == 0 @@ -90,6 +95,10 @@ default rel %endif +%ifdef __NASM_VER__ + %use smartalign +%endif + ; Macros to eliminate most code duplication between x86_32 and x86_64: ; Currently this works only for leaf functions which load all their arguments ; into registers at the start, and make no other use of the stack. Luckily that @@ -675,7 +684,7 @@ CAT_XDEFINE cglobaled_, %2, 1 %endif %xdefine current_function %2 - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %2:function %%VISIBILITY %else global %2 @@ -701,14 +710,16 @@ ; like cextern, but without the prefix %macro cextern_naked 1 - %xdefine %1 mangle(%1) + %ifdef PREFIX + %xdefine %1 mangle(%1) + %endif CAT_XDEFINE cglobaled_, %1, 1 extern %1 %endmacro %macro const 1-2+ %xdefine %1 mangle(private_prefix %+ _ %+ %1) - %ifidn __OUTPUT_FORMAT__,elf + %if FORMAT_ELF global %1:data hidden %else global %1 @@ -716,10 +727,9 @@ %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is -; executable by default. 
-%ifidn __OUTPUT_FORMAT__,elf -SECTION .note.GNU-stack noalloc noexec nowrite progbits +; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. +%if FORMAT_ELF + [SECTION .note.GNU-stack noalloc noexec nowrite progbits] %endif ; cpuflags @@ -738,8 +748,8 @@ %assign cpuflags_avx (1<<11)| cpuflags_sse42 %assign cpuflags_xop (1<<12)| cpuflags_avx %assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_avx2 (1<<14)| cpuflags_avx -%assign cpuflags_fma3 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<14)| cpuflags_avx +%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 %assign cpuflags_cache32 (1<<16) %assign cpuflags_cache64 (1<<17) @@ -789,9 +799,17 @@ %endif %if ARCH_X86_64 || cpuflag(sse2) - CPU amdnop + %ifdef __NASM_VER__ + ALIGNMODE k8 + %else + CPU amdnop + %endif %else - CPU basicnop + %ifdef __NASM_VER__ + ALIGNMODE nop + %else + CPU basicnop + %endif %endif %endmacro @@ -868,7 +886,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, ymm %+ %%i - CAT_XDEFINE nymm, %%i, %%i + CAT_XDEFINE nnymm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 @@ -1070,6 +1088,8 @@ %ifdef cpuname %if notcpuflag(%2) %error use of ``%1'' %2 instruction in cpuname function: current_function + %elif cpuflags_%2 < cpuflags_sse && notcpuflag(sse2) && __sizeofreg > 8 + %error use of ``%1'' sse2 instruction in cpuname function: current_function %endif %endif %endif @@ -1206,7 +1226,7 @@ AVX_INSTR minss, sse, 1, 0, 1 AVX_INSTR movapd, sse2 AVX_INSTR movaps, sse -AVX_INSTR movd +AVX_INSTR movd, mmx AVX_INSTR movddup, sse3 AVX_INSTR movdqa, sse2 AVX_INSTR movdqu, sse2 @@ -1222,7 +1242,7 @@ AVX_INSTR movntdqa, sse4 AVX_INSTR movntpd, sse2 AVX_INSTR movntps, sse -AVX_INSTR movq +AVX_INSTR movq, mmx AVX_INSTR movsd, sse2, 1, 0, 0 AVX_INSTR movshdup, sse3 AVX_INSTR movsldup, sse3 @@ -1468,13 +1488,15 @@ FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss -; workaround: vpbroadcastq is 
broken in x86_32 due to a yasm bug -%if ARCH_X86_64 == 0 -%macro vpbroadcastq 2 -%if sizeof%1 == 16 - movddup %1, %2 -%else - vbroadcastsd %1, %2 -%endif -%endmacro +; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) +%ifdef __YASM_VER__ + %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 + %macro vpbroadcastq 2 + %if sizeof%1 == 16 + movddup %1, %2 + %else + vbroadcastsd %1, %2 + %endif + %endmacro + %endif %endif
View file
x264-snapshot-20141218-2245.tar.bz2/common/x86/x86util.asm -> x264-snapshot-20150804-2245.tar.bz2/common/x86/x86util.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Holger Lubitz <holger@lubitz.org> ;* Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/config.guess -> x264-snapshot-20150804-2245.tar.bz2/config.guess
Changed
@@ -979,6 +979,9 @@ ppc64:Linux:*:*) echo powerpc64-unknown-linux-gnu exit ;; + ppc64le:Linux:*:*) + echo powerpc64le-unknown-linux-gnu + exit ;; ppc:Linux:*:*) echo powerpc-unknown-linux-gnu exit ;;
View file
x264-snapshot-20141218-2245.tar.bz2/configure -> x264-snapshot-20150804-2245.tar.bz2/configure
Changed
@@ -77,7 +77,7 @@ # several non gcc compilers issue an incredibly large number of warnings on any warning level, # suppress them by disabling all warnings rather than having to use #pragmas to disable most of them for arg in $*; do - [ $arg = -ffast-math ] && arg= + [ "$arg" = -ffast-math ] && arg= [[ "$arg" = -falign-loops* ]] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Wshadow ] && arg= @@ -105,10 +105,10 @@ cl_ldflags() { for arg in $*; do arg=${arg/LIBPATH/libpath} - [ ${arg#-libpath:} == $arg -a ${arg#-l} != $arg ] && arg=${arg#-l}.lib - [ ${arg#-L} != $arg ] && arg=-libpath:${arg#-L} - [ $arg = -Wl,--large-address-aware ] && arg=-largeaddressaware - [ $arg = -s ] && arg= + [ "${arg#-libpath:}" == "$arg" -a "${arg#-l}" != "$arg" ] && arg=${arg#-l}.lib + [ "${arg#-L}" != "$arg" ] && arg=-libpath:${arg#-L} + [ "$arg" = -Wl,--large-address-aware ] && arg=-largeaddressaware + [ "$arg" = -s ] && arg= [ "$arg" = -Wl,-Bsymbolic ] && arg= [ "$arg" = -fno-tree-vectorize ] && arg= [ "$arg" = -Werror ] && arg= @@ -119,6 +119,7 @@ arg=${arg/pthreadGC/pthreadVC} [ "$arg" = avifil32.lib ] && arg=vfw32.lib [ "$arg" = gpac_static.lib ] && arg=libgpac_static.lib + [ "$arg" = x264.lib ] && arg=libx264.lib [ -n "$arg" ] && echo -n "$arg " done @@ -143,7 +144,9 @@ log_check "for $3 in $1"; fi rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo "int main (void) { $3 return 0; }" >> conftest.c if [ $compiler_style = MS ]; then cc_cmd="$CC conftest.c $(cc_cflags $CFLAGS $CHECK_CFLAGS $2) -link $(cl_ldflags $2 $LDFLAGSCLI $LDFLAGS)" @@ -172,7 +175,9 @@ cpp_check() { log_check "whether $3 is true" rm -f conftest.c - [ -n "$1" ] && echo "#include <$1>" > conftest.c + for arg in $1; do + echo "#include <$arg>" >> conftest.c + done echo -e "#if !($3) \n#error $4 \n#endif " >> conftest.c if [ $compiler_style = MS ]; then cpp_cmd="$CC conftest.c $(cc_cflags $CFLAGS $2) -P" @@ -256,6 
+261,48 @@ exit 1 } +configure_system_override() { + log_check "system libx264 configuration" + x264_config_path="$1/x264_config.h" + if [ -e "$x264_config_path" ]; then + res=$? + log_ok + arg="$(grep '#define X264_GPL ' $x264_config_path | sed -e 's/#define X264_GPL *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + [ "$arg" != "$gpl" ] && die "Incompatible license with system libx264" + fi + arg="$(grep '#define X264_BIT_DEPTH ' $x264_config_path | sed -e 's/#define X264_BIT_DEPTH *//; s/ *$//')" + if [ -n "$arg" ]; then + if [ "$arg" != "$bit_depth" ]; then + echo "Override output bit depth with system libx264 configuration" + bit_depth="$arg" + fi + fi + arg="$(grep '#define X264_CHROMA_FORMAT ' $x264_config_path | sed -e 's/#define X264_CHROMA_FORMAT *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="all" || arg="${arg#X264_CSP_I}" + if [ "$arg" != "$chroma_format" ]; then + echo "Override output chroma format with system libx264 configuration" + chroma_format="$arg" + fi + fi + arg="$(grep '#define X264_INTERLACED ' $x264_config_path | sed -e 's/#define X264_INTERLACED *//; s/ *$//')" + if [ -n "$arg" ]; then + [ "$arg" = 0 ] && arg="no" || arg="yes" + if [ "$arg" != "$interlaced" ]; then + echo "Override interlaced encoding support with system libx264 configuration" + interlaced="$arg" + fi + fi + else + res=$? 
+ log_fail + log_msg "Failed search path was: $x264_config_path" + fi + return $res +} + rm -f x264_config.h config.h config.mak config.log x264.pc x264.def conftest* SRCPATH="$(cd $(dirname $0); pwd)" @@ -311,7 +358,8 @@ # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ - LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC" + LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ + MSA" # parse options @@ -458,6 +506,8 @@ host_vendor="${host%%-*}" host_os="${host#*-}" +trap 'rm -f conftest*' EXIT + # test for use of compilers that require specific handling cc_base=`basename "$CC"` QPRE="-" @@ -600,9 +650,9 @@ case $host_cpu in i*86) ARCH="X86" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" - ASFLAGS="$ASFLAGS -O2 -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" + ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then if [[ "$asm" == auto && "$CFLAGS" != *-march* ]]; then CFLAGS="$CFLAGS -march=i686" @@ -629,39 +679,39 @@ stack_alignment=4 fi if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho -DPREFIX" + ASFLAGS="$ASFLAGS -f macho32 -DPREFIX" elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then ASFLAGS="$ASFLAGS -f win32 -DPREFIX" LDFLAGS="$LDFLAGS -Wl,--large-address-aware" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-i386 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf" + ASFLAGS="$ASFLAGS -f elf32" fi ;; x86_64) ARCH="X86_64" - AS="yasm" + AS="${AS-yasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" [ $compiler = GNU ] && CFLAGS="-m64 $CFLAGS" && LDFLAGS="-m64 $LDFLAGS" if [ "$SYS" = MACOSX ]; then - ASFLAGS="$ASFLAGS -f macho64 -m amd64 -DPIC -DPREFIX" + ASFLAGS="$ASFLAGS -f macho64 -DPIC -DPREFIX" if cc_check '' 
"-arch x86_64"; then CFLAGS="$CFLAGS -arch x86_64" LDFLAGS="$LDFLAGS -arch x86_64" fi elif [ "$SYS" = WINDOWS -o "$SYS" = CYGWIN ]; then - ASFLAGS="$ASFLAGS -f win32 -m amd64" + ASFLAGS="$ASFLAGS -f win64" # only the GNU toolchain is inconsistent in prefixing function names with _ [ $compiler = GNU ] && cc_check "" "-S" && grep -q "_main:" conftest && ASFLAGS="$ASFLAGS -DPREFIX" [ $compiler = GNU ] && LDFLAGS="$LDFLAGS -Wl,--nxcompat -Wl,--dynamicbase" [ $compiler = GNU ] && RCFLAGS="--target=pe-x86-64 $RCFLAGS" else - ASFLAGS="$ASFLAGS -f elf -m amd64" + ASFLAGS="$ASFLAGS -f elf64" fi ;; - powerpc|powerpc64) + powerpc*) ARCH="PPC" if [ $asm = auto ] ; then define HAVE_ALTIVEC @@ -678,13 +728,15 @@ sparc) ARCH="SPARC" ;; - mips|mipsel|mips64|mips64el) + mips*) ARCH="MIPS" + AS="${AS-${CC}}" + AS_EXT=".c" ;; arm*) ARCH="ARM" if [ "$SYS" = MACOSX ] ; then - AS="${AS-extras/gas-preprocessor.pl $CC}" + AS="${AS-${SRCPATH}/tools/gas-preprocessor.pl -arch arm -- ${CC}}" ASFLAGS="$ASFLAGS -DPREFIX -DPIC" # apple's ld doesn't support movw/movt relocations at all # build for armv7 by default if ! echo $CFLAGS | grep -Eq '\-arch' ; then @@ -698,7 +750,7 @@
View file
x264-snapshot-20141218-2245.tar.bz2/doc/vui.txt -> x264-snapshot-20150804-2245.tar.bz2/doc/vui.txt
Changed
@@ -16,14 +16,14 @@ * How do I use it? You can derive the SAR of an image from the width, height and the display aspect ratio (DAR) of the image as follows: - + SAR_x DAR_x * height ----- = -------------- SAR_y DAR_y * width - + for example: width x height = 704x576, DAR = 4:3 ==> SAR = 2304:2112 or 12:11 - + Please note that if your material is a digitized analog signal, you should not use this equation to calculate the SAR. Refer to the manual of your digitizing equipment or this link instead. @@ -36,7 +36,7 @@ correction of aspect ratios, and there are just few exceptions. You should even use it, if the SAR of your material is 1:1, as the default of x264 is "SAR not defined". - + 2. Overscan ------------ @@ -49,7 +49,7 @@ analog signal. Instead it refers to the "overscan" process on a display that shows only a part of the image. What that part is depends on the display. - + * How do I use this option? As I'm not sure about what part of the image is shown when the display uses an overscan process, I can't provide you with rules or examples. The safe @@ -72,7 +72,7 @@ * What is it? A purely informative setting, that explains what the type of your analog video was, before you digitized it. - + * How do I use this option? Just set it to the desired value. ( e.g. NTSC, PAL ) If you transcode from MPEG2, you may find the value for this option in the @@ -101,11 +101,11 @@ or want to make sure that your material is played back without oversaturation, set if to on. Please note that the default for this option in x264 is off, which is not a safe assumption. - + * Should I use this option? Yes, but there are few decoders/ media players that distinguish between the two options. - + 5. 
Color Primaries, Transfer Characteristics, Matrix Coefficients ------------------------------------------------------------------- @@ -120,7 +120,7 @@ profile of the digitizing equipment is known, it is possible to correct the colors and gamma of the decoded h264 stream in a way that the video stream looks the same, regardless of the digitizing equipment used. - + * How do I use these options? If you are able to find out which characteristics your digitizing equipment uses, (see the equipment documentation or make reference measurements) @@ -170,9 +170,8 @@ chroma sample location in that direction is equal to one of the luma samples. H264 Annex E contains images that tell you how to "transform" your Chroma Sample Location into a value of 0 to 5 that you can pass to x264. - + * Should I use this option? Unless you are a perfectionist, don't bother. Media players ignore this setting, and favor their own (fixed) assumed Chroma Sample Location. -
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.c: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/analyse.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/analyse.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * analyse.h: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cabac.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cabac.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: cabac bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/cavlc.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/cavlc.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cavlc.c: cavlc bitstream writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/encoder.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/encoder.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * encoder.c: top-level encoder functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -32,6 +32,9 @@ #include "ratecontrol.h" #include "macroblock.h" #include "me.h" +#if HAVE_INTEL_DISPATCHER +#include "extras/intel_dispatcher.h" +#endif //#define DEBUG_MB_TYPE @@ -471,12 +474,12 @@ int i_csp = h->param.i_csp & X264_CSP_MASK; #if X264_CHROMA_FORMAT - if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp <= X264_CSP_NV12 ) + if( CHROMA_FORMAT != CHROMA_420 && i_csp >= X264_CSP_I420 && i_csp < X264_CSP_I422 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" ); return -1; } - else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 ) + else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp < X264_CSP_I444 ) { x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" ); return -1; @@ -489,36 +492,41 @@ #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } - if( i_csp < X264_CSP_I444 && h->param.i_width % 2 ) + int w_mod = i_csp < X264_CSP_I444 ? 2 : 1; + int h_mod = (i_csp < X264_CSP_I422 ? 
2 : 1) << PARAM_INTERLACED; + if( h->param.i_width % w_mod ) { - x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "width not divisible by %d (%dx%d)\n", + w_mod, h->param.i_width, h->param.i_height ); return -1; } - - if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 ) + if( h->param.i_height % h_mod ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "height not divisible by %d (%dx%d)\n", + h_mod, h->param.i_width, h->param.i_height ); return -1; } - if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 ) + if( h->param.crop_rect.i_left >= h->param.i_width || + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top >= h->param.i_height || + h->param.crop_rect.i_bottom >= h->param.i_height || + h->param.crop_rect.i_left + h->param.crop_rect.i_right >= h->param.i_width || + h->param.crop_rect.i_top + h->param.crop_rect.i_bottom >= h->param.i_height ) { - x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n", - h->param.i_width, h->param.i_height ); + x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom ); return -1; } - - if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width || - (h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height ) + if( h->param.crop_rect.i_left % w_mod || h->param.crop_rect.i_right % w_mod || + h->param.crop_rect.i_top % h_mod || h->param.crop_rect.i_bottom % h_mod ) { - x264_log( h, X264_LOG_ERROR, "invalid crop-rect %u,%u,%u,%u\n", h->param.crop_rect.i_left, - h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom ); + x264_log( h, X264_LOG_ERROR, "crop-rect %u,%u,%u,%u not divisible by 
%dx%d\n", h->param.crop_rect.i_left, + h->param.crop_rect.i_top, h->param.crop_rect.i_right, h->param.crop_rect.i_bottom, w_mod, h_mod ); return -1; } @@ -529,7 +537,13 @@ } if( h->param.i_threads == X264_THREADS_AUTO ) + { h->param.i_threads = x264_cpu_num_processors() * (h->param.b_sliced_threads?2:3)/2; + /* Avoid too many threads as they don't improve performance and + * complicate VBV. Capped at an arbitrary 2 rows per thread. */ + int max_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 2 ); + h->param.i_threads = X264_MIN( h->param.i_threads, max_threads ); + } int max_sliced_threads = X264_MAX( 1, (h->param.i_height+15)/16 / 4 ); if( h->param.i_threads > 1 ) { @@ -583,7 +597,20 @@ h->param.i_dpb_size = 1; } - h->param.i_frame_packing = x264_clip3( h->param.i_frame_packing, -1, 5 ); + if( h->param.i_frame_packing < -1 || h->param.i_frame_packing > 7 ) + { + x264_log( h, X264_LOG_WARNING, "ignoring unknown frame packing value\n" ); + h->param.i_frame_packing = -1; + } + if( h->param.i_frame_packing == 7 && + ((h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right) % 3 || + (h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom) % 3) ) + { + x264_log( h, X264_LOG_ERROR, "cropped resolution %dx%d not compatible with tile format frame packing\n", + h->param.i_width - h->param.crop_rect.i_left - h->param.crop_rect.i_right, + h->param.i_height - h->param.crop_rect.i_top - h->param.crop_rect.i_bottom ); + return -1; + } /* Detect default ffmpeg settings and terminate with an error. 
*/ if( b_open ) @@ -1050,7 +1077,7 @@ h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); - h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 ); + h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 3 ); h->param.rc.f_aq_strength = x264_clip3f( h->param.rc.f_aq_strength, 0, 3 ); if( h->param.rc.f_aq_strength == 0 ) h->param.rc.i_aq_mode = 0; @@ -1390,6 +1417,10 @@ if( param->param_free ) param->param_free( param ); +#if HAVE_INTEL_DISPATCHER + x264_intel_dispatcher_override(); +#endif + if( x264_threading_init() ) { x264_log( h, X264_LOG_ERROR, "unable to initialize threading\n" ); @@ -1676,6 +1707,7 @@ else if( !x264_is_regular_file( f ) ) { x264_log( h, X264_LOG_ERROR, "dump_yuv: incompatible with non-regular file %s\n", h->param.psz_dump_yuv ); + fclose( f ); goto fail; } fclose( f ); @@ -3213,6 +3245,12 @@ /* ------------------- Setup new frame from picture -------------------- */ if( pic_in != NULL ) { + if( h->lookahead->b_exit_thread ) + { + x264_log( h, X264_LOG_ERROR, "lookahead thread is already stopped\n" ); + return -1; + } + /* 1: Copy the picture to a frame and move it to a buffer */ x264_frame_t *fenc = x264_frame_pop_unused( h, 0 ); if( !fenc ) @@ -4087,14 +4125,14 @@ if( h->stat.i_frame_count[SLICE_TYPE_I] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_I]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_I] * h->mb.i_mb_count / 100.0; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, "mb I %s\n", buf ); } if( h->stat.i_frame_count[SLICE_TYPE_P] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_P]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_P] * h->mb.i_mb_count / 100.0; int64_t *i_mb_size = 
i_mb_count_size[SLICE_TYPE_P]; x264_print_intra( i_mb_count, i_count, b_print_pcm, buf ); x264_log( h, X264_LOG_INFO, @@ -4110,7 +4148,7 @@ if( h->stat.i_frame_count[SLICE_TYPE_B] > 0 ) { int64_t *i_mb_count = h->stat.i_mb_count[SLICE_TYPE_B]; - double i_count = h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; + double i_count = (double)h->stat.i_frame_count[SLICE_TYPE_B] * h->mb.i_mb_count / 100.0; double i_mb_list_count; int64_t *i_mb_size = i_mb_count_size[SLICE_TYPE_B]; int64_t list_count[3] = {0}; /* 0 == L0, 1 == L1, 2 == BI */
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/lookahead.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/lookahead.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * lookahead.c: high-level lookahead functions ***************************************************************************** - * Copyright (C) 2010-2014 Avail Media and x264 project + * Copyright (C) 2010-2015 Avail Media and x264 project * * Authors: Michael Kazmier <mkazmier@availmedia.com> * Alex Giladi <agiladi@availmedia.com>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/macroblock.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/macroblock.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.c: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/me.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/me.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * me.h: motion estimation ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.c: ratecontrol ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Michael Niedermayer <michaelni@gmx.at> @@ -96,6 +96,7 @@ /* VBV stuff */ double buffer_size; int64_t buffer_fill_final; + int64_t buffer_fill_final_min; double buffer_fill; /* planned buffer, if all in-progress frames hit their bit budget */ double buffer_rate; /* # of bits added to buffer_fill after each frame */ double vbv_max_rate; /* # of bits added to buffer_fill per second */ @@ -301,10 +302,6 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_offsets ) { - /* constants chosen to result in approximately the same overall bitrate as without AQ. - * FIXME: while they're written in 5 significant digits, they're only tuned to 2. */ - float strength; - float avg_adj = 0.f; /* Initialize frame stats */ for( int i = 0; i < 3; i++ ) { @@ -348,23 +345,30 @@ /* Actual adaptive quantization */ else { - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + /* constants chosen to result in approximately the same overall bitrate as without AQ. + * FIXME: while they're written in 5 significant digits, they're only tuned to 2. 
*/ + float strength; + float avg_adj = 0.f; + float bias_strength = 0.f; + + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE || h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) { - float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f); + float bit_depth_correction = 1.f / (1 << (2*(BIT_DEPTH-8))); float avg_adj_pow2 = 0.f; for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ ) for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ ) { uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame ); - float qp_adj = powf( energy + 1, 0.125f ); + float qp_adj = powf( energy * bit_depth_correction + 1, 0.125f ); frame->f_qp_offset[mb_x + mb_y*h->mb.i_mb_stride] = qp_adj; avg_adj += qp_adj; avg_adj_pow2 += qp_adj * qp_adj; } avg_adj /= h->mb.i_mb_count; avg_adj_pow2 /= h->mb.i_mb_count; - strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction; - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj; + strength = h->param.rc.f_aq_strength * avg_adj; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj; + bias_strength = h->param.rc.f_aq_strength; } else strength = h->param.rc.f_aq_strength * 1.0397f; @@ -374,7 +378,12 @@ { float qp_adj; int mb_xy = mb_x + mb_y*h->mb.i_mb_stride; - if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) + if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE_BIASED ) + { + qp_adj = frame->f_qp_offset[mb_xy]; + qp_adj = strength * (qp_adj - avg_adj) + bias_strength * (1.f - 14.f / (qp_adj * qp_adj)); + } + else if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE ) { qp_adj = frame->f_qp_offset[mb_xy]; qp_adj = strength * (qp_adj - avg_adj); @@ -724,7 +733,8 @@ if( h->param.rc.f_vbv_buffer_init > 1. 
) h->param.rc.f_vbv_buffer_init = x264_clip3f( h->param.rc.f_vbv_buffer_init / h->param.rc.i_vbv_buffer_size, 0, 1 ); h->param.rc.f_vbv_buffer_init = x264_clip3f( X264_MAX( h->param.rc.f_vbv_buffer_init, rc->buffer_rate / rc->buffer_size ), 0, 1); - rc->buffer_fill_final = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; + rc->buffer_fill_final = + rc->buffer_fill_final_min = rc->buffer_size * h->param.rc.f_vbv_buffer_init * h->sps->vui.i_time_scale; rc->b_vbv = 1; rc->b_vbv_min_rate = !rc->b_2pass && h->param.rc.i_rc_method == X264_RC_ABR @@ -776,11 +786,11 @@ if( h->param.i_nal_hrd ) { uint64_t denom = (uint64_t)h->sps->vui.hrd.i_bit_rate_unscaled * h->sps->vui.i_time_scale; - uint64_t num = 180000; + uint64_t num = 90000; x264_reduce_fraction64( &num, &denom ); - rc->hrd_multiply_denom = 180000 / num; + rc->hrd_multiply_denom = 90000 / num; - double bits_required = log2( 180000 / rc->hrd_multiply_denom ) + double bits_required = log2( 90000 / rc->hrd_multiply_denom ) + log2( h->sps->vui.i_time_scale ) + log2( h->sps->vui.hrd.i_cpb_size_unscaled ); if( bits_required >= 63 ) @@ -822,6 +832,7 @@ int num_preds = h->param.b_sliced_threads * h->param.i_threads + 1; CHECKED_MALLOC( rc->pred, 5 * sizeof(predictor_t) * num_preds ); CHECKED_MALLOC( rc->pred_b_from_p, sizeof(predictor_t) ); + static const float pred_coeff_table[3] = { 1.0, 1.0, 1.5 }; for( int i = 0; i < 3; i++ ) { rc->last_qscale_for[i] = qp2qscale( ABR_INIT_QP ); @@ -829,8 +840,8 @@ rc->lmax[i] = qp2qscale( h->param.rc.i_qp_max ); for( int j = 0; j < num_preds; j++ ) { - rc->pred[i+j*5].coeff_min = 2.0 / 4; - rc->pred[i+j*5].coeff = 2.0; + rc->pred[i+j*5].coeff_min = pred_coeff_table[i] / 2; + rc->pred[i+j*5].coeff = pred_coeff_table[i]; rc->pred[i+j*5].count = 1.0; rc->pred[i+j*5].decay = 0.5; rc->pred[i+j*5].offset = 0.0; @@ -844,7 +855,11 @@ rc->row_preds[i][j].offset = 0.0; } } - *rc->pred_b_from_p = rc->pred[0]; + rc->pred_b_from_p->coeff_min = 0.5 / 2; + 
rc->pred_b_from_p->coeff = 0.5; + rc->pred_b_from_p->count = 1.0; + rc->pred_b_from_p->decay = 0.5; + rc->pred_b_from_p->offset = 0.0; if( parse_zones( h ) < 0 ) { @@ -1914,15 +1929,16 @@ h->fenc->hrd_timing.cpb_removal_time = rc->nrt_first_access_unit + (double)(h->fenc->i_cpb_delay - h->i_cpb_delay_pir_offset) * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; - double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; if( h->fenc->b_keyframe ) { - rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; - rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; - rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; + rc->nrt_first_access_unit = h->fenc->hrd_timing.cpb_removal_time; + rc->initial_cpb_removal_delay = h->initial_cpb_removal_delay; + rc->initial_cpb_removal_delay_offset = h->initial_cpb_removal_delay_offset; } - else - cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; + + double cpb_earliest_arrival_time = h->fenc->hrd_timing.cpb_removal_time - (double)rc->initial_cpb_removal_delay / 90000; + if( !h->fenc->b_keyframe ) + cpb_earliest_arrival_time -= (double)rc->initial_cpb_removal_delay_offset / 90000; if( h->sps->vui.hrd.b_cbr_hrd ) h->fenc->hrd_timing.cpb_initial_arrival_time = rc->previous_cpb_final_arrival_time; @@ -2095,7 +2111,7 @@ int bitrate = h->sps->vui.hrd.i_bit_rate_unscaled; x264_ratecontrol_t *rcc = h->rc; x264_ratecontrol_t *rct = h->thread[0]->rc; - uint64_t buffer_size = (uint64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; + int64_t buffer_size = (int64_t)h->sps->vui.hrd.i_cpb_size_unscaled * h->sps->vui.i_time_scale; if( rcc->last_satd >= h->mb.i_mb_count ) update_predictor( &rct->pred[h->sh.i_type], qp2qscale( rcc->qpa_rc ), rcc->last_satd, bits ); @@ -2103,32 +2119,45 @@ if( !rcc->b_vbv ) return filler; - rct->buffer_fill_final -= (uint64_t)bits * 
h->sps->vui.i_time_scale; + uint64_t buffer_diff = (uint64_t)bits * h->sps->vui.i_time_scale; + rct->buffer_fill_final -= buffer_diff; + rct->buffer_fill_final_min -= buffer_diff; - if( rct->buffer_fill_final < 0 ) + if( rct->buffer_fill_final_min < 0 ) { - double underflow = (double)rct->buffer_fill_final / h->sps->vui.i_time_scale; + double underflow = (double)rct->buffer_fill_final_min / h->sps->vui.i_time_scale; if( rcc->rate_factor_max_increment && rcc->qpm >= rcc->qp_novbv + rcc->rate_factor_max_increment ) x264_log( h, X264_LOG_DEBUG, "VBV underflow due to CRF-max (frame %d, %.0f bits)\n", h->i_frame, underflow ); else x264_log( h, X264_LOG_WARNING, "VBV underflow (frame %d, %.0f bits)\n", h->i_frame, underflow ); + rct->buffer_fill_final = + rct->buffer_fill_final_min = 0; } - rct->buffer_fill_final = X264_MAX( rct->buffer_fill_final, 0 ); if( h->param.i_avcintra_class ) - rct->buffer_fill_final += buffer_size; + buffer_diff = buffer_size; else - rct->buffer_fill_final += (uint64_t)bitrate * h->sps->vui.i_num_units_in_tick * h->fenc->i_cpb_duration; - - if( h->param.rc.b_filler && rct->buffer_fill_final > buffer_size ) - { - int64_t scale = (int64_t)h->sps->vui.i_time_scale * 8; - filler = (rct->buffer_fill_final - buffer_size + scale - 1) / scale;
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/ratecontrol.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/ratecontrol.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.h: ratecontrol ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/rdo.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/rdo.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * rdo.c: rate-distortion optimization ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Fiona Glaser <fiona@x264.com> @@ -180,7 +180,7 @@ else { x264_macroblock_size_cavlc( h ); - i_bits = ( h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; + i_bits = ( (uint64_t)h->out.bs.i_bits_encoded * i_lambda2 + 128 ) >> 8; } h->mb.b_transform_8x8 = b_transform_bak; @@ -261,7 +261,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; + i_bits = (uint64_t)x264_partition_size_cavlc( h, i8, i_pixel ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -297,7 +297,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i8x8_size_cavlc( h, i8, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -331,7 +331,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; + i_bits = (uint64_t)x264_partition_i4x4_size_cavlc( h, i4, i_mode ) * i_lambda2; return (i_ssd<<8) + i_bits; } @@ -357,7 +357,7 @@ i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8; } else - i_bits = x264_chroma_size_cavlc( h ) * i_lambda2; + i_bits = (uint64_t)x264_chroma_size_cavlc( h ) * i_lambda2; return (i_ssd<<8) + i_bits; }
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -166,7 +166,7 @@ while( (1 << sps->i_log2_max_frame_num) <= max_frame_num ) sps->i_log2_max_frame_num++; - sps->i_poc_type = param->i_bframe || param->b_interlaced ? 0 : 2; + sps->i_poc_type = param->i_bframe || param->b_interlaced || param->i_avcintra_class ? 0 : 2; if( sps->i_poc_type == 0 ) { int max_delta_poc = (param->i_bframe + 2) * (!!param->i_bframe_pyramid + 1) * 2; @@ -578,7 +578,7 @@ memcpy( payload, uuid, 16 ); sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - " - "Copy%s 2003-2014 - http://www.videolan.org/x264.html - options: %s", + "Copy%s 2003-2015 - http://www.videolan.org/x264.html - options: %s", X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts ); length = strlen(payload)+1; @@ -663,7 +663,7 @@ bs_write1( &q, quincunx_sampling_flag ); // quincunx_sampling_flag // 0: views are unrelated, 1: left view is on the left, 2: left view is on the right - bs_write ( &q, 6, 1 ); // content_interpretation_type + bs_write ( &q, 6, h->param.i_frame_packing != 6 ); // content_interpretation_type bs_write1( &q, 0 ); // spatial_flipping_flag bs_write1( &q, 0 ); // frame0_flipped_flag
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/set.h -> x264-snapshot-20150804-2245.tar.bz2/encoder/set.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * set.h: header writing ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype-cl.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype-cl.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Steve Borho <sborho@multicorewareinc.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/encoder/slicetype.c -> x264-snapshot-20150804-2245.tar.bz2/encoder/slicetype.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * slicetype.c: lookahead analysis ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Fiona Glaser <fiona@x264.com> * Loren Merritt <lorenm@u.washington.edu> @@ -612,7 +612,6 @@ if( b_bidir ) { - int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; ALIGNED_ARRAY_8( int16_t, dmv,[2],[2] ); m[1].i_pixel = PIXEL_8x8; @@ -624,14 +623,20 @@ LOAD_HPELS_LUMA( m[1].p_fref, fref1->lowres ); m[1].p_fref_w = m[1].p_fref[0]; - dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; - dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; - dmv[1][0] = dmv[0][0] - mvr[0]; - dmv[1][1] = dmv[0][1] - mvr[1]; - CLIP_MV( dmv[0] ); - CLIP_MV( dmv[1] ); - if( h->param.analyse.i_subpel_refine <= 1 ) - M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + if( fref1->lowres_mvs[0][p1-p0-1][0][0] != 0x7FFF ) + { + int16_t *mvr = fref1->lowres_mvs[0][p1-p0-1][i_mb_xy]; + dmv[0][0] = ( mvr[0] * dist_scale_factor + 128 ) >> 8; + dmv[0][1] = ( mvr[1] * dist_scale_factor + 128 ) >> 8; + dmv[1][0] = dmv[0][0] - mvr[0]; + dmv[1][1] = dmv[0][1] - mvr[1]; + CLIP_MV( dmv[0] ); + CLIP_MV( dmv[1] ); + if( h->param.analyse.i_subpel_refine <= 1 ) + M64( dmv ) &= ~0x0001000100010001ULL; /* mv & ~1 */ + } + else + M64( dmv ) = 0; TRY_BIDIR( dmv[0], dmv[1], 0 ); if( M64( dmv ) ) @@ -1104,7 +1109,7 @@ if( b_intra ) x264_slicetype_frame_cost( h, a, frames, 0, 0, 0, 0 ); - while( i > 0 && frames[i]->i_type == X264_TYPE_B ) + while( i > 0 && IS_X264_TYPE_B( frames[i]->i_type ) ) i--; last_nonb = i; @@ -1132,7 +1137,7 @@ while( i-- > idx ) { cur_nonb = i; - while( frames[cur_nonb]->i_type == X264_TYPE_B && cur_nonb > 0 ) + while( IS_X264_TYPE_B( frames[cur_nonb]->i_type ) && cur_nonb > 0 ) cur_nonb--; if( cur_nonb < idx ) break; @@ -1226,7 +1231,7 @@ int last_nonb = 0, cur_nonb = 1, idx = 0; 
x264_frame_t *prev_frame = NULL; int prev_frame_idx = 0; - while( cur_nonb < num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb < num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; int next_nonb = keyframe ? last_nonb : cur_nonb; @@ -1278,7 +1283,7 @@ } last_nonb = cur_nonb; cur_nonb++; - while( cur_nonb <= num_frames && frames[cur_nonb]->i_type == X264_TYPE_B ) + while( cur_nonb <= num_frames && IS_X264_TYPE_B( frames[cur_nonb]->i_type ) ) cur_nonb++; } frames[next_nonb]->i_planned_type[idx] = X264_TYPE_AUTO; @@ -1288,36 +1293,39 @@ { int loc = 1; int cost = 0; - int cur_p = 0; + int cur_nonb = 0; path--; /* Since the 1st path element is really the second frame */ while( path[loc] ) { - int next_p = loc; - /* Find the location of the next P-frame. */ - while( path[next_p] != 'P' ) - next_p++; - - /* Add the cost of the P-frame found above */ - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_p, 0 ); + int next_nonb = loc; + /* Find the location of the next non-B-frame. 
*/ + while( path[next_nonb] == 'B' ) + next_nonb++; + + /* Add the cost of the non-B-frame found above */ + if( path[next_nonb] == 'P' ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_nonb, 0 ); + else /* I-frame */ + cost += x264_slicetype_frame_cost( h, a, frames, next_nonb, next_nonb, next_nonb, 0 ); /* Early terminate if the cost we have found is larger than the best path cost so far */ if( cost > threshold ) break; - if( h->param.i_bframe_pyramid && next_p - cur_p > 2 ) + if( h->param.i_bframe_pyramid && next_nonb - cur_nonb > 2 ) { - int middle = cur_p + (next_p - cur_p)/2; - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, middle, 0 ); + int middle = cur_nonb + (next_nonb - cur_nonb)/2; + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, middle, 0 ); for( int next_b = loc; next_b < middle && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, middle, next_b, 0 ); - for( int next_b = middle+1; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, middle, next_p, next_b, 0 ); + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, middle, next_b, 0 ); + for( int next_b = middle+1; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, middle, next_nonb, next_b, 0 ); } else - for( int next_b = loc; next_b < next_p && cost < threshold; next_b++ ) - cost += x264_slicetype_frame_cost( h, a, frames, cur_p, next_p, next_b, 0 ); + for( int next_b = loc; next_b < next_nonb && cost < threshold; next_b++ ) + cost += x264_slicetype_frame_cost( h, a, frames, cur_nonb, next_nonb, next_b, 0 ); - loc = next_p + 1; - cur_p = next_p; + loc = next_nonb + 1; + cur_nonb = next_nonb; } return cost; } @@ -1331,6 +1339,7 @@ char paths[2][X264_LOOKAHEAD_MAX+1]; int num_paths = X264_MIN( h->param.i_bframe+1, length ); int best_cost = COST_MAX; + int best_possible = 0; int idx = 0; /* Iterate 
over all currently possible paths */ @@ -1342,12 +1351,33 @@ memset( paths[idx]+len, 'B', path ); strcpy( paths[idx]+len+path, "P" ); - /* Calculate the actual cost of the current path */ - int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); - if( cost < best_cost ) + int possible = 1; + for( int i = 1; i <= length; i++ ) { - best_cost = cost; - idx ^= 1; + int i_type = frames[i]->i_type; + if( i_type == X264_TYPE_AUTO ) + continue; + if( IS_X264_TYPE_B( i_type ) ) + possible = possible && (i < len || i == length || paths[idx][i-1] == 'B'); + else + { + possible = possible && (i < len || paths[idx][i-1] != 'B'); + paths[idx][i-1] = IS_X264_TYPE_I( i_type ) ? 'I' : 'P'; + } + } + + if( possible || !best_possible ) + { + if( possible && !best_possible ) + best_cost = COST_MAX; + /* Calculate the actual cost of the current path */ + int cost = x264_slicetype_path_cost( h, a, frames, paths[idx], best_cost ); + if( cost < best_cost ) + { + best_cost = cost; + best_possible = possible; + idx ^= 1; + } } } @@ -1441,13 +1471,15 @@ return scenecut_internal( h, a, frames, p0, p1, real_scenecut ); } +#define IS_X264_TYPE_AUTO_OR_I(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_I(x)) +#define IS_X264_TYPE_AUTO_OR_B(x) ((x)==X264_TYPE_AUTO || IS_X264_TYPE_B(x)) + void x264_slicetype_analyse( x264_t *h, int intra_minigop ) { x264_mb_analysis_t a; x264_frame_t *frames[X264_LOOKAHEAD_MAX+3] = { NULL, }; int num_frames, orig_num_frames, keyint_limit, framecnt; int i_mb_count = NUM_MBS;
View file
x264-snapshot-20141218-2245.tar.bz2/example.c -> x264-snapshot-20150804-2245.tar.bz2/example.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * example.c: libx264 API usage example ***************************************************************************** - * Copyright (C) 2014 x264 project + * Copyright (C) 2014-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> * @@ -24,26 +24,14 @@ *****************************************************************************/ #ifdef _WIN32 -/* The following two defines must be located before the inclusion of any system header files. */ -#define WINVER 0x0500 -#define _WIN32_WINNT 0x0500 -#include <windows.h> #include <io.h> /* _setmode() */ #include <fcntl.h> /* _O_BINARY */ #endif #include <stdint.h> #include <stdio.h> -#include <signal.h> #include <x264.h> -/* Ctrl-C handler */ -static volatile int b_ctrl_c = 0; -static void sigint_handler( int a ) -{ - b_ctrl_c = 1; -} - #define FAIL_IF_ERROR( cond, ... )\ do\ {\ @@ -72,9 +60,6 @@ _setmode( _fileno( stderr ), _O_BINARY ); #endif - /* Control-C handler */ - signal( SIGINT, sigint_handler ); - FAIL_IF_ERROR( !(argc > 1), "Example usage: example 352x288 <input.yuv >output.h264\n" ); FAIL_IF_ERROR( 2 != sscanf( argv[1], "%dx%d", &width, &height ), "resolution not specified or incorrect\n" ); @@ -105,17 +90,17 @@ #undef fail #define fail fail3 + int luma_size = width * height; + int chroma_size = luma_size / 4; /* Encode frames */ - for( ; !b_ctrl_c; i_frame++ ) + for( ;; i_frame++ ) { /* Read input frame */ - int plane_size = width * height; - if( fread( pic.img.plane[0], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[0], 1, luma_size, stdin ) != luma_size ) break; - plane_size = ((width + 1) >> 1) * ((height + 1) >> 1); - if( fread( pic.img.plane[1], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[1], 1, chroma_size, stdin ) != chroma_size ) break; - if( fread( pic.img.plane[2], 1, plane_size, stdin ) != plane_size ) + if( fread( pic.img.plane[2], 1, chroma_size, 
stdin ) != chroma_size ) break; pic.i_pts = i_frame; @@ -129,7 +114,7 @@ } } /* Flush delayed frames */ - while( !b_ctrl_c && x264_encoder_delayed_frames( h ) ) + while( x264_encoder_delayed_frames( h ) ) { i_frame_size = x264_encoder_encode( h, &nal, &i_nal, NULL, &pic_out ); if( i_frame_size < 0 )
View file
x264-snapshot-20141218-2245.tar.bz2/extras/avxsynth_c.h -> x264-snapshot-20150804-2245.tar.bz2/extras/avxsynth_c.h
Changed
@@ -33,8 +33,12 @@ #ifndef __AVXSYNTH_C__ #define __AVXSYNTH_C__ -#include "windowsPorts/windows2linux.h" #include <stdarg.h> +#include <stdint.h> + +typedef int64_t INT64; +#define __stdcall +#define __declspec(x) #ifdef __cplusplus # define EXTERN_C extern "C" @@ -64,12 +68,6 @@ # endif #endif -#ifdef __GNUC__ -typedef long long int INT64; -#else -typedef __int64 INT64; -#endif - ///////////////////////////////////////////////////////////////////// //
View file
x264-snapshot-20150804-2245.tar.bz2/extras/intel_dispatcher.h
Added
@@ -0,0 +1,46 @@ +/***************************************************************************** + * intel_dispatcher.h: intel compiler cpu dispatcher override + ***************************************************************************** + * Copyright (C) 2014-2015 x264 project + * + * Authors: Anton Mitrofanov <BugMaster@narod.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_INTEL_DISPATCHER_H +#define X264_INTEL_DISPATCHER_H + +/* Feature flags using _FEATURE_* defines from immintrin.h */ +extern unsigned long long __intel_cpu_feature_indicator; +extern unsigned long long __intel_cpu_feature_indicator_x; + +/* CPU vendor independent version of dispatcher */ +void __intel_cpu_features_init_x( void ); + +static void x264_intel_dispatcher_override( void ) +{ + if( __intel_cpu_feature_indicator & ~1ULL ) + return; + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; +} + +#endif
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.c -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.c: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/filters.h -> x264-snapshot-20150804-2245.tar.bz2/filters/filters.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * filters.h: common filter functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Diogo Franco <diogomfranco@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/cache.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/cache.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * cache.c: cache video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/crop.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/crop.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * crop.c: crop video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * James Darnley <james.darnley@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/depth.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/depth.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * depth.c: bit-depth conversion video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Oskar Arvidsson <oskar@irock.se> * @@ -50,6 +50,7 @@ csp_mask == X264_CSP_YV16 || csp_mask == X264_CSP_YV24 || csp_mask == X264_CSP_NV12 || + csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16 || csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB || @@ -59,7 +60,7 @@ static int csp_num_interleaved( int csp, int plane ) { int csp_mask = csp & X264_CSP_MASK; - return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : + return (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV21 || csp_mask == X264_CSP_NV16) && plane == 1 ? 2 : csp_mask == X264_CSP_BGR || csp_mask == X264_CSP_RGB ? 3 : csp_mask == X264_CSP_BGRA ? 4 : 1; @@ -73,10 +74,10 @@ static void dither_plane_##pitch( pixel *dst, int dst_stride, uint16_t *src, int src_stride, \ int width, int height, int16_t *errors ) \ { \ - const int lshift = 16-BIT_DEPTH; \ - const int rshift = 16-BIT_DEPTH+2; \ - const int half = 1 << (16-BIT_DEPTH+1); \ - const int pixel_max = (1 << BIT_DEPTH)-1; \ + const int lshift = 16-X264_BIT_DEPTH; \ + const int rshift = 16-X264_BIT_DEPTH+2; \ + const int half = 1 << (16-X264_BIT_DEPTH+1); \ + const int pixel_max = (1 << X264_BIT_DEPTH)-1; \ memset( errors, 0, (width+1) * sizeof(int16_t) ); \ for( int y = 0; y < height; y++, src += src_stride, dst += dst_stride ) \ { \ @@ -136,7 +137,7 @@ static void scale_image( cli_image_t *output, cli_image_t *img ) { int csp_mask = img->csp & X264_CSP_MASK; - const int shift = BIT_DEPTH - 8; + const int shift = X264_BIT_DEPTH - 8; for( int i = 0; i < img->planes; i++ ) { uint8_t *src = img->plane[i]; @@ -216,7 +217,7 @@ ret = 1; } - FAIL_IF_ERROR( bit_depth != BIT_DEPTH, "this build supports only bit depth 
%d\n", BIT_DEPTH ) + FAIL_IF_ERROR( bit_depth != X264_BIT_DEPTH, "this build supports only bit depth %d\n", X264_BIT_DEPTH ) FAIL_IF_ERROR( ret, "unsupported bit depth conversion.\n" ) /* only add the filter to the chain if it's needed */
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/fix_vfr_pts.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/fix_vfr_pts.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * fix_vfr_pts.c: vfr pts fixing video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.c: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/internal.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/internal.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * internal.h: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/resize.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/resize.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * resize.c: resize video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * @@ -156,6 +156,7 @@ case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA; /* the next csp has no equivalent 16bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12; + case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21; /* the next csp is no supported by swscale at all */ case X264_CSP_NV16: default: return AV_PIX_FMT_NONE;
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/select_every.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/select_every.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * select_every.c: select-every video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/source.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/source.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * source.c: source video filter ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.c -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * video.c: video filters ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/filters/video/video.h -> x264-snapshot-20150804-2245.tar.bz2/filters/video/video.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * video.h: video filters ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/input/avs.c -> x264-snapshot-20150804-2245.tar.bz2/input/avs.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * avs.c: avisynth input ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * @@ -27,15 +27,15 @@ #if USE_AVXSYNTH #include <dlfcn.h> #if SYS_MACOSX -#define avs_open dlopen( "libavxsynth.dylib", RTLD_NOW ) +#define avs_open() dlopen( "libavxsynth.dylib", RTLD_NOW ) #else -#define avs_open dlopen( "libavxsynth.so", RTLD_NOW ) +#define avs_open() dlopen( "libavxsynth.so", RTLD_NOW ) #endif #define avs_close dlclose #define avs_address dlsym #else #include <windows.h> -#define avs_open LoadLibraryW( L"avisynth" ) +#define avs_open() LoadLibraryW( L"avisynth" ) #define avs_close FreeLibrary #define avs_address GetProcAddress #endif @@ -80,7 +80,7 @@ { AVS_Clip *clip; AVS_ScriptEnvironment *env; - HMODULE library; + void *library; int num_frames; struct { @@ -102,7 +102,7 @@ /* load the library and functions we require from it */ static int x264_avs_load_library( avs_hnd_t *h ) { - h->library = avs_open; + h->library = avs_open(); if( !h->library ) return -1; LOAD_AVS_FUNC( avs_clip_get_error, 0 ); @@ -175,8 +175,9 @@ FILE *fh = x264_fopen( psz_filename, "r" ); if( !fh ) return -1; - FAIL_IF_ERROR( !x264_is_regular_file( fh ), "AVS input is incompatible with non-regular file `%s'\n", psz_filename ); + int b_regular = x264_is_regular_file( fh ); fclose( fh ); + FAIL_IF_ERROR( !b_regular, "AVS input is incompatible with non-regular file `%s'\n", psz_filename ); avs_hnd_t *h = malloc( sizeof(avs_hnd_t) ); if( !h )
View file
x264-snapshot-20141218-2245.tar.bz2/input/ffms.c -> x264-snapshot-20150804-2245.tar.bz2/input/ffms.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * ffms.c: ffmpegsource input ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mike Gurlitz <mike.gurlitz@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.c -> x264-snapshot-20150804-2245.tar.bz2/input/input.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * input.c: common input functions ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Steven Walters <kemuri9@gmail.com> * @@ -33,6 +33,7 @@ [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 }, [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 }, [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, + [X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 },
View file
x264-snapshot-20141218-2245.tar.bz2/input/input.h -> x264-snapshot-20150804-2245.tar.bz2/input/input.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * input.h: file input ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/lavf.c -> x264-snapshot-20150804-2245.tar.bz2/input/lavf.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * lavf.c: libavformat input ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Mike Gurlitz <mike.gurlitz@gmail.com> * Steven Walters <kemuri9@gmail.com>
View file
x264-snapshot-20141218-2245.tar.bz2/input/raw.c -> x264-snapshot-20150804-2245.tar.bz2/input/raw.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * raw.c: raw input ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/thread.c -> x264-snapshot-20150804-2245.tar.bz2/input/thread.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * thread.c: threaded input ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/input/timecode.c -> x264-snapshot-20150804-2245.tar.bz2/input/timecode.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * timecode.c: timecode file input ***************************************************************************** - * Copyright (C) 2010-2014 x264 project + * Copyright (C) 2010-2015 x264 project * * Authors: Yusuke Nakamura <muken.the.vfrmaniac@gmail.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/input/y4m.c -> x264-snapshot-20150804-2245.tar.bz2/input/y4m.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * y4m.c: y4m input ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * flv.c: flv muxer ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Kieran Kunhya <kieran@kunhya.com> * @@ -75,21 +75,29 @@ static int open_file( char *psz_filename, hnd_t *p_handle, cli_output_opt_t *opt ) { - *p_handle = NULL; flv_hnd_t *p_flv = calloc( 1, sizeof(flv_hnd_t) ); - if( !p_flv ) - return -1; - - p_flv->b_dts_compress = opt->use_dts_compress; - - p_flv->c = flv_create_writer( psz_filename ); - if( !p_flv->c ) - return -1; - - CHECK( write_header( p_flv->c ) ); - *p_handle = p_flv; + if( p_flv ) + { + flv_buffer *c = flv_create_writer( psz_filename ); + if( c ) + { + if( !write_header( c ) ) + { + p_flv->c = c; + p_flv->b_dts_compress = opt->use_dts_compress; + *p_handle = p_flv; + return 0; + } + + fclose( c->fp ); + free( c->data ); + free( c ); + } + free( p_flv ); + } - return 0; + *p_handle = NULL; + return -1; } static int set_param( hnd_t handle, x264_param_t *p_param ) @@ -293,15 +301,22 @@ return i_size; } -static void rewrite_amf_double( FILE *fp, uint64_t position, double value ) +static int rewrite_amf_double( FILE *fp, uint64_t position, double value ) { uint64_t x = endian_fix64( flv_dbl2int( value ) ); - fseek( fp, position, SEEK_SET ); - fwrite( &x, 8, 1, fp ); + return !fseek( fp, position, SEEK_SET ) && fwrite( &x, 8, 1, fp ) == 1 ? 
0 : -1; } +#undef CHECK +#define CHECK(x)\ +do {\ + if( (x) < 0 )\ + goto error;\ +} while( 0 ) + static int close_file( hnd_t handle, int64_t largest_pts, int64_t second_largest_pts ) { + int ret = -1; flv_hnd_t *p_flv = handle; flv_buffer *c = p_flv->c; @@ -317,19 +332,22 @@ if( p_flv->i_framerate_pos ) { framerate = (double)p_flv->i_framenum / total_duration; - rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_framerate_pos, framerate ) ); } - rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ); - rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ); - rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_duration_pos, total_duration ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_filesize_pos, filesize ) ); + CHECK( rewrite_amf_double( c->fp, p_flv->i_bitrate_pos, filesize * 8 / ( total_duration * 1000 ) ) ); } + ret = 0; +error: fclose( c->fp ); - free( p_flv ); + free( c->data ); free( c ); + free( p_flv ); - return 0; + return ret; } const cli_output_t flv_output = { open_file, set_param, write_headers, write_frame, close_file };
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.c -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * flv_bytestream.c: flv muxer utilities ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Kieran Kunhya <kieran@kunhya.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/output/flv_bytestream.h -> x264-snapshot-20150804-2245.tar.bz2/output/flv_bytestream.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * flv_bytestream.h: flv muxer utilities ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Kieran Kunhya <kieran@kunhya.com> *
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * matroska.c: matroska muxer ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Mike Matsnev <mike@haali.su> * @@ -62,9 +62,14 @@ return 0; } +#define STEREO_COUNT 7 +static const uint8_t stereo_modes[STEREO_COUNT] = {5,9,7,1,3,13,0}; +static const uint8_t stereo_w_div[STEREO_COUNT] = {1,2,1,2,1,1,1}; +static const uint8_t stereo_h_div[STEREO_COUNT] = {1,1,2,1,2,1,1}; + static int set_param( hnd_t handle, x264_param_t *p_param ) { - mkv_hnd_t *p_mkv = handle; + mkv_hnd_t *p_mkv = handle; int64_t dw, dh; if( p_param->i_fps_num > 0 && !p_param->b_vfr_input ) @@ -77,25 +82,27 @@ p_mkv->frame_duration = 0; } - p_mkv->width = p_mkv->d_width = p_param->i_width; - p_mkv->height = p_mkv->d_height = p_param->i_height; + dw = p_mkv->width = p_param->i_width; + dh = p_mkv->height = p_param->i_height; p_mkv->display_size_units = DS_PIXELS; - p_mkv->stereo_mode = p_param->i_frame_packing; - + p_mkv->stereo_mode = -1; + if( p_param->i_frame_packing >= 0 && p_param->i_frame_packing < STEREO_COUNT ) + { + p_mkv->stereo_mode = stereo_modes[p_param->i_frame_packing]; + dw /= stereo_w_div[p_param->i_frame_packing]; + dh /= stereo_h_div[p_param->i_frame_packing]; + } if( p_param->vui.i_sar_width && p_param->vui.i_sar_height && p_param->vui.i_sar_width != p_param->vui.i_sar_height ) { if ( p_param->vui.i_sar_width > p_param->vui.i_sar_height ) { - dw = (int64_t)p_param->i_width * p_param->vui.i_sar_width / p_param->vui.i_sar_height; - dh = p_param->i_height; + dw = dw * p_param->vui.i_sar_width / p_param->vui.i_sar_height; } else { - dw = p_param->i_width; - dh = (int64_t)p_param->i_height * p_param->vui.i_sar_height / p_param->vui.i_sar_width; + dh = dh * p_param->vui.i_sar_height / p_param->vui.i_sar_width; } - - p_mkv->d_width = (int)dw; - p_mkv->d_height = (int)dh; } + 
p_mkv->d_width = (int)dw; + p_mkv->d_height = (int)dh; p_mkv->i_timebase_num = p_param->i_timebase_num; p_mkv->i_timebase_den = p_param->i_timebase_den; @@ -150,11 +157,11 @@ avcC, avcC_len, p_mkv->frame_duration, 50000, p_mkv->width, p_mkv->height, p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode ); + free( avcC ); + if( ret < 0 ) return ret; - free( avcC ); - // SEI if( !p_mkv->b_writing_frame )
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.c -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * matroska_ebml.c: matroska muxer utilities ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Mike Matsnev <mike@haali.su> * @@ -317,8 +317,6 @@ return w; } -static const uint8_t mk_stereo_modes[6] = {5,9,7,1,3,13}; - int mk_write_header( mk_writer *w, const char *writing_app, const char *codec_id, const void *codec_private, unsigned codec_private_size, @@ -342,7 +340,7 @@ CHECK( mk_write_uint( c, 0x42f2, 4 ) ); // EBMLMaxIDLength CHECK( mk_write_uint( c, 0x42f3, 8 ) ); // EBMLMaxSizeLength CHECK( mk_write_string( c, 0x4282, "matroska") ); // DocType - CHECK( mk_write_uint( c, 0x4287, 2 ) ); // DocTypeVersion + CHECK( mk_write_uint( c, 0x4287, stereo_mode >= 0 ? 3 : 2 ) ); // DocTypeVersion CHECK( mk_write_uint( c, 0x4285, 2 ) ); // DocTypeReadversion CHECK( mk_close_context( c, 0 ) ); @@ -381,8 +379,8 @@ CHECK( mk_write_uint( v, 0x54b2, display_size_units ) ); CHECK( mk_write_uint( v, 0x54b0, d_width ) ); CHECK( mk_write_uint( v, 0x54ba, d_height ) ); - if( stereo_mode >= 0 && stereo_mode <= 5 ) - CHECK( mk_write_uint( v, 0x53b8, mk_stereo_modes[stereo_mode] ) ); + if( stereo_mode >= 0 ) + CHECK( mk_write_uint( v, 0x53b8, stereo_mode ) ); CHECK( mk_close_context( v, 0 ) ); CHECK( mk_close_context( ti, 0 ) );
View file
x264-snapshot-20141218-2245.tar.bz2/output/matroska_ebml.h -> x264-snapshot-20150804-2245.tar.bz2/output/matroska_ebml.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * matroska_ebml.h: matroska muxer utilities ***************************************************************************** - * Copyright (C) 2005-2014 x264 project + * Copyright (C) 2005-2015 x264 project * * Authors: Mike Matsnev <mike@haali.su> * @@ -27,10 +27,10 @@ #define X264_MATROSKA_EBML_H /* Matroska display size units from the spec */ -#define DS_PIXELS 0 -#define DS_CM 1 -#define DS_INCHES 2 -#define DS_ASPECT_RATIO 3 +#define DS_PIXELS 0 +#define DS_CM 1 +#define DS_INCHES 2 +#define DS_ASPECT_RATIO 3 typedef struct mk_writer mk_writer;
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mp4.c: mp4 muxer ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -169,8 +169,9 @@ FILE *fh = x264_fopen( psz_filename, "w" ); if( !fh ) return -1; - FAIL_IF_ERR( !x264_is_regular_file( fh ), "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename ) + int b_regular = x264_is_regular_file( fh ); fclose( fh ); + FAIL_IF_ERR( !b_regular, "mp4", "MP4 output is incompatible with non-regular file `%s'\n", psz_filename ) mp4_hnd_t *p_mp4 = calloc( 1, sizeof(mp4_hnd_t) ); if( !p_mp4 )
View file
x264-snapshot-20141218-2245.tar.bz2/output/mp4_lsmash.c -> x264-snapshot-20150804-2245.tar.bz2/output/mp4_lsmash.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * mp4_lsmash.c: mp4 muxer using L-SMASH ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/output.h -> x264-snapshot-20150804-2245.tar.bz2/output/output.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * output.h: x264 file output modules ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/output/raw.c -> x264-snapshot-20150804-2245.tar.bz2/output/raw.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * raw.c: raw muxer ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm-a.asm -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm-a.asm
Changed
@@ -1,7 +1,7 @@ ;***************************************************************************** ;* checkasm-a.asm: assembly check tool ;***************************************************************************** -;* Copyright (C) 2008-2014 x264 project +;* Copyright (C) 2008-2015 x264 project ;* ;* Authors: Loren Merritt <lorenm@u.washington.edu> ;* Henrik Gramner <henrik@gramner.com> @@ -33,24 +33,24 @@ %if ARCH_X86_64 ; just random numbers to reduce the chance of incidental match ALIGN 16 -x6: ddq 0x79445c159ce790641a1b2550a612b48c -x7: ddq 0x86b2536fcd8cf6362eed899d5a28ddcd -x8: ddq 0x3f2bf84fc0fcca4eb0856806085e7943 -x9: ddq 0xd229e1f5b281303facbd382dcf5b8de2 -x10: ddq 0xab63e2e11fa38ed971aeaff20b095fd9 -x11: ddq 0x77d410d5c42c882d89b0c0765892729a -x12: ddq 0x24b3c1d2a024048bc45ea11a955d8dd5 -x13: ddq 0xdd7b8919edd427862e8ec680de14b47c -x14: ddq 0x11e53e2b2ac655ef135ce6888fa02cbf -x15: ddq 0x6de8f4c914c334d5011ff554472a7a10 -n7: dq 0x21f86d66c8ca00ce -n8: dq 0x75b6ba21077c48ad -n9: dq 0xed56bb2dcb3c7736 -n10: dq 0x8bda43d3fd1a7e06 -n11: dq 0xb64a9c9e5d318408 -n12: dq 0xdf9a54b303f1d3a3 -n13: dq 0x4a75479abd64e097 -n14: dq 0x249214109d5d1c88 +x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064 +x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636 +x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e +x9: dq 0xacbd382dcf5b8de2,0xd229e1f5b281303f +x10: dq 0x71aeaff20b095fd9,0xab63e2e11fa38ed9 +x11: dq 0x89b0c0765892729a,0x77d410d5c42c882d +x12: dq 0xc45ea11a955d8dd5,0x24b3c1d2a024048b +x13: dq 0x2e8ec680de14b47c,0xdd7b8919edd42786 +x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef +x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5 +n7: dq 0x21f86d66c8ca00ce +n8: dq 0x75b6ba21077c48ad +n9: dq 0xed56bb2dcb3c7736 +n10: dq 0x8bda43d3fd1a7e06 +n11: dq 0xb64a9c9e5d318408 +n12: dq 0xdf9a54b303f1d3a3 +n13: dq 0x4a75479abd64e097 +n14: dq 0x249214109d5d1c88 %endif SECTION .text
View file
x264-snapshot-20141218-2245.tar.bz2/tools/checkasm.c -> x264-snapshot-20150804-2245.tar.bz2/tools/checkasm.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * checkasm.c: assembly check tool ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -97,6 +97,12 @@ asm volatile( "mftb %0" : "=r"(a) :: "memory" ); #elif ARCH_ARM // ARMv7 only asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); +#elif ARCH_AARCH64 + uint64_t b = 0; + asm volatile( "mrs %0, pmccntr_el0" : "=r"(b) :: "memory" ); + a = b; +#elif ARCH_MIPS + asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" ); #endif return a; } @@ -167,12 +173,12 @@ continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX - b->cpu&X264_CPU_AVX2 && b->cpu&X264_CPU_FMA3 ? "avx2_fma3" : b->cpu&X264_CPU_AVX2 ? "avx2" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : b->cpu&X264_CPU_AVX ? "avx" : + b->cpu&X264_CPU_SSE42 ? "sse42" : b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : @@ -189,6 +195,8 @@ #elif ARCH_AARCH64 b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV8 ? "armv8" : +#elif ARCH_MIPS + b->cpu&X264_CPU_MSA ? 
"msa" : #endif "c", #if HAVE_MMX @@ -637,7 +645,7 @@ } \ predict_8x8[res_c>>16]( fdec1, edge ); \ int res_a = call_a( pixel_asm.name, fenc, fdec2, edge, bitcosts+8-pred_mode, satds_a ); \ - if( res_c != res_a || memcmp(satds_c, satds_a, sizeof(satds_c)) ) \ + if( res_c != res_a || memcmp(satds_c, satds_a, 16 * sizeof(*satds_c)) ) \ { \ ok = 0; \ fprintf( stderr, #name": %d,%d != %d,%d [FAILED]\n", res_c>>16, res_c&0xffff, res_a>>16, res_a&0xffff ); \ @@ -1409,6 +1417,32 @@ } } + if( mc_a.plane_copy_swap != mc_ref.plane_copy_swap ) + { + set_func_name( "plane_copy_swap" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t src_stride = plane_specs[i].src_stride; + intptr_t dst_stride = (2*w + 127) & ~63; + assert( dst_stride * h <= 0x1000 ); + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000*sizeof(pixel) ); + memset( pbuf4, 0, 0x1000*sizeof(pixel) ); + call_c( mc_c.plane_copy_swap, pbuf3, dst_stride, src1, src_stride, w, h ); + call_a( mc_a.plane_copy_swap, pbuf4, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_swap FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave ) { set_func_name( "plane_copy_interleave" ); @@ -1496,7 +1530,7 @@ if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) { set_func_name( "plane_copy_deinterleave_v210" ); - used_asm = 1; + ok = 1; used_asm = 1; for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) { int w = (plane_specs[i].w + 1) >> 1; @@ -1517,8 +1551,8 @@ break; } } + report( "v210 :" ); } - report( "v210 :" ); if( mc_a.hpel_filter != mc_ref.hpel_filter ) { @@ -2311,12 +2345,16 @@ {\ fprintf( stderr, #name "[%d] : 
[FAILED]\n", dir );\ ok = 0;\ - for( int k = -1; k < 16; k++ )\ - printf( "%2x ", edge[16+k] );\ - printf( "\n" );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + {\ + for( int k = -1; k < 16; k++ )\ + printf( "%2x ", edge[16+k] );\ + printf( "\n" );\ + }\ for( int j = 0; j < h; j++ )\ {\ - printf( "%2x ", edge[14-j] );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( "%2x ", edge[14-j] );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf4[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2324,7 +2362,8 @@ printf( "\n" );\ for( int j = 0; j < h; j++ )\ {\ - printf( " " );\ + if( ip_c.name == (void *)ip_c.predict_8x8 )\ + printf( " " );\ for( int k = 0; k < w; k++ )\ printf( "%2x ", pbuf3[48+k+j*FDEC_STRIDE] );\ printf( "\n" );\ @@ -2428,6 +2467,8 @@ DECL_CABAC(c) #if HAVE_MMX DECL_CABAC(asm) +#elif defined(ARCH_AARCH64) +DECL_CABAC(asm) #else #define run_cabac_decision_asm run_cabac_decision_c #define run_cabac_bypass_asm run_cabac_bypass_c @@ -2646,7 +2687,7 @@ #endif if( cpu_detect & X264_CPU_LZCNT ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); cpu1 &= ~X264_CPU_LZCNT; } ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); @@ -2664,11 +2705,11 @@ cpu1 &= ~X264_CPU_SLOW_SHUFFLE; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); cpu1 &= ~X264_CPU_SLOW_CTZ; - } - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2688,9 +2729,16 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; cpu1 &= ~X264_CPU_SLOW_ATOM; + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSSE3 LZCNT" ); + cpu1 &= 
~X264_CPU_LZCNT; + } } if( cpu_detect & X264_CPU_SSE4 ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE4, "SSE4" ); + if( cpu_detect & X264_CPU_SSE42 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE42, "SSE4.2" ); if( cpu_detect & X264_CPU_AVX ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX, "AVX" ); if( cpu_detect & X264_CPU_XOP ) @@ -2700,30 +2748,30 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA4, "FMA4" ); cpu1 &= ~X264_CPU_FMA4; } - if( cpu_detect & X264_CPU_BMI1 ) + if( cpu_detect & X264_CPU_FMA3 ) { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; + ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); + cpu1 &= ~X264_CPU_FMA3;
View file
x264-snapshot-20150804-2245.tar.bz2/tools/gas-preprocessor.pl
Added
@@ -0,0 +1,1033 @@ +#!/usr/bin/env perl +# by David Conrad +# This code is licensed under GPLv2 or later; go to gnu.org to read it +# (not that it much matters for an asm preprocessor) +# usage: set your assembler to be something like "perl gas-preprocessor.pl gcc" +use strict; + +# Apple's gas is ancient and doesn't support modern preprocessing features like +# .rept and has ugly macro syntax, among other things. Thus, this script +# implements the subset of the gas preprocessor used by x264 and ffmpeg +# that isn't supported by Apple's gas. + +my %canonical_arch = ("aarch64" => "aarch64", "arm64" => "aarch64", + "arm" => "arm", + "powerpc" => "powerpc", "ppc" => "powerpc"); + +my %comments = ("aarch64" => '//', + "arm" => '@', + "powerpc" => '#'); + +my @gcc_cmd; +my @preprocess_c_cmd; + +my $comm; +my $arch; +my $as_type = "apple-gas"; + +my $fix_unreq = $^O eq "darwin"; +my $force_thumb = 0; + +my $arm_cond_codes = "eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le|al|hs|lo"; + +my $usage_str = " +$0\n +Gas-preprocessor.pl converts assembler files using modern GNU as syntax for +Apple's ancient gas version or clang's incompatible integrated assembler. The +conversion is regularly tested for Libav, x264 and vlc. Other projects might +use different features which are not correctly handled. + +Options for this program needs to be separated with ' -- ' from the assembler +command. 
Following options are currently supported: + + -help - this usage text + -arch - target architecture + -as-type - one value out of {{,apple-}{gas,clang},armasm} + -fix-unreq + -no-fix-unreq + -force-thumb - assemble as thumb regardless of the input source + (note, this is incomplete and only works for sources + it explicitly was tested with) +"; + +sub usage() { + print $usage_str; +} + +while (@ARGV) { + my $opt = shift; + + if ($opt =~ /^-(no-)?fix-unreq$/) { + $fix_unreq = $1 ne "no-"; + } elsif ($opt eq "-force-thumb") { + $force_thumb = 1; + } elsif ($opt eq "-arch") { + $arch = shift; + die "unknown arch: '$arch'\n" if not exists $comments{$arch}; + } elsif ($opt eq "-as-type") { + $as_type = shift; + die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/; + } elsif ($opt eq "-help") { + usage(); + exit 0; + } elsif ($opt eq "--" ) { + @gcc_cmd = @ARGV; + } elsif ($opt =~ /^-/) { + die "option '$opt' is not known. See '$0 -help' for usage information\n"; + } else { + push @gcc_cmd, $opt, @ARGV; + } + last if (@gcc_cmd); +} + +if (grep /\.c$/, @gcc_cmd) { + # C file (inline asm?) - compile + @preprocess_c_cmd = (@gcc_cmd, "-S"); +} elsif (grep /\.[sS]$/, @gcc_cmd) { + # asm file, just do C preprocessor + @preprocess_c_cmd = (@gcc_cmd, "-E"); +} elsif (grep /-(v|h|-version|dumpversion)/, @gcc_cmd) { + # pass -v/--version along, used during probing. Matching '-v' might have + # uninteded results but it doesn't matter much if gas-preprocessor or + # the compiler fails. + exec(@gcc_cmd); +} else { + die "Unrecognized input filetype"; +} +if ($as_type eq "armasm") { + + $preprocess_c_cmd[0] = "cpp"; + push(@preprocess_c_cmd, "-U__ELF__"); + push(@preprocess_c_cmd, "-U__MACH__"); + + @preprocess_c_cmd = grep ! 
/^-nologo$/, @preprocess_c_cmd; + # Remove -ignore XX parameter pairs from preprocess_c_cmd + my $index = 1; + while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-ignore" and $index + 1 < $#preprocess_c_cmd) { + splice(@preprocess_c_cmd, $index, 2); + next; + } + $index++; + } + if (grep /^-MM$/, @preprocess_c_cmd) { + system(@preprocess_c_cmd) == 0 or die "Error running preprocessor"; + exit 0; + } +} + +# if compiling, avoid creating an output file named '-.o' +if ((grep /^-c$/, @gcc_cmd) && !(grep /^-o/, @gcc_cmd)) { + foreach my $i (@gcc_cmd) { + if ($i =~ /\.[csS]$/) { + my $outputfile = $i; + $outputfile =~ s/\.[csS]$/.o/; + push(@gcc_cmd, "-o"); + push(@gcc_cmd, $outputfile); + last; + } + } +} +# replace only the '-o' argument with '-', avoids rewriting the make dependency +# target specified with -MT to '-' +my $index = 1; +while ($index < $#preprocess_c_cmd) { + if ($preprocess_c_cmd[$index] eq "-o") { + $index++; + $preprocess_c_cmd[$index] = "-"; + } + $index++; +} + +my $tempfile; +if ($as_type ne "armasm") { + @gcc_cmd = map { /\.[csS]$/ ? qw(-x assembler -) : $_ } @gcc_cmd; +} else { + @preprocess_c_cmd = grep ! /^-c$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-m/, @preprocess_c_cmd; + + @preprocess_c_cmd = grep ! /^-G/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-W/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-Z/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-fp/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-EHsc$/, @preprocess_c_cmd; + @preprocess_c_cmd = grep ! /^-O/, @preprocess_c_cmd; + + @gcc_cmd = grep ! /^-G/, @gcc_cmd; + @gcc_cmd = grep ! /^-W/, @gcc_cmd; + @gcc_cmd = grep ! /^-Z/, @gcc_cmd; + @gcc_cmd = grep ! /^-fp/, @gcc_cmd; + @gcc_cmd = grep ! /^-EHsc$/, @gcc_cmd; + @gcc_cmd = grep ! 
/^-O/, @gcc_cmd; + + my @outfiles = grep /\.(o|obj)$/, @gcc_cmd; + $tempfile = $outfiles[0].".asm"; + + # Remove most parameters from gcc_cmd, which actually is the armasm command, + # which doesn't support any of the common compiler/preprocessor options. + @gcc_cmd = grep ! /^-D/, @gcc_cmd; + @gcc_cmd = grep ! /^-U/, @gcc_cmd; + @gcc_cmd = grep ! /^-m/, @gcc_cmd; + @gcc_cmd = grep ! /^-M/, @gcc_cmd; + @gcc_cmd = grep ! /^-c$/, @gcc_cmd; + @gcc_cmd = grep ! /^-I/, @gcc_cmd; + @gcc_cmd = map { /\.S$/ ? $tempfile : $_ } @gcc_cmd; +} + +# detect architecture from gcc binary name +if (!$arch) { + if ($gcc_cmd[0] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } else { + # look for -arch flag + foreach my $i (1 .. $#gcc_cmd-1) { + if ($gcc_cmd[$i] eq "-arch" and + $gcc_cmd[$i+1] =~ /(arm64|aarch64|arm|powerpc|ppc)/) { + $arch = $1; + } + } + } +} + +# assume we're not cross-compiling if no -arch or the binary doesn't have the arch name +$arch = qx/arch/ if (!$arch); + +die "Unknown target architecture '$arch'" if not exists $canonical_arch{$arch}; + +$arch = $canonical_arch{$arch}; +$comm = $comments{$arch}; +my $inputcomm = $comm;
View file
x264-snapshot-20141218-2245.tar.bz2/x264.c -> x264-snapshot-20150804-2245.tar.bz2/x264.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264: top-level x264cli functions ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Loren Merritt <lorenm@u.washington.edu> * Laurent Aimar <fenrir@via.ecp.fr> @@ -209,6 +209,13 @@ #endif 0 }; +static const char * const chroma_format_names[] = +{ + [0] = "all", + [X264_CSP_I420] = "i420", + [X264_CSP_I422] = "i422", + [X264_CSP_I444] = "i444" +}; static const char * const range_names[] = { "auto", "tv", "pc", 0 }; @@ -325,7 +332,8 @@ #else printf( "using an unknown compiler\n" ); #endif - printf( "configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, X264_CHROMA_FORMAT ? (output_csp_names[0]+1) : "all" ); + printf( "x264 configuration: --bit-depth=%d --chroma-format=%s\n", X264_BIT_DEPTH, chroma_format_names[X264_CHROMA_FORMAT] ); + printf( "libx264 configuration: --bit-depth=%d --chroma-format=%s\n", x264_bit_depth, chroma_format_names[x264_chroma_format] ); printf( "x264 license: " ); #if HAVE_GPL printf( "GPL version 2 or later\n" ); @@ -533,7 +541,7 @@ " Overrides all settings.\n" ); H2( #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 " - baseline:\n" " --no-8x8dct --bframes 0 --no-cabac\n" " --cqm flat --weightp 0\n" @@ -561,7 +569,7 @@ else H0( " - " #if X264_CHROMA_FORMAT <= X264_CSP_I420 -#if BIT_DEPTH==8 +#if X264_BIT_DEPTH==8 "baseline,main,high," #endif "high10," @@ -703,7 +711,9 @@ " - 2: row alternation - L and R are interlaced by row\n" " - 3: side by side - L is on the left, R on the right\n" " - 4: top bottom - L is on top, R on bottom\n" - " - 5: frame alternation - one view per frame\n" ); + " - 5: frame alternation - one view per frame\n" + " - 6: mono - 2D frame without any frame packing\n" + " - 7: tile format - L is on top-left, R split across\n" ); H0( "\n" ); H0( 
"Ratecontrol:\n" ); H0( "\n" ); @@ -726,7 +736,8 @@ H2( " --aq-mode <integer> AQ method [%d]\n" " - 0: Disabled\n" " - 1: Variance AQ (complexity mask)\n" - " - 2: Auto-variance AQ (experimental)\n", defaults->rc.i_aq_mode ); + " - 2: Auto-variance AQ\n" + " - 3: Auto-variance AQ with bias to dark scenes\n", defaults->rc.i_aq_mode ); H1( " --aq-strength <float> Reduces blocking and blurring in flat and\n" " textured areas. [%.1f]\n", defaults->rc.f_aq_strength ); H1( "\n" ); @@ -1286,11 +1297,11 @@ /* force the output csp to what the user specified (or the default) */ param->i_csp = info->csp; int csp = info->csp & X264_CSP_MASK; - if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) ) + if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp >= X264_CSP_I422) ) param->i_csp = X264_CSP_I420; - else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) ) + else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp >= X264_CSP_I444) ) param->i_csp = X264_CSP_I422; - else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) ) + else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp >= X264_CSP_BGR) ) param->i_csp = X264_CSP_I444; else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) ) param->i_csp = X264_CSP_RGB;
View file
x264-snapshot-20141218-2245.tar.bz2/x264.h -> x264-snapshot-20150804-2245.tar.bz2/x264.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264.h: x264 public header ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu> @@ -29,7 +29,7 @@ #define X264_X264_H #if !defined(_STDINT_H) && !defined(_STDINT_H_) && !defined(_STDINT_H_INCLUDED) && !defined(_STDINT) &&\ - !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) + !defined(_SYS_STDINT_H_) && !defined(_INTTYPES_H) && !defined(_INTTYPES_H_) && !defined(_INTTYPES) # ifdef _MSC_VER # pragma message("You must include stdint.h or inttypes.h before x264.h") # else @@ -41,7 +41,7 @@ #include "x264_config.h" -#define X264_BUILD 142 +#define X264_BUILD 148 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -129,8 +129,8 @@ #define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ #define X264_CPU_XOP 0x0000800 /* AMD XOP */ #define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_AVX2 0x0002000 /* AVX2 */ -#define X264_CPU_FMA3 0x0004000 /* Intel FMA3 */ +#define X264_CPU_FMA3 0x0002000 /* FMA3 */ +#define X264_CPU_AVX2 0x0004000 /* AVX2 */ #define X264_CPU_BMI1 0x0008000 /* BMI1 */ #define X264_CPU_BMI2 0x0010000 /* BMI2 */ /* x86 modifiers */ @@ -158,6 +158,9 @@ #define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */ #define X264_CPU_ARMV8 0x0000008 +/* MIPS */ +#define X264_CPU_MSA 0x0000001 /* MIPS MSA */ + /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */ @@ -183,6 +186,7 @@ #define X264_AQ_NONE 0 #define X264_AQ_VARIANCE 1 #define X264_AQ_AUTOVARIANCE 2 +#define X264_AQ_AUTOVARIANCE_BIASED 3 #define X264_B_ADAPT_NONE 0 #define X264_B_ADAPT_FAST 1 #define X264_B_ADAPT_TRELLIS 2 @@ -213,16 +217,17 @@ #define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */ #define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */ #define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */ -#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */ -#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */ -#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000a /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000c /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000d /* end of list */ +#define X264_CSP_NV21 0x0004 /* yuv 4:2:0, with one y plane and one packed v+u */ +#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ +#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ +#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y 
plane and one packed u+v */ +#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */ +#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ +#define X264_CSP_MAX 0x000e /* end of list */ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ @@ -234,7 +239,7 @@ #define X264_TYPE_BREF 0x0004 /* Non-disposable B-frame */ #define X264_TYPE_B 0x0005 #define X264_TYPE_KEYFRAME 0x0006 /* IDR or I depending on b_open_gop option */ -#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR) +#define IS_X264_TYPE_I(x) ((x)==X264_TYPE_I || (x)==X264_TYPE_IDR || (x)==X264_TYPE_KEYFRAME) #define IS_X264_TYPE_B(x) ((x)==X264_TYPE_B || (x)==X264_TYPE_BREF) /* Log level */ @@ -789,8 +794,6 @@ /* In: force picture type (if not auto) * If x264 encoding parameters are violated in the forcing of picture types, * x264 will correct the input picture type and log a warning. - * The quality of frametype decisions may suffer if a great deal of fine-grained - * mixing of auto and forced frametypes is done. * Out: type of the picture encoded */ int i_type; /* In: force quantizer for != X264_QP_AUTO */
View file
x264-snapshot-20141218-2245.tar.bz2/x264cli.h -> x264-snapshot-20150804-2245.tar.bz2/x264cli.h
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264cli.h: x264cli common ***************************************************************************** - * Copyright (C) 2003-2014 x264 project + * Copyright (C) 2003-2015 x264 project * * Authors: Laurent Aimar <fenrir@via.ecp.fr> * Loren Merritt <lorenm@u.washington.edu>
View file
x264-snapshot-20141218-2245.tar.bz2/x264dll.c -> x264-snapshot-20150804-2245.tar.bz2/x264dll.c
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264dll: x264 DLLMain for win32 ***************************************************************************** - * Copyright (C) 2009-2014 x264 project + * Copyright (C) 2009-2015 x264 project * * Authors: Anton Mitrofanov <BugMaster@narod.ru> *
View file
x264-snapshot-20141218-2245.tar.bz2/x264res.rc -> x264-snapshot-20150804-2245.tar.bz2/x264res.rc
Changed
@@ -1,7 +1,7 @@ /***************************************************************************** * x264res.rc: windows resource file ***************************************************************************** - * Copyright (C) 2012-2014 x264 project + * Copyright (C) 2012-2015 x264 project * * Authors: Henrik Gramner <henrik@gramner.com> * @@ -60,7 +60,7 @@ #endif VALUE "FileVersion", X264_POINTVER VALUE "InternalName", "x264" - VALUE "LegalCopyright", "Copyright (C) 2003-2014 x264 project" + VALUE "LegalCopyright", "Copyright (C) 2003-2015 x264 project" #ifdef DLL VALUE "OriginalFilename", "libx264-" xstr(X264_BUILD) ".dll" #else
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS)
is an
openSUSE project
.