Changes of Revision 88

obs-studio.changes Changed
x
 
1
@@ -1,4 +1,50 @@
2
 -------------------------------------------------------------------
3
+Wed Jan 06 18:27:38 UTC 2021 - jimmy@boombatower.com
4
+
5
+- Update to version 26.1.1:
6
+  * win-dshow: Fix dshowcapture not linking audio of certain devices
7
+  * linux-jack: fix deadlock when closing the client
8
+  * linux-jack: mark ports as JackPortIsTerminal
9
+  * linux-jack: fix timestamp calculation
10
+  * obs-browser: Initialize CEF early to fix macOS crash
11
+  * libobs: Update version to 26.1.1
12
+  * rtmp-services: Add Loola.tv service
13
+  * rtmp-services: Fix json formatting
14
+  * libobs: Avoid unnecessary mallocs in audio processing
15
+  * UI: Fix padding on Acri context bar buttons
16
+  * image-source: Fix slideshow transition bug when randomized
17
+  * docs/sphinx: Add missing obs_frontend_open_projector
18
+  * libobs: Update to SIMDe 0.7.1
19
+  * libobs: Set lock state when duplicating scene item
20
+  * libobs: Add definitions in ARCH_SIMD_DEFINES
21
+  * cmake: Add ARCH_SIMD_DEFINES variable
22
+  * coreaudio-encoder: Fix cmake for mingw
23
+  * Revert "UI: Only apply new scaling behavior on newer installs"
24
+  * UI: Only apply new scaling behavior on newer installs
25
+  * UI: Support fractional scaling for Canvas/Base size
26
+  * mac-virtualcam: Remove unnecessary logging
27
+  * mac-virtualcam: Mark parameters as unused
28
+  * image-source: Add .webp to "All formats" option
29
+  * image-source: Add webp to file filter
30
+  * CI: Remove jack, speex and fdk-aac from default builds for macOS
31
+  * libobs, obs-ffmpeg: Use correct value for EINVAL error check
32
+  * UI/updater: Increase number of download workers
33
+  * UI/updater: Enable HTTP2 and TLS 1.3
34
+  * UI: Fix name of kab-KAB locale
35
+  * decklink: Fix automatic pixel format detection
36
+  * CI: Fix macOS 10.13 crashes due to unsupported library symbols
37
+  * UI/installer: Add additional VS2019 DLL check
38
+  * mac-virtualcam: Fix file mode
39
+  * CI: Run make with -j$(nproc)
40
+  * CI: Remove obsolete and unused files
41
+  * libobs: Add texture sharing support for macOS/OpenGL
42
+  * CI: Add necessary changes for CEF 4183
43
+  * UI/updater: Move in-use files away before writing
44
+  * UI/updater: Always clean up temporary files
45
+  * UI: Remove Smashcast from AutoConfig
46
+  * rtmp-services: Remove Smashcast
47
+
48
+-------------------------------------------------------------------
49
 Tue Dec 15 23:25:38 UTC 2020 - Jimmy Berry <jimmy@boombatower.com>
50
 
51
 - Add modinfo-use-full-path.patch for new v4l2lookback support.
52
obs-studio.spec Changed
8
 
1
@@ -1,5 +1,5 @@
2
 Name:           obs-studio
3
-Version:        26.1.0
4
+Version:        26.1.1
5
 Release:        0
6
 Summary:        A recording/broadcasting program
7
 Group:          Productivity/Multimedia/Video/Editors and Convertors
8
_service Changed
10
 
1
@@ -1,7 +1,7 @@
2
 <services>
3
   <service name="tar_scm" mode="disabled">
4
     <param name="versionformat">@PARENT_TAG@</param>
5
-    <param name="revision">refs/tags/26.1.0</param>
6
+    <param name="revision">refs/tags/26.1.1</param>
7
     <param name="url">git://github.com/jp9000/obs-studio.git</param>
8
     <param name="scm">git</param>
9
     <param name="changesgenerate">enable</param>
10
_servicedata Changed
9
 
1
@@ -1,6 +1,6 @@
2
 <servicedata>
3
   <service name="tar_scm">
4
     <param name="url">git://github.com/jp9000/obs-studio.git</param>
5
-    <param name="changesrevision">38ad3ba18fc27846e122bd56f589ccb34c4578e2</param>
6
+    <param name="changesrevision">dffa8221124106bc2a4c92e5f5d0fa21128a61f6</param>
7
   </service>
8
 </servicedata>
9
obs-studio-26.1.0.tar.xz/CI/before-deploy-osx.sh Deleted
116
 
1
@@ -1,114 +0,0 @@
2
-hr() {
3
-  echo "───────────────────────────────────────────────────"
4
-  echo $1
5
-  echo "───────────────────────────────────────────────────"
6
-}
7
-
8
-# Exit if something fails
9
-set -e
10
-
11
-# Generate file name variables
12
-export GIT_TAG=$(git describe --abbrev=0)
13
-export GIT_HASH=$(git rev-parse --short HEAD)
14
-export FILE_DATE=$(date +%Y-%m-%d.%H-%M-%S)
15
-export FILENAME=$FILE_DATE-$GIT_HASH-$TRAVIS_BRANCH-osx.dmg
16
-
17
-echo "git tag: $GIT_TAG"
18
-
19
-cd ./build
20
-
21
-# Move obslua
22
-hr "Moving OBS LUA"
23
-mv ./rundir/RelWithDebInfo/data/obs-scripting/obslua.so ./rundir/RelWithDebInfo/bin/
24
-
25
-# Move obspython
26
-hr "Moving OBS Python"
27
-# mv ./rundir/RelWithDebInfo/data/obs-scripting/_obspython.so ./rundir/RelWithDebInfo/bin/
28
-# mv ./rundir/RelWithDebInfo/data/obs-scripting/obspython.py ./rundir/RelWithDebInfo/bin/
29
-
30
-# Package everything into a nice .app
31
-hr "Packaging .app"
32
-STABLE=false
33
-if [ -n "${TRAVIS_TAG}" ]; then
34
-  STABLE=true
35
-fi
36
-
37
-#sudo python ../CI/install/osx/build_app.py --public-key ../CI/install/osx/OBSPublicDSAKey.pem --sparkle-framework ../../sparkle/Sparkle.framework --stable=$STABLE
38
-
39
-../CI/install/osx/packageApp.sh
40
-
41
-# fix obs outputs plugin it doesn't play nicely with dylibBundler at the moment
42
-if [ -f /usr/local/opt/mbedtls/lib/libmbedtls.12.dylib ]; then
43
-    cp /usr/local/opt/mbedtls/lib/libmbedtls.12.dylib ./OBS.app/Contents/Frameworks/
44
-    cp /usr/local/opt/mbedtls/lib/libmbedcrypto.3.dylib ./OBS.app/Contents/Frameworks/
45
-    cp /usr/local/opt/mbedtls/lib/libmbedx509.0.dylib ./OBS.app/Contents/Frameworks/
46
-    chmod +w ./OBS.app/Contents/Frameworks/*.dylib
47
-    install_name_tool -id @executable_path/../Frameworks/libmbedtls.12.dylib ./OBS.app/Contents/Frameworks/libmbedtls.12.dylib
48
-    install_name_tool -id @executable_path/../Frameworks/libmbedcrypto.3.dylib ./OBS.app/Contents/Frameworks/libmbedcrypto.3.dylib
49
-    install_name_tool -id @executable_path/../Frameworks/libmbedx509.0.dylib ./OBS.app/Contents/Frameworks/libmbedx509.0.dylib
50
-    install_name_tool -change libmbedtls.12.dylib @executable_path/../Frameworks/libmbedtls.12.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
51
-    install_name_tool -change libmbedcrypto.3.dylib @executable_path/../Frameworks/libmbedcrypto.3.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
52
-    install_name_tool -change libmbedx509.0.dylib @executable_path/../Frameworks/libmbedx509.0.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
53
-elif [ -f /usr/local/opt/mbedtls/lib/libmbedtls.13.dylib ]; then
54
-    cp /usr/local/opt/mbedtls/lib/libmbedtls.13.dylib ./OBS.app/Contents/Frameworks/
55
-    cp /usr/local/opt/mbedtls/lib/libmbedcrypto.5.dylib ./OBS.app/Contents/Frameworks/
56
-    cp /usr/local/opt/mbedtls/lib/libmbedx509.1.dylib ./OBS.app/Contents/Frameworks/
57
-    chmod +w ./OBS.app/Contents/Frameworks/*.dylib
58
-    install_name_tool -id @executable_path/../Frameworks/libmbedtls.13.dylib ./OBS.app/Contents/Frameworks/libmbedtls.13.dylib
59
-    install_name_tool -id @executable_path/../Frameworks/libmbedcrypto.5.dylib ./OBS.app/Contents/Frameworks/libmbedcrypto.5.dylib
60
-    install_name_tool -id @executable_path/../Frameworks/libmbedx509.1.dylib ./OBS.app/Contents/Frameworks/libmbedx509.1.dylib
61
-    install_name_tool -change libmbedtls.13.dylib @executable_path/../Frameworks/libmbedtls.13.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
62
-    install_name_tool -change libmbedcrypto.5.dylib @executable_path/../Frameworks/libmbedcrypto.5.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
63
-    install_name_tool -change libmbedx509.1.dylib @executable_path/../Frameworks/libmbedx509.1.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
64
-fi
65
-
66
-install_name_tool -change /usr/local/opt/curl/lib/libcurl.4.dylib @executable_path/../Frameworks/libcurl.4.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
67
-install_name_tool -change @rpath/libobs.0.dylib @executable_path/../Frameworks/libobs.0.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
68
-install_name_tool -change /tmp/obsdeps/bin/libjansson.4.dylib @executable_path/../Frameworks/libjansson.4.dylib ./OBS.app/Contents/Plugins/obs-outputs.so
69
-
70
-# copy sparkle into the app
71
-hr "Copying Sparkle.framework"
72
-cp -R ../../sparkle/Sparkle.framework ./OBS.app/Contents/Frameworks/
73
-install_name_tool -change @rpath/Sparkle.framework/Versions/A/Sparkle @executable_path/../Frameworks/Sparkle.framework/Versions/A/Sparkle ./OBS.app/Contents/MacOS/obs
74
-
75
-# Copy Chromium embedded framework to app Frameworks directory
76
-hr "Copying Chromium Embedded Framework.framework"
77
-sudo mkdir -p OBS.app/Contents/Frameworks
78
-sudo cp -R ../../cef_binary_${CEF_BUILD_VERSION}_macosx64/Release/Chromium\ Embedded\ Framework.framework OBS.app/Contents/Frameworks/
79
-
80
-install_name_tool -change /usr/local/opt/qt/lib/QtGui.framework/Versions/5/QtGui @executable_path/../Frameworks/QtGui.framework/Versions/5/QtGui ./OBS.app/Contents/Plugins/obs-browser.so
81
-install_name_tool -change /usr/local/opt/qt/lib/QtCore.framework/Versions/5/QtCore @executable_path/../Frameworks/QtCore.framework/Versions/5/QtCore ./OBS.app/Contents/Plugins/obs-browser.so
82
-install_name_tool -change /usr/local/opt/qt/lib/QtWidgets.framework/Versions/5/QtWidgets @executable_path/../Frameworks/QtWidgets.framework/Versions/5/QtWidgets ./OBS.app/Contents/Plugins/obs-browser.so
83
-
84
-cp ../CI/install/osx/OBSPublicDSAKey.pem OBS.app/Contents/Resources
85
-
86
-# edit plist
87
-plutil -insert CFBundleVersion -string $GIT_TAG ./OBS.app/Contents/Info.plist
88
-plutil -insert CFBundleShortVersionString -string $GIT_TAG ./OBS.app/Contents/Info.plist
89
-plutil -insert OBSFeedsURL -string https://obsproject.com/osx_update/feeds.xml ./OBS.app/Contents/Info.plist
90
-plutil -insert SUFeedURL -string https://obsproject.com/osx_update/stable/updates.xml ./OBS.app/Contents/Info.plist
91
-plutil -insert SUPublicDSAKeyFile -string OBSPublicDSAKey.pem ./OBS.app/Contents/Info.plist
92
-
93
-dmgbuild -s ../CI/install/osx/settings.json "OBS" obs.dmg
94
-
95
-if [ -v "$TRAVIS" ]; then
96
-   # Signing stuff
97
-   hr "Decrypting Cert"
98
-   openssl aes-256-cbc -K $encrypted_dd3c7f5e9db9_key -iv $encrypted_dd3c7f5e9db9_iv -in ../CI/osxcert/Certificates.p12.enc -out Certificates.p12 -d
99
-   hr "Creating Keychain"
100
-   security create-keychain -p mysecretpassword build.keychain
101
-   security default-keychain -s build.keychain
102
-   security unlock-keychain -p mysecretpassword build.keychain
103
-   security set-keychain-settings -t 3600 -u build.keychain
104
-   hr "Importing certs into keychain"
105
-   security import ./Certificates.p12 -k build.keychain -T /usr/bin/productsign -P ""
106
-   # macOS 10.12+
107
-   security set-key-partition-list -S apple-tool:,apple: -s -k mysecretpassword build.keychain
108
-fi
109
-
110
-cp ./OBS.dmg ./$FILENAME
111
-
112
-# Move to the folder that travis uses to upload artifacts from
113
-hr "Moving package to nightly folder for distribution"
114
-mkdir ../nightly
115
-sudo mv ./$FILENAME ../nightly
116
obs-studio-26.1.0.tar.xz/CI/before-script-osx.sh Deleted
19
 
1
@@ -1,17 +0,0 @@
2
-# Make sure ccache is found
3
-export PATH=/usr/local/opt/ccache/libexec:$PATH
4
-
5
-git fetch --tags
6
-
7
-mkdir build
8
-cd build
9
-cmake -DENABLE_SPARKLE_UPDATER=ON \
10
--DCMAKE_OSX_DEPLOYMENT_TARGET=10.13 \
11
--DDISABLE_PYTHON=ON \
12
--DQTDIR=/usr/local/Cellar/qt/5.14.1 \
13
--DDepsPath=/tmp/obsdeps \
14
--DVLCPath=$PWD/../../vlc-3.0.8 \
15
--DBUILD_BROWSER=ON \
16
--DBROWSER_DEPLOY=ON \
17
--DWITH_RTMPS=ON \
18
--DCEF_ROOT_DIR=$PWD/../../cef_binary_${CEF_BUILD_VERSION}_macosx64 ..
19
obs-studio-26.1.0.tar.xz/CI/install Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/install-dependencies-osx.sh Deleted
83
 
1
@@ -1,81 +0,0 @@
2
-hr() {
3
-  echo "───────────────────────────────────────────────────"
4
-  echo $1
5
-  echo "───────────────────────────────────────────────────"
6
-}
7
-
8
-# Exit if something fails
9
-set -e
10
-
11
-# Echo all commands before executing
12
-set -v
13
-
14
-if [[ $TRAVIS ]]; then
15
-  git fetch --unshallow
16
-fi
17
-
18
-git fetch origin --tags
19
-
20
-# Leave obs-studio folder
21
-cd ../
22
-
23
-# Install Packages app so we can build a package later
24
-# http://s.sudre.free.fr/Software/Packages/about.html
25
-hr "Downloading Packages app"
26
-wget --quiet --retry-connrefused --waitretry=1 https://s3-us-west-2.amazonaws.com/obs-nightly/Packages.pkg
27
-sudo installer -pkg ./Packages.pkg -target /
28
-
29
-brew update
30
-
31
-#Base OBS Deps and ccache
32
-for DEPENDENCY in jack speexdsp ccache mbedtls freetype fdk-aac cmocka; do
33
-    if [ ! -d "$(brew --cellar)/${DEPENDENCY}" ]; then
34
-        brew install $DEPENDENCY
35
-    else
36
-        brew upgrade $DEPENDENCY
37
-    fi
38
-done
39
-
40
-brew install https://gist.githubusercontent.com/DDRBoxman/9c7a2b08933166f4b61ed9a44b242609/raw/ef4de6c587c6bd7f50210eccd5bd51ff08e6de13/qt.rb
41
-if [ -d "$(brew --cellar)/swig" ]; then
42
-    brew unlink swig
43
-fi
44
-brew install https://gist.githubusercontent.com/DDRBoxman/4cada55c51803a2f963fa40ce55c9d3e/raw/572c67e908bfbc1bcb8c476ea77ea3935133f5b5/swig.rb
45
-
46
-pip install dmgbuild
47
-
48
-export PATH=/usr/local/opt/ccache/libexec:$PATH
49
-ccache -s || echo "CCache is not available."
50
-
51
-# Fetch and untar prebuilt OBS deps that are compatible with older versions of OSX
52
-hr "Downloading OBS deps"
53
-wget --quiet --retry-connrefused --waitretry=1 https://github.com/obsproject/obs-deps/releases/download/2020-04-24/osx-deps-2020-04-24.tar.gz
54
-tar -xf ./osx-deps-2020-04-24.tar.gz -C /tmp
55
-
56
-# Fetch vlc codebase
57
-hr "Downloading VLC repo"
58
-wget --quiet --retry-connrefused --waitretry=1 https://downloads.videolan.org/vlc/3.0.8/vlc-3.0.8.tar.xz
59
-tar -xf vlc-3.0.8.tar.xz
60
-
61
-# Get sparkle
62
-hr "Downloading Sparkle framework"
63
-wget --quiet --retry-connrefused --waitretry=1 -O sparkle.tar.bz2 https://github.com/sparkle-project/Sparkle/releases/download/1.23.0/Sparkle-1.23.0.tar.bz2
64
-mkdir ./sparkle
65
-tar -xf ./sparkle.tar.bz2 -C ./sparkle
66
-sudo cp -R ./sparkle/Sparkle.framework /Library/Frameworks/Sparkle.framework
67
-
68
-# CEF Stuff
69
-hr "Downloading CEF"
70
-wget --quiet --retry-connrefused --waitretry=1 https://obs-nightly.s3-us-west-2.amazonaws.com/cef_binary_${CEF_BUILD_VERSION}_macosx64.tar.bz2
71
-tar -xf ./cef_binary_${CEF_BUILD_VERSION}_macosx64.tar.bz2
72
-cd ./cef_binary_${CEF_BUILD_VERSION}_macosx64
73
-# remove a broken test
74
-sed -i '.orig' '/add_subdirectory(tests\/ceftests)/d' ./CMakeLists.txt
75
-# target 10.11
76
-sed -i '.orig' s/\"10.9\"/\"10.11\"/ ./cmake/cef_variables.cmake
77
-mkdir build
78
-cd ./build
79
-cmake -DCMAKE_CXX_FLAGS="-std=c++11 -stdlib=libc++" -DCMAKE_EXE_LINKER_FLAGS="-std=c++11 -stdlib=libc++" -DCMAKE_OSX_DEPLOYMENT_TARGET=10.11 ..
80
-make -j4
81
-mkdir libcef_dll
82
-cd ../../
83
obs-studio-26.1.0.tar.xz/CI/install/osx Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/install/osx/CMakeLists.pkgproj Deleted
1028
 
1
@@ -1,1026 +0,0 @@
2
-<?xml version="1.0" encoding="UTF-8"?>
3
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4
-<plist version="1.0">
5
-<dict>
6
-   <key>PACKAGES</key>
7
-   <array>
8
-       <dict>
9
-           <key>PACKAGE_FILES</key>
10
-           <dict>
11
-               <key>DEFAULT_INSTALL_LOCATION</key>
12
-               <string>/</string>
13
-               <key>HIERARCHY</key>
14
-               <dict>
15
-                   <key>CHILDREN</key>
16
-                   <array>
17
-                       <dict>
18
-                           <key>CHILDREN</key>
19
-                           <array>
20
-                               <dict>
21
-                                   <key>CHILDREN</key>
22
-                                   <array/>
23
-                                   <key>GID</key>
24
-                                   <integer>80</integer>
25
-                                   <key>PATH</key>
26
-                                   <string>../../../build/OBS.app</string>
27
-                                   <key>PATH_TYPE</key>
28
-                                   <integer>3</integer>
29
-                                   <key>PERMISSIONS</key>
30
-                                   <integer>493</integer>
31
-                                   <key>TYPE</key>
32
-                                   <integer>3</integer>
33
-                                   <key>UID</key>
34
-                                   <integer>0</integer>
35
-                               </dict>
36
-                               <dict>
37
-                                   <key>CHILDREN</key>
38
-                                   <array/>
39
-                                   <key>GID</key>
40
-                                   <integer>80</integer>
41
-                                   <key>PATH</key>
42
-                                   <string>Utilities</string>
43
-                                   <key>PATH_TYPE</key>
44
-                                   <integer>0</integer>
45
-                                   <key>PERMISSIONS</key>
46
-                                   <integer>493</integer>
47
-                                   <key>TYPE</key>
48
-                                   <integer>1</integer>
49
-                                   <key>UID</key>
50
-                                   <integer>0</integer>
51
-                               </dict>
52
-                           </array>
53
-                           <key>GID</key>
54
-                           <integer>80</integer>
55
-                           <key>PATH</key>
56
-                           <string>Applications</string>
57
-                           <key>PATH_TYPE</key>
58
-                           <integer>0</integer>
59
-                           <key>PERMISSIONS</key>
60
-                           <integer>509</integer>
61
-                           <key>TYPE</key>
62
-                           <integer>1</integer>
63
-                           <key>UID</key>
64
-                           <integer>0</integer>
65
-                       </dict>
66
-                       <dict>
67
-                           <key>CHILDREN</key>
68
-                           <array>
69
-                               <dict>
70
-                                   <key>CHILDREN</key>
71
-                                   <array>
72
-                                       <dict>
73
-                                           <key>CHILDREN</key>
74
-                                           <array>
75
-                                               <dict>
76
-                                                   <key>CHILDREN</key>
77
-                                                   <array>
78
-                                                       <dict>
79
-                                                           <key>CHILDREN</key>
80
-                                                           <array>
81
-                                                               <dict>
82
-                                                                   <key>CHILDREN</key>
83
-                                                                   <array>
84
-                                                                       <dict>
85
-                                                                           <key>CHILDREN</key>
86
-                                                                           <array/>
87
-                                                                           <key>GID</key>
88
-                                                                           <integer>80</integer>
89
-                                                                           <key>PATH</key>
90
-                                                                           <string>../../../build/plugins/obs-browser/obs-browser-page</string>
91
-                                                                           <key>PATH_TYPE</key>
92
-                                                                           <integer>3</integer>
93
-                                                                           <key>PERMISSIONS</key>
94
-                                                                           <integer>493</integer>
95
-                                                                           <key>TYPE</key>
96
-                                                                           <integer>3</integer>
97
-                                                                           <key>UID</key>
98
-                                                                           <integer>0</integer>
99
-                                                                       </dict>
100
-                                                                       <dict>
101
-                                                                           <key>CHILDREN</key>
102
-                                                                           <array/>
103
-                                                                           <key>GID</key>
104
-                                                                           <integer>80</integer>
105
-                                                                           <key>PATH</key>
106
-                                                                           <string>../../../build/plugins/obs-browser/obs-browser.so</string>
107
-                                                                           <key>PATH_TYPE</key>
108
-                                                                           <integer>3</integer>
109
-                                                                           <key>PERMISSIONS</key>
110
-                                                                           <integer>493</integer>
111
-                                                                           <key>TYPE</key>
112
-                                                                           <integer>3</integer>
113
-                                                                           <key>UID</key>
114
-                                                                           <integer>0</integer>
115
-                                                                       </dict>
116
-                                                                   </array>
117
-                                                                   <key>GID</key>
118
-                                                                   <integer>80</integer>
119
-                                                                   <key>PATH</key>
120
-                                                                   <string>bin</string>
121
-                                                                   <key>PATH_TYPE</key>
122
-                                                                   <integer>0</integer>
123
-                                                                   <key>PERMISSIONS</key>
124
-                                                                   <integer>493</integer>
125
-                                                                   <key>TYPE</key>
126
-                                                                   <integer>2</integer>
127
-                                                                   <key>UID</key>
128
-                                                                   <integer>0</integer>
129
-                                                               </dict>
130
-                                                           </array>
131
-                                                           <key>GID</key>
132
-                                                           <integer>80</integer>
133
-                                                           <key>PATH</key>
134
-                                                           <string>obs-browser</string>
135
-                                                           <key>PATH_TYPE</key>
136
-                                                           <integer>0</integer>
137
-                                                           <key>PERMISSIONS</key>
138
-                                                           <integer>493</integer>
139
-                                                           <key>TYPE</key>
140
-                                                           <integer>2</integer>
141
-                                                           <key>UID</key>
142
-                                                           <integer>0</integer>
143
-                                                       </dict>
144
-                                                   </array>
145
-                                                   <key>GID</key>
146
-                                                   <integer>80</integer>
147
-                                                   <key>PATH</key>
148
-                                                   <string>plugins</string>
149
-                                                   <key>PATH_TYPE</key>
150
-                                                   <integer>0</integer>
151
-                                                   <key>PERMISSIONS</key>
152
-                                                   <integer>493</integer>
153
-                                                   <key>TYPE</key>
154
-                                                   <integer>2</integer>
155
-                                                   <key>UID</key>
156
-                                                   <integer>0</integer>
157
-                                               </dict>
158
-                                           </array>
159
-                                           <key>GID</key>
160
-                                           <integer>80</integer>
161
-                                           <key>PATH</key>
162
-                                           <string>obs-studio</string>
163
-                                           <key>PATH_TYPE</key>
164
-                                           <integer>0</integer>
165
-                                           <key>PERMISSIONS</key>
166
-                                           <integer>493</integer>
167
-                                           <key>TYPE</key>
168
-                                           <integer>2</integer>
169
-                                           <key>UID</key>
170
-                                           <integer>0</integer>
171
-                                       </dict>
172
-                                   </array>
173
-                                   <key>GID</key>
174
-                                   <integer>80</integer>
175
-                                   <key>PATH</key>
176
-                                   <string>Application Support</string>
177
-                                   <key>PATH_TYPE</key>
178
-                                   <integer>0</integer>
179
-                                   <key>PERMISSIONS</key>
180
-                                   <integer>493</integer>
181
-                                   <key>TYPE</key>
182
-                                   <integer>1</integer>
183
-                                   <key>UID</key>
184
-                                   <integer>0</integer>
185
-                               </dict>
186
-                               <dict>
187
-                                   <key>CHILDREN</key>
188
-                                   <array/>
189
-                                   <key>GID</key>
190
-                                   <integer>0</integer>
191
-                                   <key>PATH</key>
192
-                                   <string>Documentation</string>
193
-                                   <key>PATH_TYPE</key>
194
-                                   <integer>0</integer>
195
-                                   <key>PERMISSIONS</key>
196
-                                   <integer>493</integer>
197
-                                   <key>TYPE</key>
198
-                                   <integer>1</integer>
199
-                                   <key>UID</key>
200
-                                   <integer>0</integer>
201
-                               </dict>
202
-                               <dict>
203
-                                   <key>CHILDREN</key>
204
-                                   <array/>
205
-                                   <key>GID</key>
206
-                                   <integer>0</integer>
207
-                                   <key>PATH</key>
208
-                                   <string>Filesystems</string>
209
-                                   <key>PATH_TYPE</key>
210
-                                   <integer>0</integer>
211
-                                   <key>PERMISSIONS</key>
212
-                                   <integer>493</integer>
213
-                                   <key>TYPE</key>
214
-                                   <integer>1</integer>
215
-                                   <key>UID</key>
216
-                                   <integer>0</integer>
217
-                               </dict>
218
-                               <dict>
219
-                                   <key>CHILDREN</key>
220
-                                   <array/>
221
-                                   <key>GID</key>
222
-                                   <integer>0</integer>
223
-                                   <key>PATH</key>
224
-                                   <string>Frameworks</string>
225
-                                   <key>PATH_TYPE</key>
226
-                                   <integer>0</integer>
227
-                                   <key>PERMISSIONS</key>
228
-                                   <integer>493</integer>
229
-                                   <key>TYPE</key>
230
-                                   <integer>1</integer>
231
-                                   <key>UID</key>
232
-                                   <integer>0</integer>
233
-                               </dict>
234
-                               <dict>
235
-                                   <key>CHILDREN</key>
236
-                                   <array/>
237
-                                   <key>GID</key>
238
-                                   <integer>0</integer>
239
-                                   <key>PATH</key>
240
-                                   <string>Input Methods</string>
241
-                                   <key>PATH_TYPE</key>
242
-                                   <integer>0</integer>
243
-                                   <key>PERMISSIONS</key>
244
-                                   <integer>493</integer>
245
-                                   <key>TYPE</key>
246
-                                   <integer>1</integer>
247
-                                   <key>UID</key>
248
-                                   <integer>0</integer>
249
-                               </dict>
250
-                               <dict>
251
-                                   <key>CHILDREN</key>
252
-                                   <array/>
253
-                                   <key>GID</key>
254
-                                   <integer>0</integer>
255
-                                   <key>PATH</key>
256
-                                   <string>Internet Plug-Ins</string>
257
-                                   <key>PATH_TYPE</key>
258
-                                   <integer>0</integer>
259
-                                   <key>PERMISSIONS</key>
260
-                                   <integer>493</integer>
261
-                                   <key>TYPE</key>
262
-                                   <integer>1</integer>
263
-                                   <key>UID</key>
264
-                                   <integer>0</integer>
265
-                               </dict>
266
-                               <dict>
267
-                                   <key>CHILDREN</key>
268
-                                   <array/>
269
-                                   <key>GID</key>
270
-                                   <integer>0</integer>
271
-                                   <key>PATH</key>
272
-                                   <string>LaunchAgents</string>
273
-                                   <key>PATH_TYPE</key>
274
-                                   <integer>0</integer>
275
-                                   <key>PERMISSIONS</key>
276
-                                   <integer>493</integer>
277
-                                   <key>TYPE</key>
278
-                                   <integer>1</integer>
279
-                                   <key>UID</key>
280
-                                   <integer>0</integer>
281
-                               </dict>
282
-                               <dict>
283
-                                   <key>CHILDREN</key>
284
-                                   <array/>
285
-                                   <key>GID</key>
286
-                                   <integer>0</integer>
287
-                                   <key>PATH</key>
288
-                                   <string>LaunchDaemons</string>
289
-                                   <key>PATH_TYPE</key>
290
-                                   <integer>0</integer>
291
-                                   <key>PERMISSIONS</key>
292
-                                   <integer>493</integer>
293
-                                   <key>TYPE</key>
294
-                                   <integer>1</integer>
295
-                                   <key>UID</key>
296
-                                   <integer>0</integer>
297
-                               </dict>
298
-                               <dict>
299
-                                   <key>CHILDREN</key>
300
-                                   <array/>
301
-                                   <key>GID</key>
302
-                                   <integer>0</integer>
303
-                                   <key>PATH</key>
304
-                                   <string>PreferencePanes</string>
305
-                                   <key>PATH_TYPE</key>
306
-                                   <integer>0</integer>
307
-                                   <key>PERMISSIONS</key>
308
-                                   <integer>493</integer>
309
-                                   <key>TYPE</key>
310
-                                   <integer>1</integer>
311
-                                   <key>UID</key>
312
-                                   <integer>0</integer>
313
-                               </dict>
314
-                               <dict>
315
-                                   <key>CHILDREN</key>
316
-                                   <array/>
317
-                                   <key>GID</key>
318
-                                   <integer>0</integer>
319
-                                   <key>PATH</key>
320
-                                   <string>Preferences</string>
321
-                                   <key>PATH_TYPE</key>
322
-                                   <integer>0</integer>
323
-                                   <key>PERMISSIONS</key>
324
-                                   <integer>493</integer>
325
-                                   <key>TYPE</key>
326
-                                   <integer>1</integer>
327
-                                   <key>UID</key>
328
-                                   <integer>0</integer>
329
-                               </dict>
330
-                               <dict>
331
-                                   <key>CHILDREN</key>
332
-                                   <array/>
333
-                                   <key>GID</key>
334
-                                   <integer>80</integer>
335
-                                   <key>PATH</key>
336
-                                   <string>Printers</string>
337
-                                   <key>PATH_TYPE</key>
338
-                                   <integer>0</integer>
339
-                                   <key>PERMISSIONS</key>
340
-                                   <integer>493</integer>
341
-                                   <key>TYPE</key>
342
-                                   <integer>1</integer>
343
-                                   <key>UID</key>
344
-                                   <integer>0</integer>
345
-                               </dict>
346
-                               <dict>
347
-                                   <key>CHILDREN</key>
348
-                                   <array/>
349
-                                   <key>GID</key>
350
-                                   <integer>0</integer>
351
-                                   <key>PATH</key>
352
-                                   <string>PrivilegedHelperTools</string>
353
-                                   <key>PATH_TYPE</key>
354
-                                   <integer>0</integer>
355
-                                   <key>PERMISSIONS</key>
356
-                                   <integer>493</integer>
357
-                                   <key>TYPE</key>
358
-                                   <integer>1</integer>
359
-                                   <key>UID</key>
360
-                                   <integer>0</integer>
361
-                               </dict>
362
-                               <dict>
363
-                                   <key>CHILDREN</key>
364
-                                   <array/>
365
-                                   <key>GID</key>
366
-                                   <integer>0</integer>
367
-                                   <key>PATH</key>
368
-                                   <string>QuickLook</string>
369
-                                   <key>PATH_TYPE</key>
370
-                                   <integer>0</integer>
371
-                                   <key>PERMISSIONS</key>
372
-                                   <integer>493</integer>
373
-                                   <key>TYPE</key>
374
-                                   <integer>1</integer>
375
-                                   <key>UID</key>
376
-                                   <integer>0</integer>
377
-                               </dict>
378
-                               <dict>
379
-                                   <key>CHILDREN</key>
380
-                                   <array/>
381
-                                   <key>GID</key>
382
-                                   <integer>0</integer>
383
-                                   <key>PATH</key>
384
-                                   <string>QuickTime</string>
385
-                                   <key>PATH_TYPE</key>
386
-                                   <integer>0</integer>
387
-                                   <key>PERMISSIONS</key>
388
-                                   <integer>493</integer>
389
-                                   <key>TYPE</key>
390
-                                   <integer>1</integer>
391
-                                   <key>UID</key>
392
-                                   <integer>0</integer>
393
-                               </dict>
394
-                               <dict>
395
-                                   <key>CHILDREN</key>
396
-                                   <array/>
397
-                                   <key>GID</key>
398
-                                   <integer>0</integer>
399
-                                   <key>PATH</key>
400
-                                   <string>Screen Savers</string>
401
-                                   <key>PATH_TYPE</key>
402
-                                   <integer>0</integer>
403
-                                   <key>PERMISSIONS</key>
404
-                                   <integer>493</integer>
405
-                                   <key>TYPE</key>
406
-                                   <integer>1</integer>
407
-                                   <key>UID</key>
408
-                                   <integer>0</integer>
409
-                               </dict>
410
-                               <dict>
411
-                                   <key>CHILDREN</key>
412
-                                   <array/>
413
-                                   <key>GID</key>
414
-                                   <integer>0</integer>
415
-                                   <key>PATH</key>
416
-                                   <string>Scripts</string>
417
-                                   <key>PATH_TYPE</key>
418
-                                   <integer>0</integer>
419
-                                   <key>PERMISSIONS</key>
420
-                                   <integer>493</integer>
421
-                                   <key>TYPE</key>
422
-                                   <integer>1</integer>
423
-                                   <key>UID</key>
424
-                                   <integer>0</integer>
425
-                               </dict>
426
-                               <dict>
427
-                                   <key>CHILDREN</key>
428
-                                   <array/>
429
-                                   <key>GID</key>
430
-                                   <integer>0</integer>
431
-                                   <key>PATH</key>
432
-                                   <string>Services</string>
433
-                                   <key>PATH_TYPE</key>
434
-                                   <integer>0</integer>
435
-                                   <key>PERMISSIONS</key>
436
-                                   <integer>493</integer>
437
-                                   <key>TYPE</key>
438
-                                   <integer>1</integer>
439
-                                   <key>UID</key>
440
-                                   <integer>0</integer>
441
-                               </dict>
442
-                               <dict>
443
-                                   <key>CHILDREN</key>
444
-                                   <array/>
445
-                                   <key>GID</key>
446
-                                   <integer>0</integer>
447
-                                   <key>PATH</key>
448
-                                   <string>Widgets</string>
449
-                                   <key>PATH_TYPE</key>
450
-                                   <integer>0</integer>
451
-                                   <key>PERMISSIONS</key>
452
-                                   <integer>493</integer>
453
-                                   <key>TYPE</key>
454
-                                   <integer>1</integer>
455
-                                   <key>UID</key>
456
-                                   <integer>0</integer>
457
-                               </dict>
458
-                           </array>
459
-                           <key>GID</key>
460
-                           <integer>0</integer>
461
-                           <key>PATH</key>
462
-                           <string>Library</string>
463
-                           <key>PATH_TYPE</key>
464
-                           <integer>0</integer>
465
-                           <key>PERMISSIONS</key>
466
-                           <integer>493</integer>
467
-                           <key>TYPE</key>
468
-                           <integer>1</integer>
469
-                           <key>UID</key>
470
-                           <integer>0</integer>
471
-                       </dict>
472
-                       <dict>
473
-                           <key>CHILDREN</key>
474
-                           <array>
475
-                               <dict>
476
-                                   <key>CHILDREN</key>
477
-                                   <array>
478
-                                       <dict>
479
-                                           <key>CHILDREN</key>
480
-                                           <array/>
481
-                                           <key>GID</key>
482
-                                           <integer>0</integer>
483
-                                           <key>PATH</key>
484
-                                           <string>Extensions</string>
485
-                                           <key>PATH_TYPE</key>
486
-                                           <integer>0</integer>
487
-                                           <key>PERMISSIONS</key>
488
-                                           <integer>493</integer>
489
-                                           <key>TYPE</key>
490
-                                           <integer>1</integer>
491
-                                           <key>UID</key>
492
-                                           <integer>0</integer>
493
-                                       </dict>
494
-                                   </array>
495
-                                   <key>GID</key>
496
-                                   <integer>0</integer>
497
-                                   <key>PATH</key>
498
-                                   <string>Library</string>
499
-                                   <key>PATH_TYPE</key>
500
-                                   <integer>0</integer>
501
-                                   <key>PERMISSIONS</key>
502
-                                   <integer>493</integer>
503
-                                   <key>TYPE</key>
504
-                                   <integer>1</integer>
505
-                                   <key>UID</key>
506
-                                   <integer>0</integer>
507
-                               </dict>
508
-                           </array>
509
-                           <key>GID</key>
510
-                           <integer>0</integer>
511
-                           <key>PATH</key>
512
-                           <string>System</string>
513
-                           <key>PATH_TYPE</key>
514
-                           <integer>0</integer>
515
-                           <key>PERMISSIONS</key>
516
-                           <integer>493</integer>
517
-                           <key>TYPE</key>
518
-                           <integer>1</integer>
519
-                           <key>UID</key>
520
-                           <integer>0</integer>
521
-                       </dict>
522
-                       <dict>
523
-                           <key>CHILDREN</key>
524
-                           <array>
525
-                               <dict>
526
-                                   <key>CHILDREN</key>
527
-                                   <array/>
528
-                                   <key>GID</key>
529
-                                   <integer>0</integer>
530
-                                   <key>PATH</key>
531
-                                   <string>Shared</string>
532
-                                   <key>PATH_TYPE</key>
533
-                                   <integer>0</integer>
534
-                                   <key>PERMISSIONS</key>
535
-                                   <integer>1023</integer>
536
-                                   <key>TYPE</key>
537
-                                   <integer>1</integer>
538
-                                   <key>UID</key>
539
-                                   <integer>0</integer>
540
-                               </dict>
541
-                           </array>
542
-                           <key>GID</key>
543
-                           <integer>80</integer>
544
-                           <key>PATH</key>
545
-                           <string>Users</string>
546
-                           <key>PATH_TYPE</key>
547
-                           <integer>0</integer>
548
-                           <key>PERMISSIONS</key>
549
-                           <integer>493</integer>
550
-                           <key>TYPE</key>
551
-                           <integer>1</integer>
552
-                           <key>UID</key>
553
-                           <integer>0</integer>
554
-                       </dict>
555
-                   </array>
556
-                   <key>GID</key>
557
-                   <integer>0</integer>
558
-                   <key>PATH</key>
559
-                   <string>/</string>
560
-                   <key>PATH_TYPE</key>
561
-                   <integer>0</integer>
562
-                   <key>PERMISSIONS</key>
563
-                   <integer>493</integer>
564
-                   <key>TYPE</key>
565
-                   <integer>1</integer>
566
-                   <key>UID</key>
567
-                   <integer>0</integer>
568
-               </dict>
569
-               <key>PAYLOAD_TYPE</key>
570
-               <integer>0</integer>
571
-               <key>VERSION</key>
572
-               <integer>2</integer>
573
-           </dict>
574
-           <key>PACKAGE_SCRIPTS</key>
575
-           <dict>
576
-               <key>POSTINSTALL_PATH</key>
577
-               <dict>
578
-                   <key>PATH</key>
579
-                   <string>post-install.sh</string>
580
-                   <key>PATH_TYPE</key>
581
-                   <integer>3</integer>
582
-               </dict>
583
-               <key>PREINSTALL_PATH</key>
584
-               <dict/>
585
-               <key>RESOURCES</key>
586
-               <array/>
587
-           </dict>
588
-           <key>PACKAGE_SETTINGS</key>
589
-           <dict>
590
-               <key>AUTHENTICATION</key>
591
-               <integer>1</integer>
592
-               <key>CONCLUSION_ACTION</key>
593
-               <integer>0</integer>
594
-               <key>IDENTIFIER</key>
595
-               <string>org.obsproject.pkg.obs-studio</string>
596
-               <key>NAME</key>
597
-               <string>OBS</string>
598
-               <key>OVERWRITE_PERMISSIONS</key>
599
-               <false/>
600
-               <key>VERSION</key>
601
-               <string>1.0</string>
602
-           </dict>
603
-           <key>UUID</key>
604
-           <string>19CCE3F2-8911-4364-B673-8B5BC3ABD4DA</string>
605
-       </dict>
606
-       <dict>
607
-           <key>PACKAGE_SETTINGS</key>
608
-           <dict>
609
-               <key>LOCATION</key>
610
-               <integer>0</integer>
611
-               <key>NAME</key>
612
-               <string>SyphonInject</string>
613
-           </dict>
614
-           <key>PATH</key>
615
-           <dict>
616
-               <key>PATH</key>
617
-               <string>SyphonInject.pkg</string>
618
-               <key>PATH_TYPE</key>
619
-               <integer>1</integer>
620
-           </dict>
621
-           <key>TYPE</key>
622
-           <integer>1</integer>
623
-           <key>UUID</key>
624
-           <string>0CC9C67E-4D14-4794-9930-019925513B1C</string>
625
-       </dict>
626
-   </array>
627
-   <key>PROJECT</key>
628
-   <dict>
629
-       <key>PROJECT_COMMENTS</key>
630
-       <dict>
631
-           <key>NOTES</key>
632
-           <data>
633
-           PCFET0NUWVBFIGh0bWwgUFVCTElDICItLy9XM0MvL0RURCBIVE1M
634
-           IDQuMDEvL0VOIiAiaHR0cDovL3d3dy53My5vcmcvVFIvaHRtbDQv
635
-           c3RyaWN0LmR0ZCI+CjxodG1sPgo8aGVhZD4KPG1ldGEgaHR0cC1l
636
-           cXVpdj0iQ29udGVudC1UeXBlIiBjb250ZW50PSJ0ZXh0L2h0bWw7
637
-           IGNoYXJzZXQ9VVRGLTgiPgo8bWV0YSBodHRwLWVxdWl2PSJDb250
638
-           ZW50LVN0eWxlLVR5cGUiIGNvbnRlbnQ9InRleHQvY3NzIj4KPHRp
639
-           dGxlPjwvdGl0bGU+CjxtZXRhIG5hbWU9IkdlbmVyYXRvciIgY29u
640
-           dGVudD0iQ29jb2EgSFRNTCBXcml0ZXIiPgo8bWV0YSBuYW1lPSJD
641
-           b2NvYVZlcnNpb24iIGNvbnRlbnQ9IjE1MDQuODEiPgo8c3R5bGUg
642
-           dHlwZT0idGV4dC9jc3MiPgo8L3N0eWxlPgo8L2hlYWQ+Cjxib2R5
643
-           Pgo8L2JvZHk+CjwvaHRtbD4K
644
-           </data>
645
-       </dict>
646
-       <key>PROJECT_PRESENTATION</key>
647
-       <dict>
648
-           <key>BACKGROUND</key>
649
-           <dict>
650
-               <key>ALIGNMENT</key>
651
-               <integer>4</integer>
652
-               <key>BACKGROUND_PATH</key>
653
-               <dict>
654
-                   <key>PATH</key>
655
-                   <string>obs.png</string>
656
-                   <key>PATH_TYPE</key>
657
-                   <integer>1</integer>
658
-               </dict>
659
-               <key>CUSTOM</key>
660
-               <integer>1</integer>
661
-               <key>SCALING</key>
662
-               <integer>0</integer>
663
-           </dict>
664
-           <key>INSTALLATION TYPE</key>
665
-           <dict>
666
-               <key>HIERARCHIES</key>
667
-               <dict>
668
-                   <key>INSTALLER</key>
669
-                   <dict>
670
-                       <key>LIST</key>
671
-                       <array>
672
-                           <dict>
673
-                               <key>DESCRIPTION</key>
674
-                               <array/>
675
-                               <key>OPTIONS</key>
676
-                               <dict>
677
-                                   <key>HIDDEN</key>
678
-                                   <false/>
679
-                                   <key>STATE</key>
680
-                                   <integer>0</integer>
681
-                               </dict>
682
-                               <key>PACKAGE_UUID</key>
683
-                               <string>19CCE3F2-8911-4364-B673-8B5BC3ABD4DA</string>
684
-                               <key>REQUIREMENTS</key>
685
-                               <array/>
686
-                               <key>TITLE</key>
687
-                               <array/>
688
-                               <key>TOOLTIP</key>
689
-                               <array/>
690
-                               <key>TYPE</key>
691
-                               <integer>0</integer>
692
-                               <key>UUID</key>
693
-                               <string>7C540711-59F4-479C-9CFD-8C4D6594992E</string>
694
-                           </dict>
695
-                           <dict>
696
-                               <key>DESCRIPTION</key>
697
-                               <array/>
698
-                               <key>OPTIONS</key>
699
-                               <dict>
700
-                                   <key>HIDDEN</key>
701
-                                   <false/>
702
-                                   <key>STATE</key>
703
-                                   <integer>1</integer>
704
-                               </dict>
705
-                               <key>PACKAGE_UUID</key>
706
-                               <string>0CC9C67E-4D14-4794-9930-019925513B1C</string>
707
-                               <key>REQUIREMENTS</key>
708
-                               <array/>
709
-                               <key>TITLE</key>
710
-                               <array/>
711
-                               <key>TOOLTIP</key>
712
-                               <array/>
713
-                               <key>TYPE</key>
714
-                               <integer>0</integer>
715
-                               <key>UUID</key>
716
-                               <string>BBDE08F6-D7EE-47CB-881F-7F208B3A604B</string>
717
-                           </dict>
718
-                       </array>
719
-                       <key>REMOVED</key>
720
-                       <dict/>
721
-                   </dict>
722
-               </dict>
723
-               <key>INSTALLATION TYPE</key>
724
-               <integer>0</integer>
725
-               <key>MODE</key>
726
-               <integer>0</integer>
727
-           </dict>
728
-           <key>INSTALLATION_STEPS</key>
729
-           <array>
730
-               <dict>
731
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
732
-                   <string>ICPresentationViewIntroductionController</string>
733
-                   <key>INSTALLER_PLUGIN</key>
734
-                   <string>Introduction</string>
735
-                   <key>LIST_TITLE_KEY</key>
736
-                   <string>InstallerSectionTitle</string>
737
-               </dict>
738
-               <dict>
739
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
740
-                   <string>ICPresentationViewReadMeController</string>
741
-                   <key>INSTALLER_PLUGIN</key>
742
-                   <string>ReadMe</string>
743
-                   <key>LIST_TITLE_KEY</key>
744
-                   <string>InstallerSectionTitle</string>
745
-               </dict>
746
-               <dict>
747
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
748
-                   <string>ICPresentationViewLicenseController</string>
749
-                   <key>INSTALLER_PLUGIN</key>
750
-                   <string>License</string>
751
-                   <key>LIST_TITLE_KEY</key>
752
-                   <string>InstallerSectionTitle</string>
753
-               </dict>
754
-               <dict>
755
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
756
-                   <string>ICPresentationViewDestinationSelectController</string>
757
-                   <key>INSTALLER_PLUGIN</key>
758
-                   <string>TargetSelect</string>
759
-                   <key>LIST_TITLE_KEY</key>
760
-                   <string>InstallerSectionTitle</string>
761
-               </dict>
762
-               <dict>
763
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
764
-                   <string>ICPresentationViewInstallationTypeController</string>
765
-                   <key>INSTALLER_PLUGIN</key>
766
-                   <string>PackageSelection</string>
767
-                   <key>LIST_TITLE_KEY</key>
768
-                   <string>InstallerSectionTitle</string>
769
-               </dict>
770
-               <dict>
771
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
772
-                   <string>ICPresentationViewInstallationController</string>
773
-                   <key>INSTALLER_PLUGIN</key>
774
-                   <string>Install</string>
775
-                   <key>LIST_TITLE_KEY</key>
776
-                   <string>InstallerSectionTitle</string>
777
-               </dict>
778
-               <dict>
779
-                   <key>ICPRESENTATION_CHAPTER_VIEW_CONTROLLER_CLASS</key>
780
-                   <string>ICPresentationViewSummaryController</string>
781
-                   <key>INSTALLER_PLUGIN</key>
782
-                   <string>Summary</string>
783
-                   <key>LIST_TITLE_KEY</key>
784
-                   <string>InstallerSectionTitle</string>
785
-               </dict>
786
-           </array>
787
-           <key>INTRODUCTION</key>
788
-           <dict>
789
-               <key>LOCALIZATIONS</key>
790
-               <array/>
791
-           </dict>
792
-           <key>LICENSE</key>
793
-           <dict>
794
-               <key>KEYWORDS</key>
795
-               <dict/>
796
-               <key>LOCALIZATIONS</key>
797
-               <array/>
798
-               <key>MODE</key>
799
-               <integer>0</integer>
800
-           </dict>
801
-           <key>README</key>
802
-           <dict>
803
-               <key>LOCALIZATIONS</key>
804
-               <array/>
805
-           </dict>
806
-           <key>SUMMARY</key>
807
-           <dict>
808
-               <key>LOCALIZATIONS</key>
809
-               <array/>
810
-           </dict>
811
-           <key>TITLE</key>
812
-           <dict>
813
-               <key>LOCALIZATIONS</key>
814
-               <array>
815
-                   <dict>
816
-                       <key>LANGUAGE</key>
817
-                       <string>English</string>
818
-                       <key>VALUE</key>
819
-                       <string>OBS</string>
820
-                   </dict>
821
-               </array>
822
-           </dict>
823
-       </dict>
824
-       <key>PROJECT_REQUIREMENTS</key>
825
-       <dict>
826
-           <key>LIST</key>
827
-           <array/>
828
-           <key>POSTINSTALL_PATH</key>
829
-           <dict/>
830
-           <key>PREINSTALL_PATH</key>
831
-           <dict/>
832
-           <key>RESOURCES</key>
833
-           <array/>
834
-           <key>ROOT_VOLUME_ONLY</key>
835
-           <false/>
836
-       </dict>
837
-       <key>PROJECT_SETTINGS</key>
838
-       <dict>
839
-           <key>ADVANCED_OPTIONS</key>
840
-           <dict/>
841
-           <key>BUILD_FORMAT</key>
842
-           <integer>0</integer>
843
-           <key>BUILD_PATH</key>
844
-           <dict>
845
-               <key>PATH</key>
846
-               <string>../../../build</string>
847
-               <key>PATH_TYPE</key>
848
-               <integer>3</integer>
849
-           </dict>
850
-           <key>EXCLUDED_FILES</key>
851
-           <array>
852
-               <dict>
853
-                   <key>PATTERNS_ARRAY</key>
854
-                   <array>
855
-                       <dict>
856
-                           <key>REGULAR_EXPRESSION</key>
857
-                           <false/>
858
-                           <key>STRING</key>
859
-                           <string>.DS_Store</string>
860
-                           <key>TYPE</key>
861
-                           <integer>0</integer>
862
-                       </dict>
863
-                   </array>
864
-                   <key>PROTECTED</key>
865
-                   <true/>
866
-                   <key>PROXY_NAME</key>
867
-                   <string>Remove .DS_Store files</string>
868
-                   <key>PROXY_TOOLTIP</key>
869
-                   <string>Remove ".DS_Store" files created by the Finder.</string>
870
-                   <key>STATE</key>
871
-                   <true/>
872
-               </dict>
873
-               <dict>
874
-                   <key>PATTERNS_ARRAY</key>
875
-                   <array>
876
-                       <dict>
877
-                           <key>REGULAR_EXPRESSION</key>
878
-                           <false/>
879
-                           <key>STRING</key>
880
-                           <string>.pbdevelopment</string>
881
-                           <key>TYPE</key>
882
-                           <integer>0</integer>
883
-                       </dict>
884
-                   </array>
885
-                   <key>PROTECTED</key>
886
-                   <true/>
887
-                   <key>PROXY_NAME</key>
888
-                   <string>Remove .pbdevelopment files</string>
889
-                   <key>PROXY_TOOLTIP</key>
890
-                   <string>Remove ".pbdevelopment" files created by ProjectBuilder or Xcode.</string>
891
-                   <key>STATE</key>
892
-                   <true/>
893
-               </dict>
894
-               <dict>
895
-                   <key>PATTERNS_ARRAY</key>
896
-                   <array>
897
-                       <dict>
898
-                           <key>REGULAR_EXPRESSION</key>
899
-                           <false/>
900
-                           <key>STRING</key>
901
-                           <string>CVS</string>
902
-                           <key>TYPE</key>
903
-                           <integer>1</integer>
904
-                       </dict>
905
-                       <dict>
906
-                           <key>REGULAR_EXPRESSION</key>
907
-                           <false/>
908
-                           <key>STRING</key>
909
-                           <string>.cvsignore</string>
910
-                           <key>TYPE</key>
911
-                           <integer>0</integer>
912
-                       </dict>
913
-                       <dict>
914
-                           <key>REGULAR_EXPRESSION</key>
915
-                           <false/>
916
-                           <key>STRING</key>
917
-                           <string>.cvspass</string>
918
-                           <key>TYPE</key>
919
-                           <integer>0</integer>
920
-                       </dict>
921
-                       <dict>
922
-                           <key>REGULAR_EXPRESSION</key>
923
-                           <false/>
924
-                           <key>STRING</key>
925
-                           <string>.svn</string>
926
-                           <key>TYPE</key>
927
-                           <integer>1</integer>
928
-                       </dict>
929
-                       <dict>
930
-                           <key>REGULAR_EXPRESSION</key>
931
-                           <false/>
932
-                           <key>STRING</key>
933
-                           <string>.git</string>
934
-                           <key>TYPE</key>
935
-                           <integer>1</integer>
936
-                       </dict>
937
-                       <dict>
938
-                           <key>REGULAR_EXPRESSION</key>
939
-                           <false/>
940
-                           <key>STRING</key>
941
-                           <string>.gitignore</string>
942
-                           <key>TYPE</key>
943
-                           <integer>0</integer>
944
-                       </dict>
945
-                   </array>
946
-                   <key>PROTECTED</key>
947
-                   <true/>
948
-                   <key>PROXY_NAME</key>
949
-                   <string>Remove SCM metadata</string>
950
-                   <key>PROXY_TOOLTIP</key>
951
-                   <string>Remove helper files and folders used by the CVS, SVN or Git Source Code Management systems.</string>
952
-                   <key>STATE</key>
953
-                   <true/>
954
-               </dict>
955
-               <dict>
956
-                   <key>PATTERNS_ARRAY</key>
957
-                   <array>
958
-                       <dict>
959
-                           <key>REGULAR_EXPRESSION</key>
960
-                           <false/>
961
-                           <key>STRING</key>
962
-                           <string>classes.nib</string>
963
-                           <key>TYPE</key>
964
-                           <integer>0</integer>
965
-                       </dict>
966
-                       <dict>
967
-                           <key>REGULAR_EXPRESSION</key>
968
-                           <false/>
969
-                           <key>STRING</key>
970
-                           <string>designable.db</string>
971
-                           <key>TYPE</key>
972
-                           <integer>0</integer>
973
-                       </dict>
974
-                       <dict>
975
-                           <key>REGULAR_EXPRESSION</key>
976
-                           <false/>
977
-                           <key>STRING</key>
978
-                           <string>info.nib</string>
979
-                           <key>TYPE</key>
980
-                           <integer>0</integer>
981
-                       </dict>
982
-                   </array>
983
-                   <key>PROTECTED</key>
984
-                   <true/>
985
-                   <key>PROXY_NAME</key>
986
-                   <string>Optimize nib files</string>
987
-                   <key>PROXY_TOOLTIP</key>
988
-                   <string>Remove "classes.nib", "info.nib" and "designable.nib" files within .nib bundles.</string>
989
-                   <key>STATE</key>
990
-                   <true/>
991
-               </dict>
992
-               <dict>
993
-                   <key>PATTERNS_ARRAY</key>
994
-                   <array>
995
-                       <dict>
996
-                           <key>REGULAR_EXPRESSION</key>
997
-                           <false/>
998
-                           <key>STRING</key>
999
-                           <string>Resources Disabled</string>
1000
-                           <key>TYPE</key>
1001
-                           <integer>1</integer>
1002
-                       </dict>
1003
-                   </array>
1004
-                   <key>PROTECTED</key>
1005
-                   <true/>
1006
-                   <key>PROXY_NAME</key>
1007
-                   <string>Remove Resources Disabled folders</string>
1008
-                   <key>PROXY_TOOLTIP</key>
1009
-                   <string>Remove "Resources Disabled" folders.</string>
1010
-                   <key>STATE</key>
1011
-                   <true/>
1012
-               </dict>
1013
-               <dict>
1014
-                   <key>SEPARATOR</key>
1015
-                   <true/>
1016
-               </dict>
1017
-           </array>
1018
-           <key>NAME</key>
1019
-           <string>OBS</string>
1020
-       </dict>
1021
-   </dict>
1022
-   <key>TYPE</key>
1023
-   <integer>0</integer>
1024
-   <key>VERSION</key>
1025
-   <integer>2</integer>
1026
-</dict>
1027
-</plist>
1028
obs-studio-26.1.0.tar.xz/CI/install/osx/Info.plist Deleted
30
 
1
@@ -1,28 +0,0 @@
2
-<?xml version="1.0" encoding="UTF-8"?>
3
-<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4
-<plist version="1.0">
5
-<dict>
6
-   <key>CFBundleIconFile</key>
7
-   <string>obs.icns</string>
8
-   <key>CFBundleName</key>
9
-   <string>OBS</string>
10
-   <key>CFBundleGetInfoString</key>
11
-   <string>OBS - Free and Open Source Streaming/Recording Software</string>
12
-   <key>CFBundleExecutable</key>
13
-   <string>obs</string>
14
-   <key>CFBundleIdentifier</key>
15
-   <string>com.obsproject.obs-studio</string>
16
-   <key>CFBundlePackageType</key>
17
-   <string>APPL</string>
18
-   <key>LSMinimumSystemVersion</key>
19
-   <string>10.8.5</string>
20
-   <key>NSHighResolutionCapable</key>
21
-   <true/>
22
-   <key>LSAppNapIsDisabled</key>
23
-   <true/>
24
-   <key>NSCameraUsageDescription</key>
25
-   <string>OBS needs to access the camera to enable camera sources to work.</string>
26
-   <key>NSMicrophoneUsageDescription</key>
27
-   <string>OBS needs to access the microphone to enable audio input.</string>
28
-</dict>
29
-</plist>
30
obs-studio-26.1.0.tar.xz/CI/install/osx/OBSPublicDSAKey.pem Deleted
38
 
1
@@ -1,36 +0,0 @@
2
------BEGIN PUBLIC KEY-----
3
-MIIGPDCCBC4GByqGSM44BAEwggQhAoICAQCZZZ2y7H2GJmMfP4KQihJTJOoiGNUw
4
-mue6sqMbH+utRykRnSKBZux6R665eRFMpNgrgFO1TLLGbdD2U31KiGtCvFJOmOl3
5
-+QP055BuXjEG36NU7AWEFLAlbDlr/2D3oumq3Ib3iMnnr9RrVztJ2VFOvVio1eWr
6
-ZxboVwKPK8D6BqsWiv15vbYlJnTC4Fls6ySmdjVBxwoPlTaMu1ysi5DfbIZ93s5u
7
-aQt1FvXuWtPBWjyVUORcNbcWf49E5R2pV0OSBK95Hw2/wXz4vmj+w92dTePGnVaW
8
-Me4CoF5PIeZILwp6DCLStX4eW2WG1NChJTC8zeQ/3bMMoGyKM/MadyvrDqMywsKY
9
-caxkIwHrDKOEdXXGo80dIwZMMLipPA8DKhx5ojphfkeXjIhKSx+49knXT3ED5okE
10
-Wai7tGUXj/8D8sGh+7b+AVsdujvr4v8WQaZiKUOZ2IIHOg3VLz9T9v0zet1Yt987
11
-KNymFcp2CHeJ6KnDP/ZGQ6Nl0HsPxUgscsXV+R2FEc8Q1j0Ukkuxnopa0E4/huUu
12
-gjyRzpXD734qFMDf7LcXca6qNjBor6gVj5sRyRKCpZ+KQfMUlr8jp506ztYSyeJu
13
-dxJV30tQgztwkbrs02CqOt4Z3Peo6sdht7hWKSPVwmja3tq8/TfUSSoo6wKYN9/w
14
-Mf3dVeRF8hCzJQIVAJnzuzmzQhCKPiQnl3jh5qGII2XfAoICAQCCVATAff89ceHj
15
-ROHEbHTQFpVxJ/kRZPfxnU46DSw79Tih7tthV68oakPSOTP3cx/Tga0GwogarZ9N
16
-F2VVan5w9OQSSewXsr5UDT5bnmJF+h+JB7TMy+sXZBYobUqjlUd5VtKc8RsN86P4
17
-s7xbK0mA+hfe+27r18JT81/eH3xUfh7UOUGSdMN2Ch9f7RFSMZIgUAZUzu2K3ODp
18
-hPgtc2QJ8QVAp7GLvQgw8ZUME/ChZslyBIyJvYgUIxfxlgRWYro5pQT7/ngkgdXo
19
-wlghHKkldwMuY3zaFdhPnFNuEUEtc18ILsbz0+AnagCUd6n+3safskCRqLIHMOY6
20
-iLBSZPX9hJQhVCqSqz1VNDDww8FNa/fojJ1Lr/TI0I+0Ib2pCiY2LChXUqGY5SLZ
21
-2KNs5qFsyZP+I0L8YsGwqvUYyFwk7Ok224n0NtaOwqpLCrtXd/i6DaDNiaoJuwJC
22
-1ELCfaZivorgkC5rhBt2H7qWUAR+EtrFE/gb0k/G5EIhjYql7onGbX+G2re38vQA
23
-fg1pzguhig2dafP/BxMLZrn1Gg61xzmEYPuS9gclktaf675srv8GVb46VkOxXL+D
24
-YvTmpJPP7UUOVlmAMCo4j4y09MW3jq9TDp42VTLeZVubyjslGnavlnq1O+ZyXUye
25
-1FMeby65sIbSHHHwoFnRv3hLSEXI5gOCAgYAAoICAQCUkYnZkPfHfOJZI403xUYP
26
-CE/bLpkza074Xo6EXElsWRnpQgNTx+JFOvItgj3v0OkIqDin9UredKOwfkiftslV
27
-jxUVKA6I5kwnGvCpvTpQMLyLjq+VQr+J2D6eId6tV/iajhdu5r4JThU8KllT7Ywb
28
-NAur34ftLNCVAMRUaDNeEoHfePgderW384e+lbvpmtifmBluammGSxxRtUsdjvJZ
29
-BFkhaJu86CKxcU7D1lbPVOtV/jaxz6d16VdGcfBdi2LzXZzZtYpT9XGPX3NF+xii
30
-spAURWsoe11LTRXF+eJhgCm5iIDN3kh1HEQKYKAVpmrcM0aFzk/NpS+tFyU72vaq
31
-IRSSJw/aa1oELOAakG5oPldc4RcYWl32sbnVwXHO7TZvgTrBSC10o65MAC5CHP/s
32
-b07heDYAIt7re7szvOYq+c/9zAMAlu3pcO8MqaXYMmybdHBXHQ2b+DdJWHmIUWcX
33
-CbUzr09vzGkJAvqsXqbmJPr8aixrO75DhT0iDTILLWe/GWK51nf+Tg0pNxVgGyAl
34
-BqvRqqo7SSDu9FMkwQesFFHhuoHLyEHwVPJ+sMQTNwQcm9c6YuW8EYDRSkeKLWYk
35
-3fkjG+Pe9uVE8a1taDg3FjSY0UqjUT6XMw+i0Lajyus2L6wFBwrrGM6E4xa6x1CC
36
-MGjmuSOlPA1umQsToIcO4g==
37
------END PUBLIC KEY-----
38
obs-studio-26.1.0.tar.xz/CI/install/osx/SyphonInject.pkg Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.png Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/QuickLook Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/QuickLook/Icon.tiff Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/QuickLook/Preview.tiff Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/QuickLook/Thumbnail.tiff Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/data Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/data/556CF265-5721-4F18-BE83-8CF39483B4C2 Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/data/8CA689C3-ED2A-459E-952C-E08026CFCD07 Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.pxd/metadata.info Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background.tiff Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/background@2x.png Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/buildDMG Deleted
3
 
1
@@ -1,1 +0,0 @@
2
-dmgbuild -s ./settings.json "OBS" obs.dmg
3
obs-studio-26.1.0.tar.xz/CI/install/osx/build_app.py Deleted
240
 
1
@@ -1,238 +0,0 @@
2
-#!/usr/bin/env python
3
-
4
-candidate_paths = "bin obs-plugins data".split()
5
-
6
-plist_path = "../cmake/osxbundle/Info.plist"
7
-icon_path = "../cmake/osxbundle/obs.icns"
8
-run_path = "../cmake/osxbundle/obslaunch.sh"
9
-
10
-#not copied
11
-blacklist = """/usr /System""".split()
12
-
13
-#copied
14
-whitelist = """/usr/local""".split()
15
-
16
-#
17
-#
18
-#
19
-
20
-
21
-from sys import argv
22
-from glob import glob
23
-from subprocess import check_output, call
24
-from collections import namedtuple
25
-from shutil import copy, copytree, rmtree
26
-from os import makedirs, rename, walk, path as ospath
27
-import plistlib
28
-
29
-import argparse
30
-
31
-def _str_to_bool(s):
32
-    """Convert string to bool (in argparse context)."""
33
-    if s.lower() not in ['true', 'false']:
34
-        raise ValueError('Need bool; got %r' % s)
35
-    return {'true': True, 'false': False}[s.lower()]
36
-
37
-def add_boolean_argument(parser, name, default=False):
38
-    """Add a boolean argument to an ArgumentParser instance."""
39
-    group = parser.add_mutually_exclusive_group()
40
-    group.add_argument(
41
-        '--' + name, nargs='?', default=default, const=True, type=_str_to_bool)
42
-    group.add_argument('--no' + name, dest=name, action='store_false')
43
-
44
-parser = argparse.ArgumentParser(description='obs-studio package util')
45
-parser.add_argument('-d', '--base-dir', dest='dir', default='rundir/RelWithDebInfo')
46
-parser.add_argument('-n', '--build-number', dest='build_number', default='0')
47
-parser.add_argument('-k', '--public-key', dest='public_key', default='OBSPublicDSAKey.pem')
48
-parser.add_argument('-f', '--sparkle-framework', dest='sparkle', default=None)
49
-parser.add_argument('-b', '--base-url', dest='base_url', default='https://obsproject.com/osx_update')
50
-parser.add_argument('-u', '--user', dest='user', default='jp9000')
51
-parser.add_argument('-c', '--channel', dest='channel', default='master')
52
-add_boolean_argument(parser, 'stable', default=False)
53
-parser.add_argument('-p', '--prefix', dest='prefix', default='')
54
-args = parser.parse_args()
55
-
56
-def cmd(cmd):
57
-    import subprocess
58
-    import shlex
59
-    return subprocess.check_output(shlex.split(cmd)).rstrip('\r\n')
60
-
61
-LibTarget = namedtuple("LibTarget", ("path", "external", "copy_as"))
62
-
63
-inspect = list()
64
-
65
-inspected = set()
66
-
67
-build_path = args.dir
68
-build_path = build_path.replace("\\ ", " ")
69
-
70
-def add(name, external=False, copy_as=None):
71
-   if external and copy_as is None:
72
-       copy_as = name.split("/")[-1]
73
-   if name[0] != "/":
74
-       name = build_path+"/"+name
75
-   t = LibTarget(name, external, copy_as)
76
-   if t in inspected:
77
-       return
78
-   inspect.append(t)
79
-   inspected.add(t)
80
-
81
-
82
-for i in candidate_paths:
83
-   print("Checking " + i)
84
-   for root, dirs, files in walk(build_path+"/"+i):
85
-       for file_ in files:
86
-           if ".ini" in file_:
87
-               continue
88
-           if ".png" in file_:
89
-               continue
90
-           if ".effect" in file_:
91
-               continue
92
-           if ".py" in file_:
93
-               continue
94
-           if ".json" in file_:
95
-               continue
96
-           path = root + "/" + file_
97
-           try:
98
-               out = check_output("{0}otool -L '{1}'".format(args.prefix, path), shell=True,
99
-                       universal_newlines=True)
100
-               if "is not an object file" in out:
101
-                   continue
102
-           except:
103
-               continue
104
-           rel_path = path[len(build_path)+1:]
105
-           print(repr(path), repr(rel_path))
106
-           add(rel_path)
107
-
108
-def add_plugins(path, replace):
109
-   for img in glob(path.replace(
110
-       "lib/QtCore.framework/Versions/5/QtCore",
111
-       "plugins/%s/*"%replace).replace(
112
-           "Library/Frameworks/QtCore.framework/Versions/5/QtCore",
113
-           "share/qt5/plugins/%s/*"%replace)):
114
-       if "_debug" in img:
115
-           continue
116
-       add(img, True, img.split("plugins/")[-1])
117
-
118
-actual_sparkle_path = '@loader_path/Frameworks/Sparkle.framework/Versions/A/Sparkle'
119
-
120
-while inspect:
121
-   target = inspect.pop()
122
-   print("inspecting", repr(target))
123
-   path = target.path
124
-   if path[0] == "@":
125
-       continue
126
-   out = check_output("{0}otool -L '{1}'".format(args.prefix, path), shell=True,
127
-           universal_newlines=True)
128
-
129
-   if "QtCore" in path:
130
-       add_plugins(path, "platforms")
131
-       add_plugins(path, "imageformats")
132
-       add_plugins(path, "accessible")
133
-       add_plugins(path, "styles")
134
-
135
-
136
-   for line in out.split("\n")[1:]:
137
-       new = line.strip().split(" (")[0]
138
-       if '@' in new and "sparkle.framework" in new.lower():
139
-           actual_sparkle_path = new
140
-           print "Using sparkle path:", repr(actual_sparkle_path)
141
-       if not new or new[0] == "@" or new.endswith(path.split("/")[-1]):
142
-           continue
143
-       whitelisted = False
144
-       for i in whitelist:
145
-           if new.startswith(i):
146
-               whitelisted = True
147
-       if not whitelisted:
148
-           blacklisted = False
149
-           for i in blacklist:
150
-               if new.startswith(i):
151
-                   blacklisted = True
152
-                   break
153
-           if blacklisted:
154
-               continue
155
-       add(new, True)
156
-
157
-changes = list()
158
-for path, external, copy_as in inspected:
159
-   if not external:
160
-       continue #built with install_rpath hopefully
161
-   changes.append("-change '%s' '@rpath/%s'"%(path, copy_as))
162
-changes = " ".join(changes)
163
-
164
-info = plistlib.readPlist(plist_path)
165
-
166
-latest_tag = cmd('git describe --tags --abbrev=0')
167
-log = cmd('git log --pretty=oneline {0}...HEAD'.format(latest_tag))
168
-
169
-from os import path
170
-# set version
171
-if args.stable:
172
-    info["CFBundleVersion"] = latest_tag
173
-    info["CFBundleShortVersionString"] = latest_tag
174
-    info["SUFeedURL"] = '{0}/stable/updates.xml'.format(args.base_url)
175
-else:
176
-    info["CFBundleVersion"] = args.build_number
177
-    info["CFBundleShortVersionString"] = '{0}.{1}'.format(latest_tag, args.build_number)
178
-    info["SUFeedURL"] = '{0}/{1}/{2}/updates.xml'.format(args.base_url, args.user, args.channel)
179
-
180
-info["SUPublicDSAKeyFile"] = path.basename(args.public_key)
181
-info["OBSFeedsURL"] = '{0}/feeds.xml'.format(args.base_url)
182
-
183
-app_name = info["CFBundleName"]+".app"
184
-icon_file = "tmp/Contents/Resources/%s"%info["CFBundleIconFile"]
185
-
186
-copytree(build_path, "tmp/Contents/Resources/", symlinks=True)
187
-copy(icon_path, icon_file)
188
-plistlib.writePlist(info, "tmp/Contents/Info.plist")
189
-makedirs("tmp/Contents/MacOS")
190
-copy(run_path, "tmp/Contents/MacOS/%s"%info["CFBundleExecutable"])
191
-try:
192
-   copy(args.public_key, "tmp/Contents/Resources")
193
-except:
194
-   pass
195
-
196
-if args.sparkle is not None:
197
-    copytree(args.sparkle, "tmp/Contents/Frameworks/Sparkle.framework", symlinks=True)
198
-
199
-prefix = "tmp/Contents/Resources/"
200
-sparkle_path = '@loader_path/{0}/Frameworks/Sparkle.framework/Versions/A/Sparkle'
201
-
202
-cmd('{0}install_name_tool -change {1} {2} {3}/bin/obs'.format(
203
-    args.prefix, actual_sparkle_path, sparkle_path.format('../..'), prefix))
204
-
205
-
206
-
207
-for path, external, copy_as in inspected:
208
-   id_ = ""
209
-   filename = path
210
-   rpath = ""
211
-   if external:
212
-       if copy_as == "Python":
213
-           continue
214
-       id_ = "-id '@rpath/%s'"%copy_as
215
-       filename = prefix + "bin/" +copy_as
216
-       rpath = "-add_rpath @loader_path/ -add_rpath @executable_path/"
217
-       if "/" in copy_as:
218
-           try:
219
-               dirs = copy_as.rsplit("/", 1)[0]
220
-               makedirs(prefix + "bin/" + dirs)
221
-           except:
222
-               pass
223
-       copy(path, filename)
224
-   else:
225
-       filename = path[len(build_path)+1:]
226
-       id_ = "-id '@rpath/../%s'"%filename
227
-       if not filename.startswith("bin"):
228
-           print(filename)
229
-           rpath = "-add_rpath '@loader_path/{}/'".format(ospath.relpath("bin/", ospath.dirname(filename)))
230
-       filename = prefix + filename
231
-
232
-   cmd = "{0}install_name_tool {1} {2} {3} '{4}'".format(args.prefix, changes, id_, rpath, filename)
233
-   call(cmd, shell=True)
234
-
235
-try:
236
-   rename("tmp", app_name)
237
-except:
238
-   print("App already exists")
239
-   rmtree("tmp")
240
obs-studio-26.1.0.tar.xz/CI/install/osx/dylibBundler Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/makeRetinaBG Deleted
3
 
1
@@ -1,1 +0,0 @@
2
-tiffutil -cathidpicheck background.png background@2x.png -out background.tiff
3
obs-studio-26.1.0.tar.xz/CI/install/osx/obs.icns Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/obs.png Deleted
obs-studio-26.1.0.tar.xz/CI/install/osx/packageApp.sh Deleted
79
 
1
@@ -1,77 +0,0 @@
2
-# Exit if something fails
3
-set -e
4
-
5
-rm -rf ./OBS.app
6
-
7
-mkdir OBS.app
8
-mkdir OBS.app/Contents
9
-mkdir OBS.app/Contents/MacOS
10
-mkdir OBS.app/Contents/PlugIns
11
-mkdir OBS.app/Contents/Resources
12
-
13
-cp -R rundir/RelWithDebInfo/bin/ ./OBS.app/Contents/MacOS
14
-cp -R rundir/RelWithDebInfo/data ./OBS.app/Contents/Resources
15
-cp ../CI/install/osx/obs.icns ./OBS.app/Contents/Resources
16
-cp -R rundir/RelWithDebInfo/obs-plugins/ ./OBS.app/Contents/PlugIns
17
-cp ../CI/install/osx/Info.plist ./OBS.app/Contents
18
-
19
-../CI/install/osx/dylibBundler -b -cd -d ./OBS.app/Contents/Frameworks -p @executable_path/../Frameworks/ \
20
--s ./OBS.app/Contents/MacOS \
21
--s /usr/local/opt/mbedtls/lib/ \
22
--x ./OBS.app/Contents/PlugIns/coreaudio-encoder.so \
23
--x ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so \
24
--x ./OBS.app/Contents/PlugIns/frontend-tools.so \
25
--x ./OBS.app/Contents/PlugIns/image-source.so \
26
--x ./OBS.app/Contents/PlugIns/linux-jack.so \
27
--x ./OBS.app/Contents/PlugIns/mac-avcapture.so \
28
--x ./OBS.app/Contents/PlugIns/mac-capture.so \
29
--x ./OBS.app/Contents/PlugIns/mac-decklink.so \
30
--x ./OBS.app/Contents/PlugIns/mac-syphon.so \
31
--x ./OBS.app/Contents/PlugIns/mac-vth264.so \
32
--x ./OBS.app/Contents/PlugIns/obs-browser.so \
33
--x ./OBS.app/Contents/PlugIns/obs-browser-page \
34
--x ./OBS.app/Contents/PlugIns/obs-ffmpeg.so \
35
--x ./OBS.app/Contents/PlugIns/obs-filters.so \
36
--x ./OBS.app/Contents/PlugIns/obs-transitions.so \
37
--x ./OBS.app/Contents/PlugIns/obs-vst.so \
38
--x ./OBS.app/Contents/PlugIns/rtmp-services.so \
39
--x ./OBS.app/Contents/MacOS/obs \
40
--x ./OBS.app/Contents/MacOS/obs-ffmpeg-mux \
41
--x ./OBS.app/Contents/MacOS/obslua.so \
42
--x ./OBS.app/Contents/PlugIns/obs-x264.so \
43
--x ./OBS.app/Contents/PlugIns/text-freetype2.so \
44
--x ./OBS.app/Contents/PlugIns/obs-libfdk.so
45
-# -x ./OBS.app/Contents/MacOS/_obspython.so \
46
-# -x ./OBS.app/Contents/PlugIns/obs-outputs.so \
47
-
48
-/usr/local/Cellar/qt/5.14.1/bin/macdeployqt ./OBS.app
49
-
50
-mv ./OBS.app/Contents/MacOS/libobs-opengl.so ./OBS.app/Contents/Frameworks
51
-
52
-rm -f -r ./OBS.app/Contents/Frameworks/QtNetwork.framework
53
-
54
-# put qt network in here becasuse streamdeck uses it
55
-cp -R /usr/local/opt/qt/lib/QtNetwork.framework ./OBS.app/Contents/Frameworks
56
-chmod -R +w ./OBS.app/Contents/Frameworks/QtNetwork.framework
57
-rm -r ./OBS.app/Contents/Frameworks/QtNetwork.framework/Headers
58
-rm -r ./OBS.app/Contents/Frameworks/QtNetwork.framework/Versions/5/Headers/
59
-chmod 644 ./OBS.app/Contents/Frameworks/QtNetwork.framework/Versions/5/Resources/Info.plist
60
-install_name_tool -id @executable_path/../Frameworks/QtNetwork.framework/Versions/5/QtNetwork ./OBS.app/Contents/Frameworks/QtNetwork.framework/Versions/5/QtNetwork
61
-install_name_tool -change /usr/local/Cellar/qt/5.14.1/lib/QtCore.framework/Versions/5/QtCore @executable_path/../Frameworks/QtCore.framework/Versions/5/QtCore ./OBS.app/Contents/Frameworks/QtNetwork.framework/Versions/5/QtNetwork
62
-
63
-
64
-# decklink ui qt
65
-install_name_tool -change /usr/local/opt/qt/lib/QtGui.framework/Versions/5/QtGui @executable_path/../Frameworks/QtGui.framework/Versions/5/QtGui ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so
66
-install_name_tool -change /usr/local/opt/qt/lib/QtCore.framework/Versions/5/QtCore @executable_path/../Frameworks/QtCore.framework/Versions/5/QtCore ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so
67
-install_name_tool -change /usr/local/opt/qt/lib/QtWidgets.framework/Versions/5/QtWidgets @executable_path/../Frameworks/QtWidgets.framework/Versions/5/QtWidgets ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so
68
-
69
-# frontend tools qt
70
-install_name_tool -change /usr/local/opt/qt/lib/QtGui.framework/Versions/5/QtGui @executable_path/../Frameworks/QtGui.framework/Versions/5/QtGui ./OBS.app/Contents/PlugIns/frontend-tools.so
71
-install_name_tool -change /usr/local/opt/qt/lib/QtCore.framework/Versions/5/QtCore @executable_path/../Frameworks/QtCore.framework/Versions/5/QtCore ./OBS.app/Contents/PlugIns/frontend-tools.so
72
-install_name_tool -change /usr/local/opt/qt/lib/QtWidgets.framework/Versions/5/QtWidgets @executable_path/../Frameworks/QtWidgets.framework/Versions/5/QtWidgets ./OBS.app/Contents/PlugIns/frontend-tools.so
73
-
74
-# vst qt
75
-install_name_tool -change /usr/local/opt/qt/lib/QtGui.framework/Versions/5/QtGui @executable_path/../Frameworks/QtGui.framework/Versions/5/QtGui ./OBS.app/Contents/PlugIns/obs-vst.so
76
-install_name_tool -change /usr/local/opt/qt/lib/QtCore.framework/Versions/5/QtCore @executable_path/../Frameworks/QtCore.framework/Versions/5/QtCore ./OBS.app/Contents/PlugIns/obs-vst.so
77
-install_name_tool -change /usr/local/opt/qt/lib/QtWidgets.framework/Versions/5/QtWidgets @executable_path/../Frameworks/QtWidgets.framework/Versions/5/QtWidgets ./OBS.app/Contents/PlugIns/obs-vst.so
78
-install_name_tool -change /usr/local/opt/qt/lib/QtMacExtras.framework/Versions/5/QtMacExtras @executable_path/../Frameworks/QtMacExtras.framework/Versions/5/QtMacExtras ./OBS.app/Contents/PlugIns/obs-vst.so
79
obs-studio-26.1.0.tar.xz/CI/install/osx/package_util.py Deleted
96
 
1
@@ -1,94 +0,0 @@
2
-def cmd(cmd):
3
-    import subprocess
4
-    import shlex
5
-    return subprocess.check_output(shlex.split(cmd)).rstrip('\r\n')
6
-
7
-def get_tag_info(tag):
8
-    rev = cmd('git rev-parse {0}'.format(latest_tag))
9
-    anno = cmd('git cat-file -p {0}'.format(rev))
10
-    tag_info = []
11
-    for i, v in enumerate(anno.splitlines()):
12
-        if i <= 4:
13
-            continue
14
-        tag_info.append(v.lstrip())
15
-
16
-    return tag_info
17
-
18
-def gen_html(github_user, latest_tag):
19
-
20
-    url = 'https://github.com/{0}/obs-studio/commit/%H'.format(github_user)
21
-
22
-    with open('readme.html', 'w') as f:
23
-        f.write("<html><body>")
24
-        log_cmd = """git log {0}...HEAD --pretty=format:'<li>&bull; <a href="{1}">(view)</a> %s</li>'"""
25
-        log_res = cmd(log_cmd.format(latest_tag, url))
26
-        if len(log_res.splitlines()):
27
-            f.write('<p>Changes since {0}: (Newest to oldest)</p>'.format(latest_tag))
28
-            f.write(log_res)
29
-
30
-        ul = False
31
-        f.write('<p>')
32
-        import re
33
-
34
-        for l in get_tag_info(latest_tag):
35
-            if not len(l):
36
-                continue
37
-            if l.startswith('*'):
38
-                ul = True
39
-                if not ul:
40
-                    f.write('<ul>')
41
-                f.write('<li>&bull; {0}</li>'.format(re.sub(r'^(\s*)?[*](\s*)?', '', l)))
42
-            else:
43
-                if ul:
44
-                    f.write('</ul><p/>')
45
-                ul = False
46
-                f.write('<p>{0}</p>'.format(l))
47
-        if ul:
48
-            f.write('</ul>')
49
-        f.write('</p></body></html>')
50
-
51
-    cmd('textutil -convert rtf readme.html -output readme.rtf')
52
-    cmd("""sed -i '' 's/Times-Roman/Verdana/g' readme.rtf""")
53
-
54
-def save_manifest(latest_tag, user, jenkins_build, branch, stable):
55
-    log = cmd('git log --pretty=oneline {0}...HEAD'.format(latest_tag))
56
-    manifest = {}
57
-    manifest['commits'] = []
58
-    for v in log.splitlines():
59
-        manifest['commits'].append(v)
60
-    manifest['tag'] = {
61
-        'name': latest_tag,
62
-        'description': get_tag_info(latest_tag)
63
-    }
64
-
65
-    manifest['version'] = cmd('git rev-list HEAD --count')
66
-    manifest['sha1'] = cmd('git rev-parse HEAD')
67
-    manifest['jenkins_build'] = jenkins_build
68
-
69
-    manifest['user'] = user
70
-    manifest['branch'] = branch
71
-    manifest['stable'] = stable
72
-
73
-    import cPickle
74
-    with open('manifest', 'w') as f:
75
-        cPickle.dump(manifest, f)
76
-
77
-def prepare_pkg(project, package_id):
78
-    cmd('packagesutil --file "{0}" set package-1 identifier {1}'.format(project, package_id))
79
-    cmd('packagesutil --file "{0}" set package-1 version {1}'.format(project, '1.0'))
80
-
81
-
82
-import argparse
83
-parser = argparse.ArgumentParser(description='obs-studio package util')
84
-parser.add_argument('-u', '--user', dest='user', default='jp9000')
85
-parser.add_argument('-p', '--package-id', dest='package_id', default='org.obsproject.pkg.obs-studio')
86
-parser.add_argument('-f', '--project-file', dest='project', default='OBS.pkgproj')
87
-parser.add_argument('-j', '--jenkins-build', dest='jenkins_build', default='0')
88
-parser.add_argument('-b', '--branch', dest='branch', default='master')
89
-parser.add_argument('-s', '--stable', dest='stable', required=False, action='store_true', default=False)
90
-args = parser.parse_args()
91
-
92
-latest_tag = cmd('git describe --tags --abbrev=0')
93
-gen_html(args.user, latest_tag)
94
-prepare_pkg(args.project, args.package_id)
95
-save_manifest(latest_tag, args.user, args.jenkins_build, args.branch, args.stable)
96
obs-studio-26.1.0.tar.xz/CI/install/osx/post-install.sh Deleted
3
 
1
@@ -1,1 +0,0 @@
2
-#!/usr/bin/env bash
3
obs-studio-26.1.0.tar.xz/CI/install/osx/settings.json Deleted
15
 
1
@@ -1,13 +0,0 @@
2
-{
3
-    "title": "OBS",
4
-    "background": "../CI/install/osx/background.tiff",
5
-    "format": "UDZO",
6
-    "compression-level": 9,
7
-    "window": { "position": { "x": 100, "y": 100 },
8
-                "size": { "width": 540, "height": 380 } },
9
-    "contents": [
10
-        { "x": 120, "y": 180, "type": "file",
11
-          "path": "./OBS.app" },
12
-        { "x": 420, "y": 180, "type": "link", "path": "/Applications" }
13
-    ]
14
-}
15
obs-studio-26.1.0.tar.xz/CI/osxcert Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/osxcert/Certificates.p12.enc Deleted
obs-studio-26.1.0.tar.xz/CI/util Deleted
2
 
1
-(directory)
2
obs-studio-26.1.0.tar.xz/CI/util/build-package-deps-osx.sh Deleted
163
 
1
@@ -1,161 +0,0 @@
2
-#!/usr/bin/env bash
3
-
4
-set -e
5
-
6
-# This script builds a tar file that contains a bunch of deps that OBS needs for
7
-# advanced functionality on OSX. Currently this tar file is pulled down off of s3
8
-# and used in the CI build process on travis.
9
-# Mostly this sets build flags to compile with older SDKS and make sure that 
10
-# the libs are portable.
11
-
12
-exists()
13
-{
14
-  command -v "$1" >/dev/null 2>&1
15
-}
16
-
17
-if ! exists nasm; then
18
-    echo "nasm not found. Try brew install nasm"
19
-    exit
20
-fi
21
-
22
-CURDIR=$(pwd)
23
-
24
-# the temp directory
25
-WORK_DIR=`mktemp -d`
26
-
27
-# deletes the temp directory
28
-function cleanup {
29
-  #rm -rf "$WORK_DIR"
30
-  echo "Deleted temp working directory $WORK_DIR"
31
-}
32
-
33
-# register the cleanup function to be called on the EXIT signal
34
-trap cleanup EXIT
35
-
36
-cd $WORK_DIR
37
-
38
-DEPS_DEST=$WORK_DIR/obsdeps
39
-
40
-# make dest dirs
41
-mkdir $DEPS_DEST
42
-mkdir $DEPS_DEST/bin
43
-mkdir $DEPS_DEST/include
44
-mkdir $DEPS_DEST/lib
45
-
46
-# OSX COMPAT
47
-export MACOSX_DEPLOYMENT_TARGET=10.11
48
-
49
-# If you need an olders SDK and Xcode won't give it to you
50
-# https://github.com/phracker/MacOSX-SDKs
51
-
52
-# libopus
53
-curl -L -O https://ftp.osuosl.org/pub/xiph/releases/opus/opus-1.2.1.tar.gz
54
-tar -xf opus-1.2.1.tar.gz
55
-cd ./opus-1.2.1
56
-mkdir build
57
-cd ./build
58
-../configure --disable-shared --enable-static --prefix="/tmp/obsdeps"
59
-make -j 12
60
-make install
61
-
62
-cd $WORK_DIR
63
-
64
-# libogg
65
-curl -L -O https://ftp.osuosl.org/pub/xiph/releases/ogg/libogg-1.3.3.tar.gz
66
-tar -xf libogg-1.3.3.tar.gz
67
-cd ./libogg-1.3.3
68
-mkdir build
69
-cd ./build
70
-../configure --disable-shared --enable-static --prefix="/tmp/obsdeps"
71
-make -j 12
72
-make install
73
-
74
-cd $WORK_DIR
75
-
76
-# libvorbis
77
-curl -L -O https://ftp.osuosl.org/pub/xiph/releases/vorbis/libvorbis-1.3.6.tar.gz
78
-tar -xf libvorbis-1.3.6.tar.gz
79
-cd ./libvorbis-1.3.6
80
-mkdir build
81
-cd ./build
82
-../configure --disable-shared --enable-static --prefix="/tmp/obsdeps"
83
-make -j 12
84
-make install
85
-
86
-cd $WORK_DIR
87
-
88
-# libvpx
89
-curl -L -O https://chromium.googlesource.com/webm/libvpx/+archive/v1.7.0.tar.gz
90
-mkdir -p ./libvpx-v1.7.0
91
-tar -xf v1.7.0.tar.gz -C $PWD/libvpx-v1.7.0
92
-cd ./libvpx-v1.7.0
93
-mkdir -p build
94
-cd ./build
95
-../configure --disable-shared --prefix="/tmp/obsdeps" --libdir="/tmp/obsdeps/lib"
96
-make -j 12
97
-make install
98
-
99
-cd $WORK_DIR
100
-
101
-# x264
102
-git clone git://git.videolan.org/x264.git
103
-cd ./x264
104
-git checkout origin/stable
105
-mkdir build
106
-cd ./build
107
-../configure --extra-ldflags="-mmacosx-version-min=10.11" --enable-static --prefix="/tmp/obsdeps"
108
-make -j 12
109
-make install
110
-../configure --extra-ldflags="-mmacosx-version-min=10.11" --enable-shared --libdir="/tmp/obsdeps/bin" --prefix="/tmp/obsdeps"
111
-make -j 12
112
-ln -f -s libx264.*.dylib libx264.dylib
113
-find . -name \*.dylib -exec cp \{\} $DEPS_DEST/bin/ \;
114
-rsync -avh --include="*/" --include="*.h" --exclude="*" ../* $DEPS_DEST/include/
115
-rsync -avh --include="*/" --include="*.h" --exclude="*" ./* $DEPS_DEST/include/
116
-
117
-cd $WORK_DIR
118
-
119
-# janson
120
-curl -L -O http://www.digip.org/jansson/releases/jansson-2.11.tar.gz
121
-tar -xf jansson-2.11.tar.gz
122
-cd jansson-2.11
123
-mkdir build
124
-cd ./build
125
-../configure --libdir="/tmp/obsdeps/bin" --enable-shared --disable-static
126
-make -j 12
127
-find . -name \*.dylib -exec cp \{\} $DEPS_DEST/bin/ \;
128
-rsync -avh --include="*/" --include="*.h" --exclude="*" ../* $DEPS_DEST/include/
129
-rsync -avh --include="*/" --include="*.h" --exclude="*" ./* $DEPS_DEST/include/
130
-
131
-cd $WORK_DIR
132
-
133
-export LDFLAGS="-L/tmp/obsdeps/lib"
134
-export CFLAGS="-I/tmp/obsdeps/include"
135
-
136
-# FFMPEG
137
-curl -L -O https://github.com/FFmpeg/FFmpeg/archive/n4.0.2.zip
138
-unzip ./n4.0.2.zip
139
-cd ./FFmpeg-n4.0.2
140
-mkdir build
141
-cd ./build
142
-../configure --pkg-config-flags="--static" --extra-ldflags="-mmacosx-version-min=10.11" --enable-shared --disable-static --shlibdir="/tmp/obsdeps/bin" --enable-gpl --disable-doc --enable-libx264 --enable-libopus --enable-libvorbis --enable-libvpx --disable-outdev=sdl
143
-make -j 12
144
-find . -name \*.dylib -exec cp \{\} $DEPS_DEST/bin/ \;
145
-rsync -avh --include="*/" --include="*.h" --exclude="*" ../* $DEPS_DEST/include/
146
-rsync -avh --include="*/" --include="*.h" --exclude="*" ./* $DEPS_DEST/include/
147
-
148
-#luajit
149
-curl -L -O https://luajit.org/download/LuaJIT-2.0.5.tar.gz
150
-tar -xf LuaJIT-2.0.5.tar.gz
151
-cd LuaJIT-2.0.5
152
-make PREFIX=/tmp/obsdeps
153
-make PREFIX=/tmp/obsdeps install
154
-find /tmp/obsdeps/lib -name libluajit\*.dylib -exec cp \{\} $DEPS_DEST/lib/ \;
155
-rsync -avh --include="*/" --include="*.h" --exclude="*" src/* $DEPS_DEST/include/
156
-make PREFIX=/tmp/obsdeps uninstall
157
-
158
-cd $WORK_DIR
159
-
160
-tar -czf osx-deps.tar.gz obsdeps
161
-
162
-cp ./osx-deps.tar.gz $CURDIR
163
obs-studio-26.1.0.tar.xz/CI/util/win32.sh Deleted
71
 
1
@@ -1,69 +0,0 @@
2
-#/bin/bash
3
-
4
-cd x264
5
-make clean
6
-LDFLAGS="-static-libgcc" ./configure --enable-shared --enable-win32thread --disable-avs --disable-ffms --disable-gpac --disable-interlaced --disable-lavf --cross-prefix=i686-w64-mingw32- --host=i686-pc-mingw32 --prefix="/home/jim/packages/win32"
7
-make -j6 fprofiled VIDS="CITY_704x576_60_orig_01.yuv"
8
-make install
9
-i686-w64-mingw32-dlltool -z /home/jim/packages/win32/bin/x264.orig.def --export-all-symbols /home/jim/packages/win32/bin/libx264-148.dll
10
-grep "EXPORTS\|x264" /home/jim/packages/win32/bin/x264.orig.def > /home/jim/packages/win32/bin/x264.def
11
-rm -f /home/jim/packages/win32/bin/x264.org.def
12
-sed -i -e "/\\t.*DATA/d" -e "/\\t\".*/d" -e "s/\s@.*//" /home/jim/packages/win32/bin/x264.def
13
-i686-w64-mingw32-dlltool -m i386 -d /home/jim/packages/win32/bin/x264.def -l /home/jim/packages/win32/bin/x264.lib -D /home/jim/win32/packages/bin/libx264-148.dll
14
-cd ..
15
-
16
-cd opus
17
-make clean
18
-LDFLAGS="-static-libgcc" ./configure -host=i686-w64-mingw32 --prefix="/home/jim/packages/win32" --enable-shared
19
-make -j6
20
-make install
21
-cd ..
22
-
23
-cd zlib/build32
24
-make clean
25
-cmake .. -DCMAKE_SYSTEM_NAME=Windows -DCMAKE_C_COMPILER=i686-w64-mingw32-gcc -DCMAKE_INSTALL_PREFIX=/home/jim/packages/win32 -DINSTALL_PKGCONFIG_DIR=/home/jim/packages/win32/lib/pkgconfig -DCMAKE_RC_COMPILER=i686-w64-mingw32-windres -DCMAKE_SHARED_LINKER_FLAGS="-static-libgcc"
26
-make -j6
27
-make install
28
-mv ../../win32/lib/libzlib.dll.a ../../win32/lib/libz.dll.a
29
-mv ../../win32/lib/libzlibstatic.a ../../win32/lib/libz.a
30
-cp ../win32/zlib.def /home/jim/packages/win32/bin
31
-i686-w64-mingw32-dlltool -m i386 -d ../win32/zlib.def -l /home/jim/packages/win32/bin/zlib.lib -D /home/jim/win32/packages/bin/zlib.dll
32
-cd ../..
33
-
34
-cd libpng
35
-make clean
36
-PKG_CONFIG_PATH="/home/jim/packages/win32/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win32/lib -static-libgcc" CPPFLAGS="-I/home/jim/packages/win32/include" ./configure -host=i686-w64-mingw32 --prefix="/home/jim/packages/win32" --enable-shared
37
-make -j6
38
-make install
39
-cd ..
40
-
41
-cd libogg
42
-make clean
43
-PKG_CONFIG_PATH="/home/jim/packages/win32/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win32/lib -static-libgcc" CPPFLAGS="-I/home/jim/packages/win32/include" ./configure -host=i686-w64-mingw32 --prefix="/home/jim/packages/win32" --enable-shared
44
-make -j6
45
-make install
46
-cd ..
47
-
48
-cd libvorbis
49
-make clean
50
-PKG_CONFIG_PATH="/home/jim/packages/win32/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win32/lib -static-libgcc" CPPFLAGS="-I/home/jim/packages/win32/include" ./configure -host=i686-w64-mingw32 --prefix="/home/jim/packages/win32" --enable-shared --with-ogg="/home/jim/packages/win32"
51
-make -j6
52
-make install
53
-cd ..
54
-
55
-cd libvpxbuild
56
-make clean
57
-PKG_CONFIG_PATH="/home/jim/packages/win32/lib/pkgconfig" CROSS=i686-w64-mingw32- LDFLAGS="-static-libgcc" ../libvpx/configure --prefix=/home/jim/packages/win32 --enable-vp8 --enable-vp9 --disable-docs --disable-examples --enable-shared --disable-static --enable-runtime-cpu-detect --enable-realtime-only --disable-install-bins --disable-install-docs --disable-unit-tests --target=x86-win32-gcc
58
-make -j6
59
-make install
60
-i686-w64-mingw32-dlltool -m i386 -d libvpx.def -l /home/jim/packages/win32/bin/vpx.lib -D /home/jim/win32/packages/bin/libvpx-1.dll
61
-cd ..
62
-
63
-cd ffmpeg
64
-make clean
65
-cp /media/sf_linux/nvEncodeAPI.h /home/jim/packages/win32/include
66
-PKG_CONFIG_PATH="/home/jim/packages/win32/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win32/lib -static-libgcc" CFLAGS="-I/home/jim/packages/win32/include" ./configure --enable-memalign-hack --enable-gpl --disable-programs --disable-doc --arch=x86 --enable-shared --enable-nvenc --enable-libx264 --enable-libopus --enable-libvorbis --enable-libvpx --disable-debug --cross-prefix=i686-w64-mingw32- --target-os=mingw32 --pkg-config=pkg-config --prefix="/home/jim/packages/win32" --disable-postproc
67
-read -n1 -r -p "Press any key to continue building FFmpeg..." key
68
-make -j6
69
-make install
70
-cd ..
71
obs-studio-26.1.0.tar.xz/CI/util/win64.sh Deleted
71
 
1
@@ -1,69 +0,0 @@
2
-#/bin/bash
3
-
4
-cd x264
5
-make clean
6
-LDFLAGS="-static-libgcc" ./configure --enable-shared --enable-win32thread --disable-avs --disable-ffms --disable-gpac --disable-interlaced --disable-lavf --cross-prefix=x86_64-w64-mingw32- --host=x86_64-pc-mingw32 --prefix="/home/jim/packages/win64"
7
-make -j6 fprofiled VIDS="CITY_704x576_60_orig_01.yuv"
8
-make install
9
-x86_64-w64-mingw32-dlltool -z /home/jim/packages/win64/bin/x264.orig.def --export-all-symbols /home/jim/packages/win64/bin/libx264-148.dll
10
-grep "EXPORTS\|x264" /home/jim/packages/win64/bin/x264.orig.def > /home/jim/packages/win64/bin/x264.def
11
-rm -f /home/jim/packages/win64/bin/x264.org.def
12
-sed -i -e "/\\t.*DATA/d" -e "/\\t\".*/d" -e "s/\s@.*//" /home/jim/packages/win64/bin/x264.def
13
-x86_64-w64-mingw32-dlltool -m i386:x86-64 -d /home/jim/packages/win64/bin/x264.def -l /home/jim/packages/win64/bin/x264.lib -D /home/jim/win64/packages/bin/libx264-148.dll
14
-cd ..
15
-
16
-cd opus
17
-make clean
18
-LDFLAGS="-static-libgcc" ./configure -host=x86_64-w64-mingw32 --prefix="/home/jim/packages/win64" --enable-shared
19
-make -j6
20
-make install
21
-cd ..
22
-
23
-cd zlib/build64
24
-make clean
25
-cmake .. -DCMAKE_SYSTEM_NAME=Windows -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_INSTALL_PREFIX=/home/jim/packages/win64 -DCMAKE_RC_COMPILER=x86_64-w64-mingw32-windres -DCMAKE_SHARED_LINKER_FLAGS="-static-libgcc"
26
-make -j6
27
-make install
28
-mv ../../win64/lib/libzlib.dll.a ../../win64/lib/libz.dll.a
29
-mv ../../win64/lib/libzlibstatic.a ../../win64/lib/libz.a
30
-cp ../win64/zlib.def /home/jim/packages/win64/bin
31
-x86_64-w64-mingw32-dlltool -m i386:x86-64 -d ../win32/zlib.def -l /home/jim/packages/win64/bin/zlib.lib -D /home/jim/win64/packages/bin/zlib.dll
32
-cd ../..
33
-
34
-cd libpng
35
-make clean
36
-PKG_CONFIG_PATH="/home/jim/packages/win64/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win64/lib" CPPFLAGS="-I/home/jim/packages/win64/include" ./configure -host=x86_64-w64-mingw32 --prefix="/home/jim/packages/win64" --enable-shared
37
-make -j6
38
-make install
39
-cd ..
40
-
41
-cd libogg
42
-make clean
43
-PKG_CONFIG_PATH="/home/jim/packages/win64/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win64/lib -static-libgcc" CPPFLAGS="-I/home/jim/packages/win64/include" ./configure -host=x86_64-w64-mingw32 --prefix="/home/jim/packages/win64" --enable-shared
44
-make -j6
45
-make install
46
-cd ..
47
-
48
-cd libvorbis
49
-make clean
50
-PKG_CONFIG_PATH="/home/jim/packages/win64/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win64/lib -static-libgcc" CPPFLAGS="-I/home/jim/packages/win64/include" ./configure -host=x86_64-w64-mingw32 --prefix="/home/jim/packages/win64" --enable-shared --with-ogg="/home/jim/packages/win64"
51
-make -j6
52
-make install
53
-cd ..
54
-
55
-cd libvpxbuild
56
-make clean
57
-PKG_CONFIG_PATH="/home/jim/packages/win64/lib/pkgconfig" CROSS=x86_64-w64-mingw32- LDFLAGS="-static-libgcc" ../libvpx/configure --prefix=/home/jim/packages/win64 --enable-vp8 --enable-vp9 --disable-docs --disable-examples --enable-shared --disable-static --enable-runtime-cpu-detect --enable-realtime-only --disable-install-bins --disable-install-docs --disable-unit-tests --target=x86_64-win64-gcc
58
-make -j6
59
-make install
60
-x86_64-w64-mingw32-dlltool -m i386:x86-64 -d libvpx.def -l /home/jim/packages/win64/bin/vpx.lib -D /home/jim/win64/packages/bin/libvpx-1.dll
61
-cd ..
62
-
63
-cd ffmpeg
64
-make clean
65
-cp /media/sf_linux/nvEncodeAPI.h /home/jim/packages/win64/include
66
-PKG_CONFIG_PATH="/home/jim/packages/win64/lib/pkgconfig" LDFLAGS="-L/home/jim/packages/win64/lib" CPPFLAGS="-I/home/jim/packages/win64/include" ./configure --enable-memalign-hack --enable-gpl --disable-doc --arch=x86_64 --enable-shared --enable-nvenc --enable-libx264 --enable-libopus --enable-libvorbis --enable-libvpx --disable-debug --cross-prefix=x86_64-w64-mingw32- --target-os=mingw32 --pkg-config=pkg-config --prefix="/home/jim/packages/win64" --disable-postproc
67
-read -n1 -r -p "Press any key to continue building FFmpeg..." key
68
-make -j6
69
-make install
70
-cd ..
71
obs-studio-26.1.0.tar.xz/libobs/util/simde/mmx.h Deleted
2272
 
1
@@ -1,2270 +0,0 @@
2
-/* SPDX-License-Identifier: MIT
3
- *
4
- * Permission is hereby granted, free of charge, to any person
5
- * obtaining a copy of this software and associated documentation
6
- * files (the "Software"), to deal in the Software without
7
- * restriction, including without limitation the rights to use, copy,
8
- * modify, merge, publish, distribute, sublicense, and/or sell copies
9
- * of the Software, and to permit persons to whom the Software is
10
- * furnished to do so, subject to the following conditions:
11
- *
12
- * The above copyright notice and this permission notice shall be
13
- * included in all copies or substantial portions of the Software.
14
- *
15
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- * SOFTWARE.
23
- *
24
- * Copyright:
25
- *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
- */
27
-
28
-#if !defined(SIMDE_X86_MMX_H)
29
-#define SIMDE_X86_MMX_H
30
-
31
-#include "simde-common.h"
32
-
33
-#if !defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
34
-#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
35
-#endif
36
-
37
-HEDLEY_DIAGNOSTIC_PUSH
38
-SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
39
-
40
-#if defined(SIMDE_X86_MMX_NATIVE)
41
-#define SIMDE_X86_MMX_USE_NATIVE_TYPE
42
-#elif defined(SIMDE_X86_SSE_NATIVE)
43
-#define SIMDE_X86_MMX_USE_NATIVE_TYPE
44
-#endif
45
-
46
-#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
47
-#include <mmintrin.h>
48
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
49
-#include <arm_neon.h>
50
-#endif
51
-
52
-#include <stdint.h>
53
-#include <limits.h>
54
-
55
-SIMDE_BEGIN_DECLS_
56
-
57
-typedef union {
58
-#if defined(SIMDE_VECTOR_SUBSCRIPT)
59
-   SIMDE_ALIGN(8) int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
60
-   SIMDE_ALIGN(8) int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
61
-   SIMDE_ALIGN(8) int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
62
-   SIMDE_ALIGN(8) int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
63
-   SIMDE_ALIGN(8) uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
64
-   SIMDE_ALIGN(8) uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
65
-   SIMDE_ALIGN(8) uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
66
-   SIMDE_ALIGN(8) uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
67
-   SIMDE_ALIGN(8) simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
68
-   SIMDE_ALIGN(8) int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
69
-   SIMDE_ALIGN(8) uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
70
-#else
71
-   SIMDE_ALIGN(8) int8_t i8[8];
72
-   SIMDE_ALIGN(8) int16_t i16[4];
73
-   SIMDE_ALIGN(8) int32_t i32[2];
74
-   SIMDE_ALIGN(8) int64_t i64[1];
75
-   SIMDE_ALIGN(8) uint8_t u8[8];
76
-   SIMDE_ALIGN(8) uint16_t u16[4];
77
-   SIMDE_ALIGN(8) uint32_t u32[2];
78
-   SIMDE_ALIGN(8) uint64_t u64[1];
79
-   SIMDE_ALIGN(8) simde_float32 f32[2];
80
-   SIMDE_ALIGN(8) int_fast32_t i32f[8 / sizeof(int_fast32_t)];
81
-   SIMDE_ALIGN(8) uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
82
-#endif
83
-
84
-#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
85
-   __m64 n;
86
-#endif
87
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
88
-   int8x8_t neon_i8;
89
-   int16x4_t neon_i16;
90
-   int32x2_t neon_i32;
91
-   int64x1_t neon_i64;
92
-   uint8x8_t neon_u8;
93
-   uint16x4_t neon_u16;
94
-   uint32x2_t neon_u32;
95
-   uint64x1_t neon_u64;
96
-   float32x2_t neon_f32;
97
-#endif
98
-} simde__m64_private;
99
-
100
-#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
101
-typedef __m64 simde__m64;
102
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
103
-typedef int32x2_t simde__m64;
104
-#elif defined(SIMDE_VECTOR_SUBSCRIPT)
105
-typedef int32_t simde__m64 SIMDE_ALIGN(8) SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
106
-#else
107
-typedef simde__m64_private simde__m64;
108
-#endif
109
-
110
-#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \
111
-   defined(SIMDE_ENABLE_NATIVE_ALIASES)
112
-#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
113
-typedef simde__m64 __m64;
114
-#endif
115
-
116
-HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
117
-HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
118
-#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
119
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8,
120
-            "simde__m64 is not 8-byte aligned");
121
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8,
122
-            "simde__m64_private is not 8-byte aligned");
123
-#endif
124
-
125
-SIMDE_FUNCTION_ATTRIBUTES
126
-simde__m64 simde__m64_from_private(simde__m64_private v)
127
-{
128
-   simde__m64 r;
129
-   simde_memcpy(&r, &v, sizeof(r));
130
-   return r;
131
-}
132
-
133
-SIMDE_FUNCTION_ATTRIBUTES
134
-simde__m64_private simde__m64_to_private(simde__m64 v)
135
-{
136
-   simde__m64_private r;
137
-   simde_memcpy(&r, &v, sizeof(r));
138
-   return r;
139
-}
140
-
141
-#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \
142
-                          fragment)                      \
143
-   SIMDE_FUNCTION_ATTRIBUTES                                             \
144
-   simde__##simde_type simde__##simde_type##_from_##isax##_##fragment(   \
145
-       source_type value)                                            \
146
-   {                                                                     \
147
-       simde__##simde_type##_private r_;                             \
148
-       r_.isax##_##fragment = value;                                 \
149
-       return simde__##simde_type##_from_private(r_);                \
150
-   }                                                                     \
151
-                                                                              \
152
-   SIMDE_FUNCTION_ATTRIBUTES                                             \
153
-   source_type simde__##simde_type##_to_##isax##_##fragment(             \
154
-       simde__##simde_type value)                                    \
155
-   {                                                                     \
156
-       simde__##simde_type##_private r_ =                            \
157
-           simde__##simde_type##_to_private(value);              \
158
-       return r_.isax##_##fragment;                                  \
159
-   }
160
-
161
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
162
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
163
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
164
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
165
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
166
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
167
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
168
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
169
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
170
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
171
-#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
172
-
173
-SIMDE_FUNCTION_ATTRIBUTES
174
-simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
175
-{
176
-#if defined(SIMDE_X86_MMX_NATIVE)
177
-   return _mm_add_pi8(a, b);
178
-#else
179
-   simde__m64_private r_;
180
-   simde__m64_private a_ = simde__m64_to_private(a);
181
-   simde__m64_private b_ = simde__m64_to_private(b);
182
-
183
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
184
-   r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
185
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
186
-   r_.i8 = a_.i8 + b_.i8;
187
-#else
188
-   SIMDE_VECTORIZE
189
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
190
-       r_.i8[i] = a_.i8[i] + b_.i8[i];
191
-   }
192
-#endif
193
-
194
-   return simde__m64_from_private(r_);
195
-#endif
196
-}
197
-#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
198
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
199
-#define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
200
-#define _m_paddb(a, b) simde_m_paddb(a, b)
201
-#endif
202
-
203
-SIMDE_FUNCTION_ATTRIBUTES
204
-simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
205
-{
206
-#if defined(SIMDE_X86_MMX_NATIVE)
207
-   return _mm_add_pi16(a, b);
208
-#else
209
-   simde__m64_private r_;
210
-   simde__m64_private a_ = simde__m64_to_private(a);
211
-   simde__m64_private b_ = simde__m64_to_private(b);
212
-
213
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
214
-   r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
215
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
216
-   r_.i16 = a_.i16 + b_.i16;
217
-#else
218
-   SIMDE_VECTORIZE
219
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
220
-       r_.i16[i] = a_.i16[i] + b_.i16[i];
221
-   }
222
-#endif
223
-
224
-   return simde__m64_from_private(r_);
225
-#endif
226
-}
227
-#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
228
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
229
-#define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
230
-#define _m_add_paddw(a, b) simde_mm_add_pi16(a, b)
231
-#endif
232
-
233
-SIMDE_FUNCTION_ATTRIBUTES
234
-simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
235
-{
236
-#if defined(SIMDE_X86_MMX_NATIVE)
237
-   return _mm_add_pi32(a, b);
238
-#else
239
-   simde__m64_private r_;
240
-   simde__m64_private a_ = simde__m64_to_private(a);
241
-   simde__m64_private b_ = simde__m64_to_private(b);
242
-
243
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
244
-   r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
245
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
246
-   r_.i32 = a_.i32 + b_.i32;
247
-#else
248
-   SIMDE_VECTORIZE
249
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
250
-       r_.i32[i] = a_.i32[i] + b_.i32[i];
251
-   }
252
-#endif
253
-
254
-   return simde__m64_from_private(r_);
255
-#endif
256
-}
257
-#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
258
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
259
-#define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
260
-#define _m_add_paddd(a, b) simde_mm_add_pi32(a, b)
261
-#endif
262
-
263
-SIMDE_FUNCTION_ATTRIBUTES
264
-simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
265
-{
266
-#if defined(SIMDE_X86_MMX_NATIVE)
267
-   return _mm_adds_pi8(a, b);
268
-#else
269
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
270
-                  b_ = simde__m64_to_private(b);
271
-
272
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
273
-   r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
274
-#else
275
-   SIMDE_VECTORIZE
276
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
277
-       if ((((b_.i8[i]) > 0) &&
278
-            ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
279
-           r_.i8[i] = INT8_MAX;
280
-       } else if ((((b_.i8[i]) < 0) &&
281
-               ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
282
-           r_.i8[i] = INT8_MIN;
283
-       } else {
284
-           r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
285
-       }
286
-   }
287
-#endif
288
-
289
-   return simde__m64_from_private(r_);
290
-#endif
291
-}
292
-#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
293
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
294
-#define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
295
-#define _m_add_paddsb(a, b) simde_mm_adds_pi8(a, b)
296
-#endif
297
-
298
-SIMDE_FUNCTION_ATTRIBUTES
299
-simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
300
-{
301
-#if defined(SIMDE_X86_MMX_NATIVE)
302
-   return _mm_adds_pu8(a, b);
303
-#else
304
-   simde__m64_private r_;
305
-   simde__m64_private a_ = simde__m64_to_private(a);
306
-   simde__m64_private b_ = simde__m64_to_private(b);
307
-
308
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
309
-   r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
310
-#else
311
-   SIMDE_VECTORIZE
312
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
313
-       const uint_fast16_t x =
314
-           HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) +
315
-           HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
316
-       if (x > UINT8_MAX)
317
-           r_.u8[i] = UINT8_MAX;
318
-       else
319
-           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
320
-   }
321
-#endif
322
-
323
-   return simde__m64_from_private(r_);
324
-#endif
325
-}
326
-#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
327
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
328
-#define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
329
-#define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
330
-#endif
331
-
332
-SIMDE_FUNCTION_ATTRIBUTES
333
-simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
334
-{
335
-#if defined(SIMDE_X86_MMX_NATIVE)
336
-   return _mm_adds_pi16(a, b);
337
-#else
338
-   simde__m64_private r_;
339
-   simde__m64_private a_ = simde__m64_to_private(a);
340
-   simde__m64_private b_ = simde__m64_to_private(b);
341
-
342
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
343
-   r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
344
-#else
345
-   SIMDE_VECTORIZE
346
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
347
-       if ((((b_.i16[i]) > 0) &&
348
-            ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
349
-           r_.i16[i] = INT16_MAX;
350
-       } else if ((((b_.i16[i]) < 0) &&
351
-               ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) {
352
-           r_.i16[i] = SHRT_MIN;
353
-       } else {
354
-           r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
355
-       }
356
-   }
357
-#endif
358
-
359
-   return simde__m64_from_private(r_);
360
-#endif
361
-}
362
-#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
363
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
364
-#define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
365
-#define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
366
-#endif
367
-
368
-SIMDE_FUNCTION_ATTRIBUTES
369
-simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
370
-{
371
-#if defined(SIMDE_X86_MMX_NATIVE)
372
-   return _mm_adds_pu16(a, b);
373
-#else
374
-   simde__m64_private r_;
375
-   simde__m64_private a_ = simde__m64_to_private(a);
376
-   simde__m64_private b_ = simde__m64_to_private(b);
377
-
378
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
379
-   r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
380
-#else
381
-   SIMDE_VECTORIZE
382
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
383
-       const uint32_t x = a_.u16[i] + b_.u16[i];
384
-       if (x > UINT16_MAX)
385
-           r_.u16[i] = UINT16_MAX;
386
-       else
387
-           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
388
-   }
389
-#endif
390
-
391
-   return simde__m64_from_private(r_);
392
-#endif
393
-}
394
-#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
395
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
396
-#define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
397
-#define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
398
-#endif
399
-
400
-SIMDE_FUNCTION_ATTRIBUTES
401
-simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
402
-{
403
-#if defined(SIMDE_X86_MMX_NATIVE)
404
-   return _mm_and_si64(a, b);
405
-#else
406
-   simde__m64_private r_;
407
-   simde__m64_private a_ = simde__m64_to_private(a);
408
-   simde__m64_private b_ = simde__m64_to_private(b);
409
-
410
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
411
-   r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
412
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
413
-   r_.i64 = a_.i64 & b_.i64;
414
-#else
415
-   r_.i64[0] = a_.i64[0] & b_.i64[0];
416
-#endif
417
-
418
-   return simde__m64_from_private(r_);
419
-#endif
420
-}
421
-#define simde_m_pand(a, b) simde_mm_and_si64(a, b)
422
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
423
-#define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
424
-#define _m_pand(a, b) simde_mm_and_si64(a, b)
425
-#endif
426
-
427
-SIMDE_FUNCTION_ATTRIBUTES
428
-simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
429
-{
430
-#if defined(SIMDE_X86_MMX_NATIVE)
431
-   return _mm_andnot_si64(a, b);
432
-#else
433
-   simde__m64_private r_;
434
-   simde__m64_private a_ = simde__m64_to_private(a);
435
-   simde__m64_private b_ = simde__m64_to_private(b);
436
-
437
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
438
-   r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
439
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
440
-   r_.i32f = ~a_.i32f & b_.i32f;
441
-#else
442
-   r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
443
-#endif
444
-
445
-   return simde__m64_from_private(r_);
446
-#endif
447
-}
448
-#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
449
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
450
-#define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
451
-#define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
452
-#endif
453
-
454
-SIMDE_FUNCTION_ATTRIBUTES
455
-simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
456
-{
457
-#if defined(SIMDE_X86_MMX_NATIVE)
458
-   return _mm_cmpeq_pi8(a, b);
459
-#else
460
-   simde__m64_private r_;
461
-   simde__m64_private a_ = simde__m64_to_private(a);
462
-   simde__m64_private b_ = simde__m64_to_private(b);
463
-
464
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
465
-   r_.neon_i8 = vreinterpret_s8_u8(vceq_s8(a_.neon_i8, b_.neon_i8));
466
-#else
467
-   SIMDE_VECTORIZE
468
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
469
-       r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
470
-   }
471
-#endif
472
-
473
-   return simde__m64_from_private(r_);
474
-#endif
475
-}
476
-#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
477
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
478
-#define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
479
-#define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
480
-#endif
481
-
482
-SIMDE_FUNCTION_ATTRIBUTES
483
-simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
484
-{
485
-#if defined(SIMDE_X86_MMX_NATIVE)
486
-   return _mm_cmpeq_pi16(a, b);
487
-#else
488
-   simde__m64_private r_;
489
-   simde__m64_private a_ = simde__m64_to_private(a);
490
-   simde__m64_private b_ = simde__m64_to_private(b);
491
-
492
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
493
-   r_.neon_i16 = vreinterpret_s16_u16(vceq_s16(a_.neon_i16, b_.neon_i16));
494
-#else
495
-   SIMDE_VECTORIZE
496
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
497
-       r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
498
-   }
499
-#endif
500
-
501
-   return simde__m64_from_private(r_);
502
-#endif
503
-}
504
-#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
505
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
506
-#define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
507
-#define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
508
-#endif
509
-
510
-SIMDE_FUNCTION_ATTRIBUTES
511
-simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
512
-{
513
-#if defined(SIMDE_X86_MMX_NATIVE)
514
-   return _mm_cmpeq_pi32(a, b);
515
-#else
516
-   simde__m64_private r_;
517
-   simde__m64_private a_ = simde__m64_to_private(a);
518
-   simde__m64_private b_ = simde__m64_to_private(b);
519
-
520
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
521
-   r_.neon_i32 = vreinterpret_s32_u32(vceq_s32(a_.neon_i32, b_.neon_i32));
522
-#else
523
-   SIMDE_VECTORIZE
524
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
525
-       r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
526
-   }
527
-#endif
528
-
529
-   return simde__m64_from_private(r_);
530
-#endif
531
-}
532
-#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
533
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
534
-#define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
535
-#define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
536
-#endif
537
-
538
-SIMDE_FUNCTION_ATTRIBUTES
539
-simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
540
-{
541
-#if defined(SIMDE_X86_MMX_NATIVE)
542
-   return _mm_cmpgt_pi8(a, b);
543
-#else
544
-   simde__m64_private r_;
545
-   simde__m64_private a_ = simde__m64_to_private(a);
546
-   simde__m64_private b_ = simde__m64_to_private(b);
547
-
548
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
549
-   r_.neon_i8 = vreinterpret_s8_u8(vcgt_s8(a_.neon_i8, b_.neon_i8));
550
-#else
551
-   SIMDE_VECTORIZE
552
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
553
-       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
554
-   }
555
-#endif
556
-
557
-   return simde__m64_from_private(r_);
558
-#endif
559
-}
560
-#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
561
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
562
-#define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
563
-#define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
564
-#endif
565
-
566
-SIMDE_FUNCTION_ATTRIBUTES
567
-simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
568
-{
569
-#if defined(SIMDE_X86_MMX_NATIVE)
570
-   return _mm_cmpgt_pi16(a, b);
571
-#else
572
-   simde__m64_private r_;
573
-   simde__m64_private a_ = simde__m64_to_private(a);
574
-   simde__m64_private b_ = simde__m64_to_private(b);
575
-
576
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
577
-   r_.neon_i16 = vreinterpret_s16_u16(vcgt_s16(a_.neon_i16, b_.neon_i16));
578
-#else
579
-   SIMDE_VECTORIZE
580
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
581
-       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
582
-   }
583
-#endif
584
-
585
-   return simde__m64_from_private(r_);
586
-#endif
587
-}
588
-#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
589
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
590
-#define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
591
-#define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
592
-#endif
593
-
594
-SIMDE_FUNCTION_ATTRIBUTES
595
-simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
596
-{
597
-#if defined(SIMDE_X86_MMX_NATIVE)
598
-   return _mm_cmpgt_pi32(a, b);
599
-#else
600
-   simde__m64_private r_;
601
-   simde__m64_private a_ = simde__m64_to_private(a);
602
-   simde__m64_private b_ = simde__m64_to_private(b);
603
-
604
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
605
-   r_.neon_i32 = vreinterpret_s32_u32(vcgt_s32(a_.neon_i32, b_.neon_i32));
606
-#else
607
-   SIMDE_VECTORIZE
608
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
609
-       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
610
-   }
611
-#endif
612
-
613
-   return simde__m64_from_private(r_);
614
-#endif
615
-}
616
-#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
617
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
618
-#define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
619
-#define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
620
-#endif
621
-
622
-SIMDE_FUNCTION_ATTRIBUTES
623
-int64_t simde_mm_cvtm64_si64(simde__m64 a)
624
-{
625
-#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
626
-   !defined(__PGI)
627
-   return _mm_cvtm64_si64(a);
628
-#else
629
-   simde__m64_private a_ = simde__m64_to_private(a);
630
-
631
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
632
-   return vget_lane_s64(a_.neon_i64, 0);
633
-#else
634
-   return a_.i64[0];
635
-#endif
636
-#endif
637
-}
638
-#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
639
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
640
-#define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
641
-#define _m_to_int64(a) simde_mm_cvtm64_si64(a)
642
-#endif
643
-
644
-SIMDE_FUNCTION_ATTRIBUTES
645
-simde__m64 simde_mm_cvtsi32_si64(int32_t a)
646
-{
647
-#if defined(SIMDE_X86_MMX_NATIVE)
648
-   return _mm_cvtsi32_si64(a);
649
-#else
650
-   simde__m64_private r_;
651
-
652
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
653
-   const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0};
654
-   r_.neon_i32 = vld1_s32(av);
655
-#else
656
-   r_.i32[0] = a;
657
-   r_.i32[1] = 0;
658
-#endif
659
-
660
-   return simde__m64_from_private(r_);
661
-#endif
662
-}
663
-#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
664
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
665
-#define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
666
-#define _m_from_int(a) simde_mm_cvtsi32_si64(a)
667
-#endif
668
-
669
-SIMDE_FUNCTION_ATTRIBUTES
670
-simde__m64 simde_mm_cvtsi64_m64(int64_t a)
671
-{
672
-#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
673
-   !defined(__PGI)
674
-   return _mm_cvtsi64_m64(a);
675
-#else
676
-   simde__m64_private r_;
677
-
678
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
679
-   r_.neon_i64 = vld1_s64(&a);
680
-#else
681
-   r_.i64[0] = a;
682
-#endif
683
-
684
-   return simde__m64_from_private(r_);
685
-#endif
686
-}
687
-#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
688
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
689
-#define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
690
-#define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
691
-#endif
692
-
693
-SIMDE_FUNCTION_ATTRIBUTES
694
-int32_t simde_mm_cvtsi64_si32(simde__m64 a)
695
-{
696
-#if defined(SIMDE_X86_MMX_NATIVE)
697
-   return _mm_cvtsi64_si32(a);
698
-#else
699
-   simde__m64_private a_ = simde__m64_to_private(a);
700
-
701
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
702
-   return vget_lane_s32(a_.neon_i32, 0);
703
-#else
704
-   return a_.i32[0];
705
-#endif
706
-#endif
707
-}
708
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
709
-#define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
710
-#endif
711
-
712
-SIMDE_FUNCTION_ATTRIBUTES
713
-void simde_mm_empty(void)
714
-{
715
-#if defined(SIMDE_X86_MMX_NATIVE)
716
-   _mm_empty();
717
-#else
718
-#endif
719
-}
720
-#define simde_m_empty() simde_mm_empty()
721
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
722
-#define _mm_empty() simde_mm_empty()
723
-#define _m_empty() simde_mm_empty()
724
-#endif
725
-
726
-SIMDE_FUNCTION_ATTRIBUTES
727
-simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
728
-{
729
-#if defined(SIMDE_X86_MMX_NATIVE)
730
-   return _mm_madd_pi16(a, b);
731
-#else
732
-   simde__m64_private r_;
733
-   simde__m64_private a_ = simde__m64_to_private(a);
734
-   simde__m64_private b_ = simde__m64_to_private(b);
735
-
736
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
737
-   int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
738
-   r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
739
-#else
740
-   SIMDE_VECTORIZE
741
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
742
-       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
743
-               (a_.i16[i + 1] * b_.i16[i + 1]);
744
-   }
745
-#endif
746
-
747
-   return simde__m64_from_private(r_);
748
-#endif
749
-}
750
-#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
751
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
752
-#define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
753
-#define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
754
-#endif
755
-
756
-SIMDE_FUNCTION_ATTRIBUTES
757
-simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
758
-{
759
-#if defined(SIMDE_X86_MMX_NATIVE)
760
-   return _mm_mulhi_pi16(a, b);
761
-#else
762
-   simde__m64_private r_;
763
-   simde__m64_private a_ = simde__m64_to_private(a);
764
-   simde__m64_private b_ = simde__m64_to_private(b);
765
-
766
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
767
-   const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
768
-   const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
769
-   const uint16x4_t t3 = vmovn_u32(t2);
770
-   r_.neon_i16 = vreinterpret_s16_u16(t3);
771
-#else
772
-   SIMDE_VECTORIZE
773
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
774
-       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t,
775
-                          ((a_.i16[i] * b_.i16[i]) >> 16));
776
-   }
777
-#endif
778
-
779
-   return simde__m64_from_private(r_);
780
-#endif
781
-}
782
-#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
783
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
784
-#define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
785
-#define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
786
-#endif
787
-
788
-SIMDE_FUNCTION_ATTRIBUTES
789
-simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
790
-{
791
-#if defined(SIMDE_X86_MMX_NATIVE)
792
-   return _mm_mullo_pi16(a, b);
793
-#else
794
-   simde__m64_private r_;
795
-   simde__m64_private a_ = simde__m64_to_private(a);
796
-   simde__m64_private b_ = simde__m64_to_private(b);
797
-
798
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
799
-   const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
800
-   const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
801
-   r_.neon_i16 = vreinterpret_s16_u16(t2);
802
-#else
803
-   SIMDE_VECTORIZE
804
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
805
-       r_.i16[i] = HEDLEY_STATIC_CAST(
806
-           int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
807
-   }
808
-#endif
809
-
810
-   return simde__m64_from_private(r_);
811
-#endif
812
-}
813
-#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
814
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
815
-#define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
816
-#define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
817
-#endif
818
-
819
-SIMDE_FUNCTION_ATTRIBUTES
820
-simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
821
-{
822
-#if defined(SIMDE_X86_MMX_NATIVE)
823
-   return _mm_or_si64(a, b);
824
-#else
825
-   simde__m64_private r_;
826
-   simde__m64_private a_ = simde__m64_to_private(a);
827
-   simde__m64_private b_ = simde__m64_to_private(b);
828
-
829
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
830
-   r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
831
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
832
-   r_.i64 = a_.i64 | b_.i64;
833
-#else
834
-   r_.i64[0] = a_.i64[0] | b_.i64[0];
835
-#endif
836
-
837
-   return simde__m64_from_private(r_);
838
-#endif
839
-}
840
-#define simde_m_por(a, b) simde_mm_or_si64(a, b)
841
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
842
-#define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
843
-#define _m_por(a, b) simde_mm_or_si64(a, b)
844
-#endif
845
-
846
-SIMDE_FUNCTION_ATTRIBUTES
847
-simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
848
-{
849
-#if defined(SIMDE_X86_MMX_NATIVE)
850
-   return _mm_packs_pi16(a, b);
851
-#else
852
-   simde__m64_private r_;
853
-   simde__m64_private a_ = simde__m64_to_private(a);
854
-   simde__m64_private b_ = simde__m64_to_private(b);
855
-
856
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
857
-   r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
858
-#else
859
-   SIMDE_VECTORIZE
860
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
861
-       if (a_.i16[i] < INT8_MIN) {
862
-           r_.i8[i] = INT8_MIN;
863
-       } else if (a_.i16[i] > INT8_MAX) {
864
-           r_.i8[i] = INT8_MAX;
865
-       } else {
866
-           r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
867
-       }
868
-   }
869
-
870
-   SIMDE_VECTORIZE
871
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
872
-       if (b_.i16[i] < INT8_MIN) {
873
-           r_.i8[i + 4] = INT8_MIN;
874
-       } else if (b_.i16[i] > INT8_MAX) {
875
-           r_.i8[i + 4] = INT8_MAX;
876
-       } else {
877
-           r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
878
-       }
879
-   }
880
-#endif
881
-
882
-   return simde__m64_from_private(r_);
883
-#endif
884
-}
885
-#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
886
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
887
-#define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
888
-#define _m_packsswb(a, b) mm_packs_pi16(a, b)
889
-#endif
890
-
891
-SIMDE_FUNCTION_ATTRIBUTES
892
-simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
893
-{
894
-#if defined(SIMDE_X86_MMX_NATIVE)
895
-   return _mm_packs_pi32(a, b);
896
-#else
897
-   simde__m64_private r_;
898
-   simde__m64_private a_ = simde__m64_to_private(a);
899
-   simde__m64_private b_ = simde__m64_to_private(b);
900
-
901
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
902
-   r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
903
-#else
904
-   SIMDE_VECTORIZE
905
-   for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
906
-       if (a_.i32[i] < SHRT_MIN) {
907
-           r_.i16[i] = SHRT_MIN;
908
-       } else if (a_.i32[i] > INT16_MAX) {
909
-           r_.i16[i] = INT16_MAX;
910
-       } else {
911
-           r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
912
-       }
913
-   }
914
-
915
-   SIMDE_VECTORIZE
916
-   for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) {
917
-       if (b_.i32[i] < SHRT_MIN) {
918
-           r_.i16[i + 2] = SHRT_MIN;
919
-       } else if (b_.i32[i] > INT16_MAX) {
920
-           r_.i16[i + 2] = INT16_MAX;
921
-       } else {
922
-           r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
923
-       }
924
-   }
925
-#endif
926
-
927
-   return simde__m64_from_private(r_);
928
-#endif
929
-}
930
-#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
931
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
932
-#define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
933
-#define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
934
-#endif
935
-
936
-SIMDE_FUNCTION_ATTRIBUTES
937
-simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
938
-{
939
-#if defined(SIMDE_X86_MMX_NATIVE)
940
-   return _mm_packs_pu16(a, b);
941
-#else
942
-   simde__m64_private r_;
943
-   simde__m64_private a_ = simde__m64_to_private(a);
944
-   simde__m64_private b_ = simde__m64_to_private(b);
945
-
946
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
947
-   const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
948
-
949
-   /* Set elements which are < 0 to 0 */
950
-   const int16x8_t t2 =
951
-       vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
952
-
953
-   /* Vector with all s16 elements set to UINT8_MAX */
954
-   const int16x8_t vmax = vmovq_n_s16((int16_t)UINT8_MAX);
955
-
956
-   /* Elements which are within the acceptable range */
957
-   const int16x8_t le_max =
958
-       vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
959
-   const int16x8_t gt_max =
960
-       vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
961
-
962
-   /* Final values as 16-bit integers */
963
-   const int16x8_t values = vorrq_s16(le_max, gt_max);
964
-
965
-   r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
966
-#else
967
-   SIMDE_VECTORIZE
968
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
969
-       if (a_.i16[i] > UINT8_MAX) {
970
-           r_.u8[i] = UINT8_MAX;
971
-       } else if (a_.i16[i] < 0) {
972
-           r_.u8[i] = 0;
973
-       } else {
974
-           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
975
-       }
976
-   }
977
-
978
-   SIMDE_VECTORIZE
979
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
980
-       if (b_.i16[i] > UINT8_MAX) {
981
-           r_.u8[i + 4] = UINT8_MAX;
982
-       } else if (b_.i16[i] < 0) {
983
-           r_.u8[i + 4] = 0;
984
-       } else {
985
-           r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
986
-       }
987
-   }
988
-#endif
989
-
990
-   return simde__m64_from_private(r_);
991
-#endif
992
-}
993
-#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
994
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
995
-#define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
996
-#define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
997
-#endif
998
-
999
-SIMDE_FUNCTION_ATTRIBUTES
1000
-simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
1001
-               int8_t e3, int8_t e2, int8_t e1, int8_t e0)
1002
-{
1003
-#if defined(SIMDE_X86_MMX_NATIVE)
1004
-   return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
1005
-#else
1006
-   simde__m64_private r_;
1007
-
1008
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1009
-   const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3,
1010
-                               e4, e5, e6, e7};
1011
-   r_.neon_i8 = vld1_s8(v);
1012
-#else
1013
-   r_.i8[0] = e0;
1014
-   r_.i8[1] = e1;
1015
-   r_.i8[2] = e2;
1016
-   r_.i8[3] = e3;
1017
-   r_.i8[4] = e4;
1018
-   r_.i8[5] = e5;
1019
-   r_.i8[6] = e6;
1020
-   r_.i8[7] = e7;
1021
-#endif
1022
-
1023
-   return simde__m64_from_private(r_);
1024
-#endif
1025
-}
1026
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1027
-#define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
1028
-   simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
1029
-#endif
1030
-
1031
-SIMDE_FUNCTION_ATTRIBUTES
1032
-simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
1033
-                 uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
1034
-{
1035
-   simde__m64_private r_;
1036
-
1037
-#if defined(SIMDE_X86_MMX_NATIVE)
1038
-   r_.n = _mm_set_pi8(
1039
-       HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6),
1040
-       HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4),
1041
-       HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2),
1042
-       HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0));
1043
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1044
-   const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3,
1045
-                                e4, e5, e6, e7};
1046
-   r_.neon_u8 = vld1_u8(v);
1047
-#else
1048
-   r_.u8[0] = e0;
1049
-   r_.u8[1] = e1;
1050
-   r_.u8[2] = e2;
1051
-   r_.u8[3] = e3;
1052
-   r_.u8[4] = e4;
1053
-   r_.u8[5] = e5;
1054
-   r_.u8[6] = e6;
1055
-   r_.u8[7] = e7;
1056
-#endif
1057
-
1058
-   return simde__m64_from_private(r_);
1059
-}
1060
-
1061
-SIMDE_FUNCTION_ATTRIBUTES
1062
-simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
1063
-{
1064
-#if defined(SIMDE_X86_MMX_NATIVE)
1065
-   return _mm_set_pi16(e3, e2, e1, e0);
1066
-#else
1067
-   simde__m64_private r_;
1068
-
1069
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1070
-   const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3};
1071
-   r_.neon_i16 = vld1_s16(v);
1072
-#else
1073
-   r_.i16[0] = e0;
1074
-   r_.i16[1] = e1;
1075
-   r_.i16[2] = e2;
1076
-   r_.i16[3] = e3;
1077
-#endif
1078
-   return simde__m64_from_private(r_);
1079
-#endif
1080
-}
1081
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1082
-#define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
1083
-#endif
1084
-
1085
-SIMDE_FUNCTION_ATTRIBUTES
1086
-simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
1087
-                  uint16_t e0)
1088
-{
1089
-   simde__m64_private r_;
1090
-
1091
-#if defined(SIMDE_X86_MMX_NATIVE)
1092
-   r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3),
1093
-               HEDLEY_STATIC_CAST(int16_t, e2),
1094
-               HEDLEY_STATIC_CAST(int16_t, e1),
1095
-               HEDLEY_STATIC_CAST(int16_t, e0));
1096
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1097
-   const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3};
1098
-   r_.neon_u16 = vld1_u16(v);
1099
-#else
1100
-   r_.u16[0] = e0;
1101
-   r_.u16[1] = e1;
1102
-   r_.u16[2] = e2;
1103
-   r_.u16[3] = e3;
1104
-#endif
1105
-
1106
-   return simde__m64_from_private(r_);
1107
-}
1108
-
1109
-SIMDE_FUNCTION_ATTRIBUTES
1110
-simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
1111
-{
1112
-   simde__m64_private r_;
1113
-
1114
-#if defined(SIMDE_X86_MMX_NATIVE)
1115
-   r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1),
1116
-               HEDLEY_STATIC_CAST(int32_t, e0));
1117
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1118
-   const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1};
1119
-   r_.neon_u32 = vld1_u32(v);
1120
-#else
1121
-   r_.u32[0] = e0;
1122
-   r_.u32[1] = e1;
1123
-#endif
1124
-
1125
-   return simde__m64_from_private(r_);
1126
-}
1127
-
1128
-SIMDE_FUNCTION_ATTRIBUTES
1129
-simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
1130
-{
1131
-   simde__m64_private r_;
1132
-
1133
-#if defined(SIMDE_X86_MMX_NATIVE)
1134
-   r_.n = _mm_set_pi32(e1, e0);
1135
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1136
-   const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1};
1137
-   r_.neon_i32 = vld1_s32(v);
1138
-#else
1139
-   r_.i32[0] = e0;
1140
-   r_.i32[1] = e1;
1141
-#endif
1142
-
1143
-   return simde__m64_from_private(r_);
1144
-}
1145
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1146
-#define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
1147
-#endif
1148
-
1149
-SIMDE_FUNCTION_ATTRIBUTES
1150
-simde__m64 simde_x_mm_set_pi64(int64_t e0)
1151
-{
1152
-   simde__m64_private r_;
1153
-
1154
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1155
-   const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0};
1156
-   r_.neon_i64 = vld1_s64(v);
1157
-#else
1158
-   r_.i64[0] = e0;
1159
-#endif
1160
-
1161
-   return simde__m64_from_private(r_);
1162
-}
1163
-
1164
-SIMDE_FUNCTION_ATTRIBUTES
1165
-simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0)
1166
-{
1167
-   simde__m64_private r_;
1168
-
1169
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1170
-   const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1};
1171
-   r_.neon_f32 = vld1_f32(v);
1172
-#else
1173
-   r_.f32[0] = e0;
1174
-   r_.f32[1] = e1;
1175
-#endif
1176
-
1177
-   return simde__m64_from_private(r_);
1178
-}
1179
-
1180
-SIMDE_FUNCTION_ATTRIBUTES
1181
-simde__m64 simde_mm_set1_pi8(int8_t a)
1182
-{
1183
-#if defined(SIMDE_X86_MMX_NATIVE)
1184
-   return _mm_set1_pi8(a);
1185
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1186
-   simde__m64_private r_;
1187
-   r_.neon_i8 = vmov_n_s8(a);
1188
-   return simde__m64_from_private(r_);
1189
-#else
1190
-   return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
1191
-#endif
1192
-}
1193
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1194
-#define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
1195
-#endif
1196
-
1197
-SIMDE_FUNCTION_ATTRIBUTES
1198
-simde__m64 simde_mm_set1_pi16(int16_t a)
1199
-{
1200
-#if defined(SIMDE_X86_MMX_NATIVE)
1201
-   return _mm_set1_pi16(a);
1202
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1203
-   simde__m64_private r_;
1204
-   r_.neon_i16 = vmov_n_s16(a);
1205
-   return simde__m64_from_private(r_);
1206
-#else
1207
-   return simde_mm_set_pi16(a, a, a, a);
1208
-#endif
1209
-}
1210
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1211
-#define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
1212
-#endif
1213
-
1214
-SIMDE_FUNCTION_ATTRIBUTES
1215
-simde__m64 simde_mm_set1_pi32(int32_t a)
1216
-{
1217
-#if defined(SIMDE_X86_MMX_NATIVE)
1218
-   return _mm_set1_pi32(a);
1219
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1220
-   simde__m64_private r_;
1221
-   r_.neon_i32 = vmov_n_s32(a);
1222
-   return simde__m64_from_private(r_);
1223
-#else
1224
-   return simde_mm_set_pi32(a, a);
1225
-#endif
1226
-}
1227
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1228
-#define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
1229
-#endif
1230
-
1231
-SIMDE_FUNCTION_ATTRIBUTES
1232
-simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
1233
-                int8_t e3, int8_t e2, int8_t e1, int8_t e0)
1234
-{
1235
-#if defined(SIMDE_X86_MMX_NATIVE)
1236
-   return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
1237
-#else
1238
-   return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
1239
-#endif
1240
-}
1241
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1242
-#define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
1243
-   simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
1244
-#endif
1245
-
1246
-SIMDE_FUNCTION_ATTRIBUTES
1247
-simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
1248
-{
1249
-#if defined(SIMDE_X86_MMX_NATIVE)
1250
-   return _mm_setr_pi16(e3, e2, e1, e0);
1251
-#else
1252
-   return simde_mm_set_pi16(e0, e1, e2, e3);
1253
-#endif
1254
-}
1255
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1256
-#define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
1257
-#endif
1258
-
1259
-SIMDE_FUNCTION_ATTRIBUTES
1260
-simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
1261
-{
1262
-#if defined(SIMDE_X86_MMX_NATIVE)
1263
-   return _mm_setr_pi32(e1, e0);
1264
-#else
1265
-   return simde_mm_set_pi32(e0, e1);
1266
-#endif
1267
-}
1268
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1269
-#define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
1270
-#endif
1271
-
1272
-SIMDE_FUNCTION_ATTRIBUTES
1273
-simde__m64 simde_mm_setzero_si64(void)
1274
-{
1275
-#if defined(SIMDE_X86_MMX_NATIVE)
1276
-   return _mm_setzero_si64();
1277
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1278
-   simde__m64_private r_;
1279
-   r_.neon_u32 = vmov_n_u32(0);
1280
-   return simde__m64_from_private(r_);
1281
-#else
1282
-   return simde_mm_set_pi32(0, 0);
1283
-#endif
1284
-}
1285
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1286
-#define _mm_setzero_si64() simde_mm_setzero_si64()
1287
-#endif
1288
-
1289
-SIMDE_FUNCTION_ATTRIBUTES
1290
-simde__m64 simde_x_mm_setone_si64(void)
1291
-{
1292
-   return simde_mm_set1_pi32(~INT32_C(0));
1293
-}
1294
-
1295
-SIMDE_FUNCTION_ATTRIBUTES
1296
-simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
1297
-{
1298
-#if defined(SIMDE_X86_MMX_NATIVE)
1299
-   return _mm_sll_pi16(a, count);
1300
-#else
1301
-   simde__m64_private r_;
1302
-   simde__m64_private a_ = simde__m64_to_private(a);
1303
-   simde__m64_private count_ = simde__m64_to_private(count);
1304
-
1305
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1306
-   r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)vget_lane_u64(
1307
-                           count_.neon_u64, 0)));
1308
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1309
-   r_.i16 = a_.i16 << count_.u64[0];
1310
-#else
1311
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1312
-       simde_memset(&r_, 0, sizeof(r_));
1313
-       return simde__m64_from_private(r_);
1314
-   }
1315
-
1316
-   SIMDE_VECTORIZE
1317
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1318
-       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
1319
-                          a_.u16[i] << count_.u64[0]);
1320
-   }
1321
-#endif
1322
-
1323
-   return simde__m64_from_private(r_);
1324
-#endif
1325
-}
1326
-#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
1327
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1328
-#define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
1329
-#define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
1330
-#endif
1331
-
1332
-SIMDE_FUNCTION_ATTRIBUTES
1333
-simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
1334
-{
1335
-#if defined(SIMDE_X86_MMX_NATIVE)
1336
-   return _mm_sll_pi32(a, count);
1337
-#else
1338
-   simde__m64_private r_;
1339
-   simde__m64_private a_ = simde__m64_to_private(a);
1340
-   simde__m64_private count_ = simde__m64_to_private(count);
1341
-
1342
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1343
-   r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)vget_lane_u64(
1344
-                           count_.neon_u64, 0)));
1345
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1346
-   r_.i32 = a_.i32 << count_.u64[0];
1347
-#else
1348
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1349
-       simde_memset(&r_, 0, sizeof(r_));
1350
-       return simde__m64_from_private(r_);
1351
-   }
1352
-
1353
-   SIMDE_VECTORIZE
1354
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1355
-       r_.u32[i] = a_.u32[i] << count_.u64[0];
1356
-   }
1357
-#endif
1358
-
1359
-   return simde__m64_from_private(r_);
1360
-#endif
1361
-}
1362
-#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
1363
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1364
-#define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
1365
-#define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
1366
-#endif
1367
-
1368
-SIMDE_FUNCTION_ATTRIBUTES
1369
-simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
1370
-{
1371
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1372
-   return _mm_slli_pi16(a, count);
1373
-#else
1374
-   simde__m64_private r_;
1375
-   simde__m64_private a_ = simde__m64_to_private(a);
1376
-
1377
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1378
-   r_.i16 = a_.i16 << count;
1379
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1380
-   r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
1381
-#else
1382
-   SIMDE_VECTORIZE
1383
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1384
-       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
1385
-   }
1386
-#endif
1387
-
1388
-   return simde__m64_from_private(r_);
1389
-#endif
1390
-}
1391
-#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1392
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1393
-#define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
1394
-#define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1395
-#endif
1396
-
1397
-SIMDE_FUNCTION_ATTRIBUTES
1398
-simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
1399
-{
1400
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1401
-   return _mm_slli_pi32(a, count);
1402
-#else
1403
-   simde__m64_private r_;
1404
-   simde__m64_private a_ = simde__m64_to_private(a);
1405
-
1406
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1407
-   r_.i32 = a_.i32 << count;
1408
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1409
-   r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
1410
-#else
1411
-   SIMDE_VECTORIZE
1412
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1413
-       r_.u32[i] = a_.u32[i] << count;
1414
-   }
1415
-#endif
1416
-
1417
-   return simde__m64_from_private(r_);
1418
-#endif
1419
-}
1420
-#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
1421
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1422
-#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
1423
-#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
1424
-#endif
1425
-
1426
-SIMDE_FUNCTION_ATTRIBUTES
1427
-simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
1428
-{
1429
-#if defined(SIMDE_X86_MMX_NATIVE)
1430
-   return _mm_slli_si64(a, count);
1431
-#else
1432
-   simde__m64_private r_;
1433
-   simde__m64_private a_ = simde__m64_to_private(a);
1434
-
1435
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1436
-   r_.i64 = a_.i64 << count;
1437
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1438
-   r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count));
1439
-#else
1440
-   r_.u64[0] = a_.u64[0] << count;
1441
-#endif
1442
-
1443
-   return simde__m64_from_private(r_);
1444
-#endif
1445
-}
1446
-#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
1447
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1448
-#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
1449
-#define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
1450
-#endif
1451
-
1452
-SIMDE_FUNCTION_ATTRIBUTES
1453
-simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
1454
-{
1455
-#if defined(SIMDE_X86_MMX_NATIVE)
1456
-   return _mm_sll_si64(a, count);
1457
-#else
1458
-   simde__m64_private r_;
1459
-   simde__m64_private a_ = simde__m64_to_private(a);
1460
-   simde__m64_private count_ = simde__m64_to_private(count);
1461
-
1462
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1463
-   r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
1464
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1465
-   r_.i64 = a_.i64 << count_.i64;
1466
-#else
1467
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1468
-       simde_memset(&r_, 0, sizeof(r_));
1469
-       return simde__m64_from_private(r_);
1470
-   }
1471
-
1472
-   r_.u64[0] = a_.u64[0] << count_.u64[0];
1473
-#endif
1474
-
1475
-   return simde__m64_from_private(r_);
1476
-#endif
1477
-}
1478
-#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
1479
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1480
-#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
1481
-#define _m_psllq(a, count) simde_mm_sll_si64(a, count)
1482
-#endif
1483
-
1484
-SIMDE_FUNCTION_ATTRIBUTES
1485
-simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
1486
-{
1487
-#if defined(SIMDE_X86_MMX_NATIVE)
1488
-   return _mm_srl_pi16(a, count);
1489
-#else
1490
-   simde__m64_private r_;
1491
-   simde__m64_private a_ = simde__m64_to_private(a);
1492
-   simde__m64_private count_ = simde__m64_to_private(count);
1493
-
1494
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1495
-   r_.u16 = a_.u16 >> count_.u64[0];
1496
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1497
-   r_.neon_u16 = vshl_u16(
1498
-       a_.neon_u16,
1499
-       vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0))));
1500
-#else
1501
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1502
-       simde_memset(&r_, 0, sizeof(r_));
1503
-       return simde__m64_from_private(r_);
1504
-   }
1505
-
1506
-   SIMDE_VECTORIZE
1507
-   for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) {
1508
-       r_.u16[i] = a_.u16[i] >> count_.u64[0];
1509
-   }
1510
-#endif
1511
-
1512
-   return simde__m64_from_private(r_);
1513
-#endif
1514
-}
1515
-#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1516
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1517
-#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
1518
-#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1519
-#endif
1520
-
1521
-SIMDE_FUNCTION_ATTRIBUTES
1522
-simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
1523
-{
1524
-#if defined(SIMDE_X86_MMX_NATIVE)
1525
-   return _mm_srl_pi32(a, count);
1526
-#else
1527
-   simde__m64_private r_;
1528
-   simde__m64_private a_ = simde__m64_to_private(a);
1529
-   simde__m64_private count_ = simde__m64_to_private(count);
1530
-
1531
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1532
-   r_.u32 = a_.u32 >> count_.u64[0];
1533
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1534
-   r_.neon_u32 = vshl_u32(
1535
-       a_.neon_u32,
1536
-       vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0))));
1537
-#else
1538
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1539
-       simde_memset(&r_, 0, sizeof(r_));
1540
-       return simde__m64_from_private(r_);
1541
-   }
1542
-
1543
-   SIMDE_VECTORIZE
1544
-   for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) {
1545
-       r_.u32[i] = a_.u32[i] >> count_.u64[0];
1546
-   }
1547
-#endif
1548
-
1549
-   return simde__m64_from_private(r_);
1550
-#endif
1551
-}
1552
-#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
1553
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1554
-#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
1555
-#define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
1556
-#endif
1557
-
1558
-SIMDE_FUNCTION_ATTRIBUTES
1559
-simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
1560
-{
1561
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1562
-   return _mm_srli_pi16(a, count);
1563
-#else
1564
-   simde__m64_private r_;
1565
-   simde__m64_private a_ = simde__m64_to_private(a);
1566
-
1567
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1568
-   r_.u16 = a_.u16 >> count;
1569
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1570
-   r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
1571
-#else
1572
-   SIMDE_VECTORIZE
1573
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1574
-       r_.u16[i] = a_.u16[i] >> count;
1575
-   }
1576
-#endif
1577
-
1578
-   return simde__m64_from_private(r_);
1579
-#endif
1580
-}
1581
-#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1582
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1583
-#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
1584
-#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1585
-#endif
1586
-
1587
-SIMDE_FUNCTION_ATTRIBUTES
1588
-simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
1589
-{
1590
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1591
-   return _mm_srli_pi32(a, count);
1592
-#else
1593
-   simde__m64_private r_;
1594
-   simde__m64_private a_ = simde__m64_to_private(a);
1595
-
1596
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1597
-   r_.u32 = a_.u32 >> count;
1598
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1599
-   r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
1600
-#else
1601
-   SIMDE_VECTORIZE
1602
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1603
-       r_.u32[i] = a_.u32[i] >> count;
1604
-   }
1605
-#endif
1606
-
1607
-   return simde__m64_from_private(r_);
1608
-#endif
1609
-}
1610
-#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1611
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1612
-#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
1613
-#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1614
-#endif
1615
-
1616
-SIMDE_FUNCTION_ATTRIBUTES
1617
-simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
1618
-{
1619
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1620
-   return _mm_srli_si64(a, count);
1621
-#else
1622
-   simde__m64_private r_;
1623
-   simde__m64_private a_ = simde__m64_to_private(a);
1624
-
1625
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1626
-   r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
1627
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1628
-   r_.u64 = a_.u64 >> count;
1629
-#else
1630
-   r_.u64[0] = a_.u64[0] >> count;
1631
-#endif
1632
-
1633
-   return simde__m64_from_private(r_);
1634
-#endif
1635
-}
1636
-#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1637
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1638
-#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
1639
-#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1640
-#endif
1641
-
1642
-SIMDE_FUNCTION_ATTRIBUTES
1643
-simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
1644
-{
1645
-#if defined(SIMDE_X86_MMX_NATIVE)
1646
-   return _mm_srl_si64(a, count);
1647
-#else
1648
-   simde__m64_private r_;
1649
-   simde__m64_private a_ = simde__m64_to_private(a);
1650
-   simde__m64_private count_ = simde__m64_to_private(count);
1651
-
1652
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1653
-   r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
1654
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1655
-   r_.u64 = a_.u64 >> count_.u64;
1656
-#else
1657
-   if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1658
-       simde_memset(&r_, 0, sizeof(r_));
1659
-       return simde__m64_from_private(r_);
1660
-   }
1661
-
1662
-   r_.u64[0] = a_.u64[0] >> count_.u64[0];
1663
-#endif
1664
-
1665
-   return simde__m64_from_private(r_);
1666
-#endif
1667
-}
1668
-#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
1669
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1670
-#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
1671
-#define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
1672
-#endif
1673
-
1674
-SIMDE_FUNCTION_ATTRIBUTES
1675
-simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
1676
-{
1677
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1678
-   return _mm_srai_pi16(a, count);
1679
-#else
1680
-   simde__m64_private r_;
1681
-   simde__m64_private a_ = simde__m64_to_private(a);
1682
-
1683
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1684
-   r_.i16 = a_.i16 >> (count & 0xff);
1685
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1686
-   r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
1687
-#else
1688
-   SIMDE_VECTORIZE
1689
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1690
-       r_.i16[i] = a_.i16[i] >> (count & 0xff);
1691
-   }
1692
-#endif
1693
-
1694
-   return simde__m64_from_private(r_);
1695
-#endif
1696
-}
1697
-#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1698
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1699
-#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
1700
-#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1701
-#endif
1702
-
1703
-SIMDE_FUNCTION_ATTRIBUTES
1704
-simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
1705
-{
1706
-#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1707
-   return _mm_srai_pi32(a, count);
1708
-#else
1709
-   simde__m64_private r_;
1710
-   simde__m64_private a_ = simde__m64_to_private(a);
1711
-
1712
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1713
-   r_.i32 = a_.i32 >> (count & 0xff);
1714
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1715
-   r_.neon_i32 = vshl_s32(a_.neon_i32,
1716
-                  vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
1717
-#else
1718
-   SIMDE_VECTORIZE
1719
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1720
-       r_.i32[i] = a_.i32[i] >> (count & 0xff);
1721
-   }
1722
-#endif
1723
-
1724
-   return simde__m64_from_private(r_);
1725
-#endif
1726
-}
1727
-#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
1728
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1729
-#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
1730
-#define _m_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
1731
-#endif
1732
-
1733
-SIMDE_FUNCTION_ATTRIBUTES
1734
-simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
1735
-{
1736
-#if defined(SIMDE_X86_MMX_NATIVE)
1737
-   return _mm_sra_pi16(a, count);
1738
-#else
1739
-   simde__m64_private r_;
1740
-   simde__m64_private a_ = simde__m64_to_private(a);
1741
-   simde__m64_private count_ = simde__m64_to_private(count);
1742
-   const int cnt = HEDLEY_STATIC_CAST(
1743
-       int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
1744
-
1745
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1746
-   r_.i16 = a_.i16 >> cnt;
1747
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1748
-   r_.neon_i16 =
1749
-       vshl_s16(a_.neon_i16,
1750
-            vmov_n_s16(-HEDLEY_STATIC_CAST(
1751
-                int16_t, vget_lane_u64(count_.neon_u64, 0))));
1752
-#else
1753
-   SIMDE_VECTORIZE
1754
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1755
-       r_.i16[i] = a_.i16[i] >> cnt;
1756
-   }
1757
-#endif
1758
-
1759
-   return simde__m64_from_private(r_);
1760
-#endif
1761
-}
1762
-#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
1763
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1764
-#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
1765
-#define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
1766
-#endif
1767
-
1768
-SIMDE_FUNCTION_ATTRIBUTES
1769
-simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
1770
-{
1771
-#if defined(SIMDE_X86_MMX_NATIVE)
1772
-   return _mm_sra_pi32(a, count);
1773
-#else
1774
-   simde__m64_private r_;
1775
-   simde__m64_private a_ = simde__m64_to_private(a);
1776
-   simde__m64_private count_ = simde__m64_to_private(count);
1777
-   const int32_t cnt =
1778
-       (count_.u64[0] > 31)
1779
-           ? 31
1780
-           : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
1781
-
1782
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1783
-   r_.i32 = a_.i32 >> cnt;
1784
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1785
-   r_.neon_i32 =
1786
-       vshl_s32(a_.neon_i32,
1787
-            vmov_n_s32(-HEDLEY_STATIC_CAST(
1788
-                int32_t, vget_lane_u64(count_.neon_u64, 0))));
1789
-#else
1790
-   SIMDE_VECTORIZE
1791
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1792
-       r_.i32[i] = a_.i32[i] >> cnt;
1793
-   }
1794
-#endif
1795
-
1796
-   return simde__m64_from_private(r_);
1797
-#endif
1798
-}
1799
-#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
1800
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1801
-#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
1802
-#define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
1803
-#endif
1804
-
1805
-SIMDE_FUNCTION_ATTRIBUTES
1806
-simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
1807
-{
1808
-#if defined(SIMDE_X86_MMX_NATIVE)
1809
-   return _mm_sub_pi8(a, b);
1810
-#else
1811
-   simde__m64_private r_;
1812
-   simde__m64_private a_ = simde__m64_to_private(a);
1813
-   simde__m64_private b_ = simde__m64_to_private(b);
1814
-
1815
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1816
-   r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
1817
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1818
-   r_.i8 = a_.i8 - b_.i8;
1819
-#else
1820
-   SIMDE_VECTORIZE
1821
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1822
-       r_.i8[i] = a_.i8[i] - b_.i8[i];
1823
-   }
1824
-#endif
1825
-
1826
-   return simde__m64_from_private(r_);
1827
-#endif
1828
-}
1829
-#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
1830
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1831
-#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
1832
-#define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
1833
-#endif
1834
-
1835
-SIMDE_FUNCTION_ATTRIBUTES
1836
-simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
1837
-{
1838
-#if defined(SIMDE_X86_MMX_NATIVE)
1839
-   return _mm_sub_pi16(a, b);
1840
-#else
1841
-   simde__m64_private r_;
1842
-   simde__m64_private a_ = simde__m64_to_private(a);
1843
-   simde__m64_private b_ = simde__m64_to_private(b);
1844
-
1845
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1846
-   r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
1847
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1848
-   r_.i16 = a_.i16 - b_.i16;
1849
-#else
1850
-   SIMDE_VECTORIZE
1851
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1852
-       r_.i16[i] = a_.i16[i] - b_.i16[i];
1853
-   }
1854
-#endif
1855
-
1856
-   return simde__m64_from_private(r_);
1857
-#endif
1858
-}
1859
-#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
1860
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1861
-#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
1862
-#define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
1863
-#endif
1864
-
1865
-SIMDE_FUNCTION_ATTRIBUTES
1866
-simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
1867
-{
1868
-#if defined(SIMDE_X86_MMX_NATIVE)
1869
-   return _mm_sub_pi32(a, b);
1870
-#else
1871
-   simde__m64_private r_;
1872
-   simde__m64_private a_ = simde__m64_to_private(a);
1873
-   simde__m64_private b_ = simde__m64_to_private(b);
1874
-
1875
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1876
-   r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
1877
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1878
-   r_.i32 = a_.i32 - b_.i32;
1879
-#else
1880
-   SIMDE_VECTORIZE
1881
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1882
-       r_.i32[i] = a_.i32[i] - b_.i32[i];
1883
-   }
1884
-#endif
1885
-
1886
-   return simde__m64_from_private(r_);
1887
-#endif
1888
-}
1889
-#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
1890
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1891
-#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
1892
-#define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
1893
-#endif
1894
-
1895
-SIMDE_FUNCTION_ATTRIBUTES
1896
-simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
1897
-{
1898
-#if defined(SIMDE_X86_MMX_NATIVE)
1899
-   return _mm_subs_pi8(a, b);
1900
-#else
1901
-   simde__m64_private r_;
1902
-   simde__m64_private a_ = simde__m64_to_private(a);
1903
-   simde__m64_private b_ = simde__m64_to_private(b);
1904
-
1905
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1906
-   r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
1907
-#else
1908
-   SIMDE_VECTORIZE
1909
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1910
-       if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
1911
-           r_.i8[i] = INT8_MIN;
1912
-       } else if ((b_.i8[i]) < 0 &&
1913
-              (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
1914
-           r_.i8[i] = INT8_MAX;
1915
-       } else {
1916
-           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
1917
-       }
1918
-   }
1919
-#endif
1920
-
1921
-   return simde__m64_from_private(r_);
1922
-#endif
1923
-}
1924
-#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
1925
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1926
-#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
1927
-#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
1928
-#endif
1929
-
1930
-SIMDE_FUNCTION_ATTRIBUTES
1931
-simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
1932
-{
1933
-#if defined(SIMDE_X86_MMX_NATIVE)
1934
-   return _mm_subs_pu8(a, b);
1935
-#else
1936
-   simde__m64_private r_;
1937
-   simde__m64_private a_ = simde__m64_to_private(a);
1938
-   simde__m64_private b_ = simde__m64_to_private(b);
1939
-
1940
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1941
-   r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
1942
-#else
1943
-   SIMDE_VECTORIZE
1944
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
1945
-       const int32_t x = a_.u8[i] - b_.u8[i];
1946
-       if (x < 0) {
1947
-           r_.u8[i] = 0;
1948
-       } else if (x > UINT8_MAX) {
1949
-           r_.u8[i] = UINT8_MAX;
1950
-       } else {
1951
-           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
1952
-       }
1953
-   }
1954
-#endif
1955
-
1956
-   return simde__m64_from_private(r_);
1957
-#endif
1958
-}
1959
-#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
1960
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1961
-#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
1962
-#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
1963
-#endif
1964
-
1965
-SIMDE_FUNCTION_ATTRIBUTES
1966
-simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
1967
-{
1968
-#if defined(SIMDE_X86_MMX_NATIVE)
1969
-   return _mm_subs_pi16(a, b);
1970
-#else
1971
-   simde__m64_private r_;
1972
-   simde__m64_private a_ = simde__m64_to_private(a);
1973
-   simde__m64_private b_ = simde__m64_to_private(b);
1974
-
1975
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1976
-   r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
1977
-#else
1978
-   SIMDE_VECTORIZE
1979
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1980
-       if (((b_.i16[i]) > 0 && (a_.i16[i]) < SHRT_MIN + (b_.i16[i]))) {
1981
-           r_.i16[i] = SHRT_MIN;
1982
-       } else if ((b_.i16[i]) < 0 &&
1983
-              (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
1984
-           r_.i16[i] = INT16_MAX;
1985
-       } else {
1986
-           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
1987
-       }
1988
-   }
1989
-#endif
1990
-
1991
-   return simde__m64_from_private(r_);
1992
-#endif
1993
-}
1994
-#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
1995
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1996
-#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
1997
-#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
1998
-#endif
1999
-
2000
-SIMDE_FUNCTION_ATTRIBUTES
2001
-simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
2002
-{
2003
-#if defined(SIMDE_X86_MMX_NATIVE)
2004
-   return _mm_subs_pu16(a, b);
2005
-#else
2006
-   simde__m64_private r_;
2007
-   simde__m64_private a_ = simde__m64_to_private(a);
2008
-   simde__m64_private b_ = simde__m64_to_private(b);
2009
-
2010
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2011
-   r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
2012
-#else
2013
-   SIMDE_VECTORIZE
2014
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
2015
-       const int x = a_.u16[i] - b_.u16[i];
2016
-       if (x < 0) {
2017
-           r_.u16[i] = 0;
2018
-       } else if (x > UINT16_MAX) {
2019
-           r_.u16[i] = UINT16_MAX;
2020
-       } else {
2021
-           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
2022
-       }
2023
-   }
2024
-#endif
2025
-
2026
-   return simde__m64_from_private(r_);
2027
-#endif
2028
-}
2029
-#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
2030
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2031
-#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
2032
-#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
2033
-#endif
2034
-
2035
-SIMDE_FUNCTION_ATTRIBUTES
2036
-simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
2037
-{
2038
-#if defined(SIMDE_X86_MMX_NATIVE)
2039
-   return _mm_unpackhi_pi8(a, b);
2040
-#else
2041
-   simde__m64_private r_;
2042
-   simde__m64_private a_ = simde__m64_to_private(a);
2043
-   simde__m64_private b_ = simde__m64_to_private(b);
2044
-
2045
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2046
-   r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
2047
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2048
-   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14,
2049
-                     7, 15);
2050
-#else
2051
-   r_.i8[0] = a_.i8[4];
2052
-   r_.i8[1] = b_.i8[4];
2053
-   r_.i8[2] = a_.i8[5];
2054
-   r_.i8[3] = b_.i8[5];
2055
-   r_.i8[4] = a_.i8[6];
2056
-   r_.i8[5] = b_.i8[6];
2057
-   r_.i8[6] = a_.i8[7];
2058
-   r_.i8[7] = b_.i8[7];
2059
-#endif
2060
-
2061
-   return simde__m64_from_private(r_);
2062
-#endif
2063
-}
2064
-#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2065
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2066
-#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
2067
-#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2068
-#endif
2069
-
2070
-SIMDE_FUNCTION_ATTRIBUTES
2071
-simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
2072
-{
2073
-#if defined(SIMDE_X86_MMX_NATIVE)
2074
-   return _mm_unpackhi_pi16(a, b);
2075
-#else
2076
-   simde__m64_private r_;
2077
-   simde__m64_private a_ = simde__m64_to_private(a);
2078
-   simde__m64_private b_ = simde__m64_to_private(b);
2079
-
2080
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2081
-   r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
2082
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2083
-   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
2084
-#else
2085
-   r_.i16[0] = a_.i16[2];
2086
-   r_.i16[1] = b_.i16[2];
2087
-   r_.i16[2] = a_.i16[3];
2088
-   r_.i16[3] = b_.i16[3];
2089
-#endif
2090
-
2091
-   return simde__m64_from_private(r_);
2092
-#endif
2093
-}
2094
-#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2095
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2096
-#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
2097
-#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2098
-#endif
2099
-
2100
-SIMDE_FUNCTION_ATTRIBUTES
2101
-simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
2102
-{
2103
-#if defined(SIMDE_X86_MMX_NATIVE)
2104
-   return _mm_unpackhi_pi32(a, b);
2105
-#else
2106
-   simde__m64_private r_;
2107
-   simde__m64_private a_ = simde__m64_to_private(a);
2108
-   simde__m64_private b_ = simde__m64_to_private(b);
2109
-
2110
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2111
-   r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
2112
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2113
-   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
2114
-#else
2115
-   r_.i32[0] = a_.i32[1];
2116
-   r_.i32[1] = b_.i32[1];
2117
-#endif
2118
-
2119
-   return simde__m64_from_private(r_);
2120
-#endif
2121
-}
2122
-#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2123
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2124
-#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
2125
-#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2126
-#endif
2127
-
2128
-SIMDE_FUNCTION_ATTRIBUTES
2129
-simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
2130
-{
2131
-#if defined(SIMDE_X86_MMX_NATIVE)
2132
-   return _mm_unpacklo_pi8(a, b);
2133
-#else
2134
-   simde__m64_private r_;
2135
-   simde__m64_private a_ = simde__m64_to_private(a);
2136
-   simde__m64_private b_ = simde__m64_to_private(b);
2137
-
2138
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2139
-   r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
2140
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2141
-   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3,
2142
-                     11);
2143
-#else
2144
-   r_.i8[0] = a_.i8[0];
2145
-   r_.i8[1] = b_.i8[0];
2146
-   r_.i8[2] = a_.i8[1];
2147
-   r_.i8[3] = b_.i8[1];
2148
-   r_.i8[4] = a_.i8[2];
2149
-   r_.i8[5] = b_.i8[2];
2150
-   r_.i8[6] = a_.i8[3];
2151
-   r_.i8[7] = b_.i8[3];
2152
-#endif
2153
-
2154
-   return simde__m64_from_private(r_);
2155
-#endif
2156
-}
2157
-#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2158
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2159
-#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
2160
-#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2161
-#endif
2162
-
2163
-SIMDE_FUNCTION_ATTRIBUTES
2164
-simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
2165
-{
2166
-#if defined(SIMDE_X86_MMX_NATIVE)
2167
-   return _mm_unpacklo_pi16(a, b);
2168
-#else
2169
-   simde__m64_private r_;
2170
-   simde__m64_private a_ = simde__m64_to_private(a);
2171
-   simde__m64_private b_ = simde__m64_to_private(b);
2172
-
2173
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2174
-   r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
2175
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2176
-   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
2177
-#else
2178
-   r_.i16[0] = a_.i16[0];
2179
-   r_.i16[1] = b_.i16[0];
2180
-   r_.i16[2] = a_.i16[1];
2181
-   r_.i16[3] = b_.i16[1];
2182
-#endif
2183
-
2184
-   return simde__m64_from_private(r_);
2185
-#endif
2186
-}
2187
-#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2188
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2189
-#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
2190
-#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2191
-#endif
2192
-
2193
-SIMDE_FUNCTION_ATTRIBUTES
2194
-simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
2195
-{
2196
-#if defined(SIMDE_X86_MMX_NATIVE)
2197
-   return _mm_unpacklo_pi32(a, b);
2198
-#else
2199
-   simde__m64_private r_;
2200
-   simde__m64_private a_ = simde__m64_to_private(a);
2201
-   simde__m64_private b_ = simde__m64_to_private(b);
2202
-
2203
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2204
-   r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
2205
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2206
-   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
2207
-#else
2208
-   r_.i32[0] = a_.i32[0];
2209
-   r_.i32[1] = b_.i32[0];
2210
-#endif
2211
-
2212
-   return simde__m64_from_private(r_);
2213
-#endif
2214
-}
2215
-#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2216
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2217
-#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
2218
-#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2219
-#endif
2220
-
2221
-SIMDE_FUNCTION_ATTRIBUTES
2222
-simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
2223
-{
2224
-#if defined(SIMDE_X86_MMX_NATIVE)
2225
-   return _mm_xor_si64(a, b);
2226
-#else
2227
-   simde__m64_private r_;
2228
-   simde__m64_private a_ = simde__m64_to_private(a);
2229
-   simde__m64_private b_ = simde__m64_to_private(b);
2230
-
2231
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2232
-   r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
2233
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2234
-   r_.i32f = a_.i32f ^ b_.i32f;
2235
-#else
2236
-   r_.u64[0] = a_.u64[0] ^ b_.u64[0];
2237
-#endif
2238
-
2239
-   return simde__m64_from_private(r_);
2240
-#endif
2241
-}
2242
-#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
2243
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2244
-#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
2245
-#define _m_pxor(a, b) simde_mm_xor_si64(a, b)
2246
-#endif
2247
-
2248
-SIMDE_FUNCTION_ATTRIBUTES
2249
-int32_t simde_m_to_int(simde__m64 a)
2250
-{
2251
-#if defined(SIMDE_X86_MMX_NATIVE)
2252
-   return _m_to_int(a);
2253
-#else
2254
-   simde__m64_private a_ = simde__m64_to_private(a);
2255
-
2256
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2257
-   return vget_lane_s32(a_.neon_i32, 0);
2258
-#else
2259
-   return a_.i32[0];
2260
-#endif
2261
-#endif
2262
-}
2263
-#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2264
-#define _m_to_int(a) simde_m_to_int(a)
2265
-#endif
2266
-
2267
-SIMDE_END_DECLS_
2268
-
2269
-HEDLEY_DIAGNOSTIC_POP
2270
-
2271
-#endif /* !defined(SIMDE_X86_MMX_H) */
2272
obs-studio-26.1.0.tar.xz/libobs/util/simde/sse.h Deleted
3720
 
1
@@ -1,3718 +0,0 @@
2
-/* SPDX-License-Identifier: MIT
3
- *
4
- * Permission is hereby granted, free of charge, to any person
5
- * obtaining a copy of this software and associated documentation
6
- * files (the "Software"), to deal in the Software without
7
- * restriction, including without limitation the rights to use, copy,
8
- * modify, merge, publish, distribute, sublicense, and/or sell copies
9
- * of the Software, and to permit persons to whom the Software is
10
- * furnished to do so, subject to the following conditions:
11
- *
12
- * The above copyright notice and this permission notice shall be
13
- * included in all copies or substantial portions of the Software.
14
- *
15
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- * SOFTWARE.
23
- *
24
- * Copyright:
25
- *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
- *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
27
- *   2015      Brandon Rowlett <browlett@nvidia.com>
28
- *   2015      Ken Fast <kfast@gdeb.com>
29
- */
30
-
31
-#if !defined(SIMDE_X86_SSE_H)
32
-#define SIMDE_X86_SSE_H
33
-
34
-#include "mmx.h"
35
-
36
-#if !defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
37
-#define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES
38
-#endif
39
-
40
-#if defined(_WIN32)
41
-#include <windows.h>
42
-#endif
43
-
44
-HEDLEY_DIAGNOSTIC_PUSH
45
-SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
46
-SIMDE_BEGIN_DECLS_
47
-
48
-typedef union {
49
-#if defined(SIMDE_VECTOR_SUBSCRIPT)
50
-   SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51
-   SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
52
-   SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53
-   SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54
-   SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
55
-   SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56
-   SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
-   SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
58
-#if defined(SIMDE_HAVE_INT128_)
59
-   SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60
-   SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
61
-#endif
62
-   SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
63
-   SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
64
-   SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
65
-#else
66
-   SIMDE_ALIGN(16) int8_t i8[16];
67
-   SIMDE_ALIGN(16) int16_t i16[8];
68
-   SIMDE_ALIGN(16) int32_t i32[4];
69
-   SIMDE_ALIGN(16) int64_t i64[2];
70
-   SIMDE_ALIGN(16) uint8_t u8[16];
71
-   SIMDE_ALIGN(16) uint16_t u16[8];
72
-   SIMDE_ALIGN(16) uint32_t u32[4];
73
-   SIMDE_ALIGN(16) uint64_t u64[2];
74
-#if defined(SIMDE_HAVE_INT128_)
75
-   SIMDE_ALIGN(16) simde_int128 i128[1];
76
-   SIMDE_ALIGN(16) simde_uint128 u128[1];
77
-#endif
78
-   SIMDE_ALIGN(16) simde_float32 f32[4];
79
-   SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
80
-   SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
81
-#endif
82
-
83
-   SIMDE_ALIGN(16) simde__m64_private m64_private[2];
84
-   SIMDE_ALIGN(16) simde__m64 m64[2];
85
-
86
-#if defined(SIMDE_X86_SSE_NATIVE)
87
-   SIMDE_ALIGN(16) __m128 n;
88
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
89
-   SIMDE_ALIGN(16) int8x16_t neon_i8;
90
-   SIMDE_ALIGN(16) int16x8_t neon_i16;
91
-   SIMDE_ALIGN(16) int32x4_t neon_i32;
92
-   SIMDE_ALIGN(16) int64x2_t neon_i64;
93
-   SIMDE_ALIGN(16) uint8x16_t neon_u8;
94
-   SIMDE_ALIGN(16) uint16x8_t neon_u16;
95
-   SIMDE_ALIGN(16) uint32x4_t neon_u32;
96
-   SIMDE_ALIGN(16) uint64x2_t neon_u64;
97
-   SIMDE_ALIGN(16) float32x4_t neon_f32;
98
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
99
-   SIMDE_ALIGN(16) float64x2_t neon_f64;
100
-#endif
101
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
102
-   SIMDE_ALIGN(16) v128_t wasm_v128;
103
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
104
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
105
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
106
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
107
-   SIMDE_ALIGN(16)
108
-   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
109
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
110
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
111
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
112
-   SIMDE_ALIGN(16)
113
-   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
114
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
115
-#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
116
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
117
-#endif
118
-#endif
119
-} simde__m128_private;
120
-
121
-#if defined(SIMDE_X86_SSE_NATIVE)
122
-typedef __m128 simde__m128;
123
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
124
-typedef float32x4_t simde__m128;
125
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
126
-typedef v128_t simde__m128;
127
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
128
-typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128;
129
-#elif defined(SIMDE_VECTOR_SUBSCRIPT)
130
-typedef simde_float32 simde__m128 SIMDE_ALIGN(16)
131
-   SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
132
-#else
133
-typedef simde__m128_private simde__m128;
134
-#endif
135
-
136
-#if !defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
137
-#define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES
138
-typedef simde__m128 __m128;
139
-#endif
140
-
141
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
142
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private),
143
-            "simde__m128_private size incorrect");
144
-#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
145
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16,
146
-            "simde__m128 is not 16-byte aligned");
147
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16,
148
-            "simde__m128_private is not 16-byte aligned");
149
-#endif
150
-
151
-SIMDE_FUNCTION_ATTRIBUTES
152
-simde__m128 simde__m128_from_private(simde__m128_private v)
153
-{
154
-   simde__m128 r;
155
-   simde_memcpy(&r, &v, sizeof(r));
156
-   return r;
157
-}
158
-
159
-SIMDE_FUNCTION_ATTRIBUTES
160
-simde__m128_private simde__m128_to_private(simde__m128 v)
161
-{
162
-   simde__m128_private r;
163
-   simde_memcpy(&r, &v, sizeof(r));
164
-   return r;
165
-}
166
-
167
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
168
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8)
169
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16)
170
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32)
171
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64)
172
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8)
173
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16)
174
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32)
175
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64)
176
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32)
177
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
178
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64)
179
-#endif
180
-#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
181
-
182
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
183
-HEDLEY_DIAGNOSTIC_POP
184
-#endif
185
-
186
-SIMDE_FUNCTION_ATTRIBUTES
187
-simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
188
-               simde_float32 e1, simde_float32 e0)
189
-{
190
-#if defined(SIMDE_X86_SSE_NATIVE)
191
-   return _mm_set_ps(e3, e2, e1, e0);
192
-#else
193
-   simde__m128_private r_;
194
-
195
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
196
-   SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3};
197
-   r_.neon_f32 = vld1q_f32(data);
198
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
199
-   r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
200
-#else
201
-   r_.f32[0] = e0;
202
-   r_.f32[1] = e1;
203
-   r_.f32[2] = e2;
204
-   r_.f32[3] = e3;
205
-#endif
206
-
207
-   return simde__m128_from_private(r_);
208
-#endif
209
-}
210
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
211
-#define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
212
-#endif
213
-
214
-SIMDE_FUNCTION_ATTRIBUTES
215
-simde__m128 simde_mm_set_ps1(simde_float32 a)
216
-{
217
-#if defined(SIMDE_X86_SSE_NATIVE)
218
-   return _mm_set_ps1(a);
219
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
220
-   return vdupq_n_f32(a);
221
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
222
-   (void)a;
223
-   return vec_splats(a);
224
-#else
225
-   return simde_mm_set_ps(a, a, a, a);
226
-#endif
227
-}
228
-#define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
229
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
230
-#define _mm_set_ps1(a) simde_mm_set_ps1(a)
231
-#define _mm_set1_ps(a) simde_mm_set1_ps(a)
232
-#endif
233
-
234
-SIMDE_FUNCTION_ATTRIBUTES
235
-simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
236
-{
237
-#if defined(SIMDE_X86_SSE_NATIVE)
238
-   return _mm_move_ss(a, b);
239
-#else
240
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
241
-               b_ = simde__m128_to_private(b);
242
-
243
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
244
-   r_.neon_f32 =
245
-       vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
246
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
247
-   SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
248
-   m = {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
249
-   r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
250
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
251
-   r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2,
252
-                     3, 20, 21, 22, 23, 24, 25, 26, 27, 28,
253
-                     29, 30, 31);
254
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
255
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
256
-#else
257
-   r_.f32[0] = b_.f32[0];
258
-   r_.f32[1] = a_.f32[1];
259
-   r_.f32[2] = a_.f32[2];
260
-   r_.f32[3] = a_.f32[3];
261
-#endif
262
-
263
-   return simde__m128_from_private(r_);
264
-#endif
265
-}
266
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
267
-#define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
268
-#endif
269
-
270
-SIMDE_FUNCTION_ATTRIBUTES
271
-simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
272
-{
273
-#if defined(SIMDE_X86_SSE_NATIVE)
274
-   return _mm_add_ps(a, b);
275
-#else
276
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
277
-               b_ = simde__m128_to_private(b);
278
-
279
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
280
-   r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
281
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
282
-   r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
283
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
284
-   r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
285
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
286
-   r_.f32 = a_.f32 + b_.f32;
287
-#else
288
-   SIMDE_VECTORIZE
289
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
290
-       r_.f32[i] = a_.f32[i] + b_.f32[i];
291
-   }
292
-#endif
293
-
294
-   return simde__m128_from_private(r_);
295
-#endif
296
-}
297
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
298
-#define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
299
-#endif
300
-
301
-SIMDE_FUNCTION_ATTRIBUTES
302
-simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
303
-{
304
-#if defined(SIMDE_X86_SSE_NATIVE)
305
-   return _mm_add_ss(a, b);
306
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
307
-   return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
308
-#else
309
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
310
-               b_ = simde__m128_to_private(b);
311
-
312
-   r_.f32[0] = a_.f32[0] + b_.f32[0];
313
-   r_.f32[1] = a_.f32[1];
314
-   r_.f32[2] = a_.f32[2];
315
-   r_.f32[3] = a_.f32[3];
316
-
317
-   return simde__m128_from_private(r_);
318
-#endif
319
-}
320
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
321
-#define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
322
-#endif
323
-
324
-SIMDE_FUNCTION_ATTRIBUTES
325
-simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
326
-{
327
-#if defined(SIMDE_X86_SSE_NATIVE)
328
-   return _mm_and_ps(a, b);
329
-#else
330
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
331
-               b_ = simde__m128_to_private(b);
332
-
333
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
334
-   r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
335
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
336
-   r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
337
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
338
-   r_.i32 = a_.i32 & b_.i32;
339
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
340
-   r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
341
-#else
342
-   SIMDE_VECTORIZE
343
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
344
-       r_.i32[i] = a_.i32[i] & b_.i32[i];
345
-   }
346
-#endif
347
-
348
-   return simde__m128_from_private(r_);
349
-#endif
350
-}
351
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
352
-#define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
353
-#endif
354
-
355
-SIMDE_FUNCTION_ATTRIBUTES
356
-simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
357
-{
358
-#if defined(SIMDE_X86_SSE_NATIVE)
359
-   return _mm_andnot_ps(a, b);
360
-#else
361
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
362
-               b_ = simde__m128_to_private(b);
363
-
364
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
365
-   r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
366
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
367
-   r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
368
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
369
-   r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
370
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
371
-   r_.i32 = ~a_.i32 & b_.i32;
372
-#else
373
-   SIMDE_VECTORIZE
374
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
375
-       r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
376
-   }
377
-#endif
378
-
379
-   return simde__m128_from_private(r_);
380
-#endif
381
-}
382
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
383
-#define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
384
-#endif
385
-
386
-SIMDE_FUNCTION_ATTRIBUTES
387
-simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
388
-{
389
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
390
-   return _mm_avg_pu16(a, b);
391
-#else
392
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
393
-                  b_ = simde__m64_to_private(b);
394
-
395
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
396
-   r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
397
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
398
-   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
399
-   defined(SIMDE_CONVERT_VECTOR_)
400
-   uint32_t wa SIMDE_VECTOR(16);
401
-   uint32_t wb SIMDE_VECTOR(16);
402
-   uint32_t wr SIMDE_VECTOR(16);
403
-   SIMDE_CONVERT_VECTOR_(wa, a_.u16);
404
-   SIMDE_CONVERT_VECTOR_(wb, b_.u16);
405
-   wr = (wa + wb + 1) >> 1;
406
-   SIMDE_CONVERT_VECTOR_(r_.u16, wr);
407
-#else
408
-   SIMDE_VECTORIZE
409
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
410
-       r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
411
-   }
412
-#endif
413
-
414
-   return simde__m64_from_private(r_);
415
-#endif
416
-}
417
-#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
418
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
419
-#define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
420
-#define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
421
-#endif
422
-
423
-SIMDE_FUNCTION_ATTRIBUTES
424
-simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
425
-{
426
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
427
-   return _mm_avg_pu8(a, b);
428
-#else
429
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
430
-                  b_ = simde__m64_to_private(b);
431
-
432
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
433
-   r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
434
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
435
-   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
436
-   defined(SIMDE_CONVERT_VECTOR_)
437
-   uint16_t wa SIMDE_VECTOR(16);
438
-   uint16_t wb SIMDE_VECTOR(16);
439
-   uint16_t wr SIMDE_VECTOR(16);
440
-   SIMDE_CONVERT_VECTOR_(wa, a_.u8);
441
-   SIMDE_CONVERT_VECTOR_(wb, b_.u8);
442
-   wr = (wa + wb + 1) >> 1;
443
-   SIMDE_CONVERT_VECTOR_(r_.u8, wr);
444
-#else
445
-   SIMDE_VECTORIZE
446
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
447
-       r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
448
-   }
449
-#endif
450
-
451
-   return simde__m64_from_private(r_);
452
-#endif
453
-}
454
-#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
455
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
456
-#define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
457
-#define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
458
-#endif
459
-
460
-SIMDE_FUNCTION_ATTRIBUTES
461
-simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
462
-{
463
-#if defined(SIMDE_X86_SSE_NATIVE)
464
-   return _mm_cmpeq_ps(a, b);
465
-#else
466
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
467
-               b_ = simde__m128_to_private(b);
468
-
469
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
470
-   r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
471
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
472
-   r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
473
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
474
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpeq(
475
-       a_.altivec_f32, b_.altivec_f32);
476
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
477
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
478
-#else
479
-   SIMDE_VECTORIZE
480
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
481
-       r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0)
482
-                            : UINT32_C(0);
483
-   }
484
-#endif
485
-
486
-   return simde__m128_from_private(r_);
487
-#endif
488
-}
489
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
490
-#define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
491
-#endif
492
-
493
-SIMDE_FUNCTION_ATTRIBUTES
494
-simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
495
-{
496
-#if defined(SIMDE_X86_SSE_NATIVE)
497
-   return _mm_cmpeq_ss(a, b);
498
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
499
-   return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
500
-#else
501
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
502
-               b_ = simde__m128_to_private(b);
503
-
504
-   r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
505
-   SIMDE_VECTORIZE
506
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
507
-       r_.u32[i] = a_.u32[i];
508
-   }
509
-
510
-   return simde__m128_from_private(r_);
511
-#endif
512
-}
513
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
514
-#define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
515
-#endif
516
-
517
-SIMDE_FUNCTION_ATTRIBUTES
518
-simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
519
-{
520
-#if defined(SIMDE_X86_SSE_NATIVE)
521
-   return _mm_cmpge_ps(a, b);
522
-#else
523
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
524
-               b_ = simde__m128_to_private(b);
525
-
526
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
527
-   r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
528
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
529
-   r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
530
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
531
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpge(
532
-       a_.altivec_f32, b_.altivec_f32);
533
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
534
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
535
-#else
536
-   SIMDE_VECTORIZE
537
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
538
-       r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0)
539
-                            : UINT32_C(0);
540
-   }
541
-#endif
542
-
543
-   return simde__m128_from_private(r_);
544
-#endif
545
-}
546
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
547
-#define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
548
-#endif
549
-
550
-SIMDE_FUNCTION_ATTRIBUTES
551
-simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
552
-{
553
-#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
554
-   return _mm_cmpge_ss(a, b);
555
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
556
-   return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
557
-#else
558
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
559
-               b_ = simde__m128_to_private(b);
560
-
561
-   r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
562
-   SIMDE_VECTORIZE
563
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
564
-       r_.u32[i] = a_.u32[i];
565
-   }
566
-
567
-   return simde__m128_from_private(r_);
568
-#endif
569
-}
570
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
571
-#define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
572
-#endif
573
-
574
-SIMDE_FUNCTION_ATTRIBUTES
575
-simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
576
-{
577
-#if defined(SIMDE_X86_SSE_NATIVE)
578
-   return _mm_cmpgt_ps(a, b);
579
-#else
580
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
581
-               b_ = simde__m128_to_private(b);
582
-
583
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
584
-   r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
585
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
586
-   r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
587
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
588
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpgt(
589
-       a_.altivec_f32, b_.altivec_f32);
590
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
591
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
592
-#else
593
-   SIMDE_VECTORIZE
594
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
595
-       r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0)
596
-                           : UINT32_C(0);
597
-   }
598
-#endif
599
-
600
-   return simde__m128_from_private(r_);
601
-#endif
602
-}
603
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
604
-#define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
605
-#endif
606
-
607
-SIMDE_FUNCTION_ATTRIBUTES
608
-simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
609
-{
610
-#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
611
-   return _mm_cmpgt_ss(a, b);
612
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
613
-   return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
614
-#else
615
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
616
-               b_ = simde__m128_to_private(b);
617
-
618
-   r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
619
-   SIMDE_VECTORIZE
620
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
621
-       r_.u32[i] = a_.u32[i];
622
-   }
623
-
624
-   return simde__m128_from_private(r_);
625
-#endif
626
-}
627
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
628
-#define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
629
-#endif
630
-
631
-SIMDE_FUNCTION_ATTRIBUTES
632
-simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
633
-{
634
-#if defined(SIMDE_X86_SSE_NATIVE)
635
-   return _mm_cmple_ps(a, b);
636
-#else
637
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
638
-               b_ = simde__m128_to_private(b);
639
-
640
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
641
-   r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
642
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
643
-   r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
644
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
645
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmple(
646
-       a_.altivec_f32, b_.altivec_f32);
647
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
648
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
649
-#else
650
-   SIMDE_VECTORIZE
651
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
652
-       r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0)
653
-                            : UINT32_C(0);
654
-   }
655
-#endif
656
-
657
-   return simde__m128_from_private(r_);
658
-#endif
659
-}
660
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
661
-#define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
662
-#endif
663
-
664
-SIMDE_FUNCTION_ATTRIBUTES
665
-simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
666
-{
667
-#if defined(SIMDE_X86_SSE_NATIVE)
668
-   return _mm_cmple_ss(a, b);
669
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
670
-   return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
671
-#else
672
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
673
-               b_ = simde__m128_to_private(b);
674
-
675
-   r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
676
-   SIMDE_VECTORIZE
677
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
678
-       r_.u32[i] = a_.u32[i];
679
-   }
680
-
681
-   return simde__m128_from_private(r_);
682
-#endif
683
-}
684
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
685
-#define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
686
-#endif
687
-
688
-SIMDE_FUNCTION_ATTRIBUTES
689
-simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
690
-{
691
-#if defined(SIMDE_X86_SSE_NATIVE)
692
-   return _mm_cmplt_ps(a, b);
693
-#else
694
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
695
-               b_ = simde__m128_to_private(b);
696
-
697
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
698
-   r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
699
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
700
-   r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
701
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
702
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmplt(
703
-       a_.altivec_f32, b_.altivec_f32);
704
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
705
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
706
-#else
707
-   SIMDE_VECTORIZE
708
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
709
-       r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0)
710
-                           : UINT32_C(0);
711
-   }
712
-#endif
713
-
714
-   return simde__m128_from_private(r_);
715
-#endif
716
-}
717
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
718
-#define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
719
-#endif
720
-
721
-SIMDE_FUNCTION_ATTRIBUTES
722
-simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
723
-{
724
-#if defined(SIMDE_X86_SSE_NATIVE)
725
-   return _mm_cmplt_ss(a, b);
726
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
727
-   return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
728
-#else
729
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
730
-               b_ = simde__m128_to_private(b);
731
-
732
-   r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
733
-   SIMDE_VECTORIZE
734
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
735
-       r_.u32[i] = a_.u32[i];
736
-   }
737
-
738
-   return simde__m128_from_private(r_);
739
-#endif
740
-}
741
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
742
-#define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
743
-#endif
744
-
745
-SIMDE_FUNCTION_ATTRIBUTES
746
-simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
747
-{
748
-#if defined(SIMDE_X86_SSE_NATIVE)
749
-   return _mm_cmpneq_ps(a, b);
750
-#else
751
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
752
-               b_ = simde__m128_to_private(b);
753
-
754
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
755
-   r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
756
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
757
-   r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
758
-#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && \
759
-   !defined(HEDLEY_IBM_VERSION)
760
-   /* vec_cmpne(vector float, vector float) is missing from XL C/C++ v16.1.1,
761
-       though the documentation (table 89 on page 432 of the IBM XL C/C++ for
762
-       Linux Compiler Reference, Version 16.1.1) shows that it should be
763
-       present.  Both GCC and clang support it. */
764
-   r_.altivec_f32 = (SIMDE_POWER_ALTIVEC_VECTOR(float))vec_cmpne(
765
-       a_.altivec_f32, b_.altivec_f32);
766
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
767
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
768
-#else
769
-   SIMDE_VECTORIZE
770
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
771
-       r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0)
772
-                            : UINT32_C(0);
773
-   }
774
-#endif
775
-
776
-   return simde__m128_from_private(r_);
777
-#endif
778
-}
779
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
780
-#define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
781
-#endif
782
-
783
-SIMDE_FUNCTION_ATTRIBUTES
784
-simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
785
-{
786
-#if defined(SIMDE_X86_SSE_NATIVE)
787
-   return _mm_cmpneq_ss(a, b);
788
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
789
-   return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
790
-#else
791
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
792
-               b_ = simde__m128_to_private(b);
793
-
794
-   r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
795
-   SIMDE_VECTORIZE
796
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
797
-       r_.u32[i] = a_.u32[i];
798
-   }
799
-
800
-   return simde__m128_from_private(r_);
801
-#endif
802
-}
803
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
804
-#define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
805
-#endif
806
-
807
-SIMDE_FUNCTION_ATTRIBUTES
808
-simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
809
-{
810
-   return simde_mm_cmplt_ps(a, b);
811
-}
812
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
813
-#define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
814
-#endif
815
-
816
-SIMDE_FUNCTION_ATTRIBUTES
817
-simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
818
-{
819
-   return simde_mm_cmplt_ss(a, b);
820
-}
821
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
822
-#define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
823
-#endif
824
-
825
-SIMDE_FUNCTION_ATTRIBUTES
826
-simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
827
-{
828
-   return simde_mm_cmple_ps(a, b);
829
-}
830
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
831
-#define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
832
-#endif
833
-
834
-SIMDE_FUNCTION_ATTRIBUTES
835
-simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
836
-{
837
-   return simde_mm_cmple_ss(a, b);
838
-}
839
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
840
-#define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
841
-#endif
842
-
843
-SIMDE_FUNCTION_ATTRIBUTES
844
-simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
845
-{
846
-   return simde_mm_cmpgt_ps(a, b);
847
-}
848
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
849
-#define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
850
-#endif
851
-
852
-SIMDE_FUNCTION_ATTRIBUTES
853
-simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
854
-{
855
-   return simde_mm_cmpgt_ss(a, b);
856
-}
857
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
858
-#define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
859
-#endif
860
-
861
-SIMDE_FUNCTION_ATTRIBUTES
862
-simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
863
-{
864
-   return simde_mm_cmpge_ps(a, b);
865
-}
866
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
867
-#define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
868
-#endif
869
-
870
-SIMDE_FUNCTION_ATTRIBUTES
871
-simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
872
-{
873
-   return simde_mm_cmpge_ss(a, b);
874
-}
875
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
876
-#define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
877
-#endif
878
-
879
-SIMDE_FUNCTION_ATTRIBUTES
880
-simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
881
-{
882
-#if defined(SIMDE_X86_SSE_NATIVE)
883
-   return _mm_cmpord_ps(a, b);
884
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
885
-   return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b));
886
-#else
887
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
888
-               b_ = simde__m128_to_private(b);
889
-
890
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
891
-   /* Note: NEON does not have ordered compare builtin
892
-     Need to compare a eq a and b eq b to check for NaN
893
-     Do AND of results to get final */
894
-   uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
895
-   uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
896
-   r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
897
-#elif defined(simde_math_isnanf)
898
-   SIMDE_VECTORIZE
899
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
900
-       r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
901
-                simde_math_isnanf(b_.f32[i]))
902
-                   ? UINT32_C(0)
903
-                   : ~UINT32_C(0);
904
-   }
905
-#else
906
-   HEDLEY_UNREACHABLE();
907
-#endif
908
-
909
-   return simde__m128_from_private(r_);
910
-#endif
911
-}
912
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
913
-#define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
914
-#endif
915
-
916
-SIMDE_FUNCTION_ATTRIBUTES
917
-simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
918
-{
919
-#if defined(SIMDE_X86_SSE_NATIVE)
920
-   return _mm_cmpunord_ps(a, b);
921
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
922
-   return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b));
923
-#else
924
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
925
-               b_ = simde__m128_to_private(b);
926
-
927
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
928
-   uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
929
-   uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
930
-   r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb));
931
-#elif defined(simde_math_isnanf)
932
-   SIMDE_VECTORIZE
933
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
934
-       r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
935
-                simde_math_isnanf(b_.f32[i]))
936
-                   ? ~UINT32_C(0)
937
-                   : UINT32_C(0);
938
-   }
939
-#else
940
-   HEDLEY_UNREACHABLE();
941
-#endif
942
-
943
-   return simde__m128_from_private(r_);
944
-#endif
945
-}
946
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
947
-#define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
948
-#endif
949
-
950
-SIMDE_FUNCTION_ATTRIBUTES
951
-simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
952
-{
953
-#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
954
-   return _mm_cmpunord_ss(a, b);
955
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
956
-   return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
957
-#else
958
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
959
-               b_ = simde__m128_to_private(b);
960
-
961
-#if defined(simde_math_isnanf)
962
-   r_.u32[0] =
963
-       (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0]))
964
-           ? ~UINT32_C(0)
965
-           : UINT32_C(0);
966
-   SIMDE_VECTORIZE
967
-   for (size_t i = 1; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
968
-       r_.u32[i] = a_.u32[i];
969
-   }
970
-#else
971
-   HEDLEY_UNREACHABLE();
972
-#endif
973
-
974
-   return simde__m128_from_private(r_);
975
-#endif
976
-}
977
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
978
-#define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
979
-#endif
980
-
981
-SIMDE_FUNCTION_ATTRIBUTES
982
-int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
983
-{
984
-#if defined(SIMDE_X86_SSE_NATIVE)
985
-   return _mm_comieq_ss(a, b);
986
-#else
987
-   simde__m128_private a_ = simde__m128_to_private(a),
988
-               b_ = simde__m128_to_private(b);
989
-
990
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
991
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
992
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
993
-   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
994
-   uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
995
-   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
996
-#else
997
-   return a_.f32[0] == b_.f32[0];
998
-#endif
999
-#endif
1000
-}
1001
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1002
-#define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
1003
-#endif
1004
-
1005
-SIMDE_FUNCTION_ATTRIBUTES
1006
-int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
1007
-{
1008
-#if defined(SIMDE_X86_SSE_NATIVE)
1009
-   return _mm_comige_ss(a, b);
1010
-#else
1011
-   simde__m128_private a_ = simde__m128_to_private(a),
1012
-               b_ = simde__m128_to_private(b);
1013
-
1014
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1015
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1016
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1017
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1018
-   uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1019
-   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
1020
-#else
1021
-   return a_.f32[0] >= b_.f32[0];
1022
-#endif
1023
-#endif
1024
-}
1025
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1026
-#define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
1027
-#endif
1028
-
1029
-SIMDE_FUNCTION_ATTRIBUTES
1030
-int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
1031
-{
1032
-#if defined(SIMDE_X86_SSE_NATIVE)
1033
-   return _mm_comigt_ss(a, b);
1034
-#else
1035
-   simde__m128_private a_ = simde__m128_to_private(a),
1036
-               b_ = simde__m128_to_private(b);
1037
-
1038
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1039
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1040
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1041
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1042
-   uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1043
-   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
1044
-#else
1045
-   return a_.f32[0] > b_.f32[0];
1046
-#endif
1047
-#endif
1048
-}
1049
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1050
-#define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
1051
-#endif
1052
-
1053
-SIMDE_FUNCTION_ATTRIBUTES
1054
-int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
1055
-{
1056
-#if defined(SIMDE_X86_SSE_NATIVE)
1057
-   return _mm_comile_ss(a, b);
1058
-#else
1059
-   simde__m128_private a_ = simde__m128_to_private(a),
1060
-               b_ = simde__m128_to_private(b);
1061
-
1062
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1063
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1064
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1065
-   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1066
-   uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
1067
-   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
1068
-#else
1069
-   return a_.f32[0] <= b_.f32[0];
1070
-#endif
1071
-#endif
1072
-}
1073
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1074
-#define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
1075
-#endif
1076
-
1077
-SIMDE_FUNCTION_ATTRIBUTES
1078
-int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
1079
-{
1080
-#if defined(SIMDE_X86_SSE_NATIVE)
1081
-   return _mm_comilt_ss(a, b);
1082
-#else
1083
-   simde__m128_private a_ = simde__m128_to_private(a),
1084
-               b_ = simde__m128_to_private(b);
1085
-
1086
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1087
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1088
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1089
-   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1090
-   uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
1091
-   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
1092
-#else
1093
-   return a_.f32[0] < b_.f32[0];
1094
-#endif
1095
-#endif
1096
-}
1097
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1098
-#define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
1099
-#endif
1100
-
1101
-SIMDE_FUNCTION_ATTRIBUTES
1102
-int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
1103
-{
1104
-#if defined(SIMDE_X86_SSE_NATIVE)
1105
-   return _mm_comineq_ss(a, b);
1106
-#else
1107
-   simde__m128_private a_ = simde__m128_to_private(a),
1108
-               b_ = simde__m128_to_private(b);
1109
-
1110
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1111
-   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1112
-   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1113
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1114
-   uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1115
-   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
1116
-#else
1117
-   return a_.f32[0] != b_.f32[0];
1118
-#endif
1119
-#endif
1120
-}
1121
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1122
-#define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
1123
-#endif
1124
-
1125
-SIMDE_FUNCTION_ATTRIBUTES
1126
-simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
1127
-{
1128
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1129
-   return _mm_cvt_pi2ps(a, b);
1130
-#else
1131
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1132
-   simde__m64_private b_ = simde__m64_to_private(b);
1133
-
1134
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1135
-   r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1136
-                  vget_high_f32(a_.neon_f32));
1137
-#elif defined(SIMDE_CONVERT_VECTOR_)
1138
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1139
-   r_.m64_private[1] = a_.m64_private[1];
1140
-
1141
-#else
1142
-   r_.f32[0] = (simde_float32)b_.i32[0];
1143
-   r_.f32[1] = (simde_float32)b_.i32[1];
1144
-   r_.i32[2] = a_.i32[2];
1145
-   r_.i32[3] = a_.i32[3];
1146
-#endif
1147
-
1148
-   return simde__m128_from_private(r_);
1149
-#endif
1150
-}
1151
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1152
-#define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), b)
1153
-#endif
1154
-
1155
-SIMDE_FUNCTION_ATTRIBUTES
1156
-simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
1157
-{
1158
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1159
-   return _mm_cvt_ps2pi(a);
1160
-#else
1161
-   simde__m64_private r_;
1162
-   simde__m128_private a_ = simde__m128_to_private(a);
1163
-
1164
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1165
-   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1166
-#elif defined(SIMDE_CONVERT_VECTOR_) && !defined(__clang__)
1167
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
1168
-#else
1169
-   SIMDE_VECTORIZE
1170
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1171
-       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f32[i]);
1172
-   }
1173
-#endif
1174
-
1175
-   return simde__m64_from_private(r_);
1176
-#endif
1177
-}
1178
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1179
-#define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
1180
-#endif
1181
-
1182
-SIMDE_FUNCTION_ATTRIBUTES
1183
-simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
1184
-{
1185
-#if defined(SIMDE_X86_SSE_NATIVE)
1186
-   return _mm_cvt_si2ss(a, b);
1187
-#else
1188
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1189
-
1190
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1191
-   r_.neon_f32 = vsetq_lane_f32((float)b, a_.neon_f32, 0);
1192
-#else
1193
-   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
1194
-   r_.i32[1] = a_.i32[1];
1195
-   r_.i32[2] = a_.i32[2];
1196
-   r_.i32[3] = a_.i32[3];
1197
-#endif
1198
-
1199
-   return simde__m128_from_private(r_);
1200
-#endif
1201
-}
1202
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1203
-#define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
1204
-#endif
1205
-
1206
-SIMDE_FUNCTION_ATTRIBUTES
1207
-int32_t simde_mm_cvt_ss2si(simde__m128 a)
1208
-{
1209
-#if defined(SIMDE_X86_SSE_NATIVE)
1210
-   return _mm_cvt_ss2si(a);
1211
-#else
1212
-   simde__m128_private a_ = simde__m128_to_private(a);
1213
-
1214
-#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
1215
-   return vgetq_lane_s32(vcvtnq_s32_f32(a_.neon_f32), 0);
1216
-#elif defined(simde_math_nearbyintf)
1217
-   return SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[0]));
1218
-#else
1219
-   HEDLEY_UNREACHABLE();
1220
-#endif
1221
-#endif
1222
-}
1223
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1224
-#define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
1225
-#endif
1226
-
1227
-SIMDE_FUNCTION_ATTRIBUTES
1228
-simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
1229
-{
1230
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1231
-   return _mm_cvtpi16_ps(a);
1232
-#else
1233
-   simde__m128_private r_;
1234
-   simde__m64_private a_ = simde__m64_to_private(a);
1235
-
1236
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && 0 /* TODO */
1237
-   r_.neon_f32 = vmovl_s16(
1238
-       vget_low_s16(vuzp1q_s16(a_.neon_i16, vmovq_n_s16(0))));
1239
-#elif defined(SIMDE_CONVERT_VECTOR_)
1240
-   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16);
1241
-#else
1242
-   SIMDE_VECTORIZE
1243
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1244
-       simde_float32 v = a_.i16[i];
1245
-       r_.f32[i] = v;
1246
-   }
1247
-#endif
1248
-
1249
-   return simde__m128_from_private(r_);
1250
-#endif
1251
-}
1252
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1253
-#define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
1254
-#endif
1255
-
1256
-SIMDE_FUNCTION_ATTRIBUTES
1257
-simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
1258
-{
1259
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1260
-   return _mm_cvtpi32_ps(a, b);
1261
-#else
1262
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1263
-   simde__m64_private b_ = simde__m64_to_private(b);
1264
-
1265
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1266
-   r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1267
-                  vget_high_f32(a_.neon_f32));
1268
-#elif defined(SIMDE_CONVERT_VECTOR_)
1269
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1270
-   r_.m64_private[1] = a_.m64_private[1];
1271
-#else
1272
-   r_.f32[0] = (simde_float32)b_.i32[0];
1273
-   r_.f32[1] = (simde_float32)b_.i32[1];
1274
-   r_.i32[2] = a_.i32[2];
1275
-   r_.i32[3] = a_.i32[3];
1276
-#endif
1277
-
1278
-   return simde__m128_from_private(r_);
1279
-#endif
1280
-}
1281
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1282
-#define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
1283
-#endif
1284
-
1285
-SIMDE_FUNCTION_ATTRIBUTES
1286
-simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
1287
-{
1288
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1289
-   return _mm_cvtpi32x2_ps(a, b);
1290
-#else
1291
-   simde__m128_private r_;
1292
-   simde__m64_private a_ = simde__m64_to_private(a),
1293
-              b_ = simde__m64_to_private(b);
1294
-
1295
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1296
-   r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
1297
-#elif defined(SIMDE_CONVERT_VECTOR_)
1298
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32);
1299
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32);
1300
-#else
1301
-   r_.f32[0] = (simde_float32)a_.i32[0];
1302
-   r_.f32[1] = (simde_float32)a_.i32[1];
1303
-   r_.f32[2] = (simde_float32)b_.i32[0];
1304
-   r_.f32[3] = (simde_float32)b_.i32[1];
1305
-#endif
1306
-
1307
-   return simde__m128_from_private(r_);
1308
-#endif
1309
-}
1310
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1311
-#define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
1312
-#endif
1313
-
1314
-SIMDE_FUNCTION_ATTRIBUTES
1315
-simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
1316
-{
1317
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1318
-   return _mm_cvtpi8_ps(a);
1319
-#else
1320
-   simde__m128_private r_;
1321
-   simde__m64_private a_ = simde__m64_to_private(a);
1322
-
1323
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1324
-   r_.neon_f32 =
1325
-       vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
1326
-#else
1327
-   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]);
1328
-   r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]);
1329
-   r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]);
1330
-   r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]);
1331
-#endif
1332
-
1333
-   return simde__m128_from_private(r_);
1334
-#endif
1335
-}
1336
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1337
-#define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
1338
-#endif
1339
-
1340
-SIMDE_FUNCTION_ATTRIBUTES
1341
-simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
1342
-{
1343
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1344
-   return _mm_cvtps_pi16(a);
1345
-#else
1346
-   simde__m64_private r_;
1347
-   simde__m128_private a_ = simde__m128_to_private(a);
1348
-
1349
-#if defined(SIMDE_CONVERT_VECTOR_)
1350
-   SIMDE_CONVERT_VECTOR_(r_.i16, a_.f32);
1351
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1352
-   r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1353
-#else
1354
-   SIMDE_VECTORIZE
1355
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1356
-       r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, a_.f32[i]);
1357
-   }
1358
-#endif
1359
-
1360
-   return simde__m64_from_private(r_);
1361
-#endif
1362
-}
1363
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1364
-#define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
1365
-#endif
1366
-
1367
-SIMDE_FUNCTION_ATTRIBUTES
1368
-simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
1369
-{
1370
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1371
-   return _mm_cvtps_pi32(a);
1372
-#else
1373
-   simde__m64_private r_;
1374
-   simde__m128_private a_ = simde__m128_to_private(a);
1375
-
1376
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1377
-   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1378
-#elif defined(SIMDE_CONVERT_VECTOR_)
1379
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
1380
-#else
1381
-   SIMDE_VECTORIZE
1382
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1383
-       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1384
-   }
1385
-#endif
1386
-
1387
-   return simde__m64_from_private(r_);
1388
-#endif
1389
-}
1390
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1391
-#define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
1392
-#endif
1393
-
1394
-SIMDE_FUNCTION_ATTRIBUTES
1395
-simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
1396
-{
1397
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1398
-   return _mm_cvtps_pi8(a);
1399
-#else
1400
-   simde__m64_private r_;
1401
-   simde__m128_private a_ = simde__m128_to_private(a);
1402
-
1403
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1404
-   int16x4_t b = vmovn_s32(vcvtq_s32_f32(a_.neon_f32));
1405
-   int16x8_t c = vcombine_s16(b, vmov_n_s16(0));
1406
-   r_.neon_i8 = vmovn_s16(c);
1407
-#else
1408
-   SIMDE_VECTORIZE
1409
-   for (size_t i = 0; i < (sizeof(a_.f32) / sizeof(a_.f32[0])); i++) {
1410
-       r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, a_.f32[i]);
1411
-   }
1412
-   /* Note: the upper half is undefined */
1413
-#endif
1414
-
1415
-   return simde__m64_from_private(r_);
1416
-#endif
1417
-}
1418
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1419
-#define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
1420
-#endif
1421
-
1422
-SIMDE_FUNCTION_ATTRIBUTES
1423
-simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
1424
-{
1425
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1426
-   return _mm_cvtpu16_ps(a);
1427
-#else
1428
-   simde__m128_private r_;
1429
-   simde__m64_private a_ = simde__m64_to_private(a);
1430
-
1431
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1432
-   r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
1433
-#elif defined(SIMDE_CONVERT_VECTOR_)
1434
-   SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16);
1435
-#else
1436
-   SIMDE_VECTORIZE
1437
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1438
-       r_.f32[i] = (simde_float32)a_.u16[i];
1439
-   }
1440
-#endif
1441
-
1442
-   return simde__m128_from_private(r_);
1443
-#endif
1444
-}
1445
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1446
-#define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
1447
-#endif
1448
-
1449
-SIMDE_FUNCTION_ATTRIBUTES
1450
-simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
1451
-{
1452
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1453
-   return _mm_cvtpu8_ps(a);
1454
-#else
1455
-   simde__m128_private r_;
1456
-   simde__m64_private a_ = simde__m64_to_private(a);
1457
-
1458
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1459
-   r_.neon_f32 =
1460
-       vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
1461
-#else
1462
-   SIMDE_VECTORIZE
1463
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1464
-       r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]);
1465
-   }
1466
-#endif
1467
-
1468
-   return simde__m128_from_private(r_);
1469
-#endif
1470
-}
1471
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1472
-#define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
1473
-#endif
1474
-
1475
-SIMDE_FUNCTION_ATTRIBUTES
1476
-simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
1477
-{
1478
-#if defined(SIMDE_X86_SSE_NATIVE)
1479
-   return _mm_cvtsi32_ss(a, b);
1480
-#else
1481
-   simde__m128_private r_;
1482
-   simde__m128_private a_ = simde__m128_to_private(a);
1483
-
1484
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1485
-   r_.neon_f32 = vsetq_lane_f32((simde_float32)b, a_.neon_f32, 0);
1486
-#else
1487
-   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
1488
-   SIMDE_VECTORIZE
1489
-   for (size_t i = 1; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1490
-       r_.i32[i] = a_.i32[i];
1491
-   }
1492
-#endif
1493
-
1494
-   return simde__m128_from_private(r_);
1495
-#endif
1496
-}
1497
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1498
-#define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
1499
-#endif
1500
-
1501
-SIMDE_FUNCTION_ATTRIBUTES
1502
-simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
1503
-{
1504
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1505
-#if !defined(__PGI)
1506
-   return _mm_cvtsi64_ss(a, b);
1507
-#else
1508
-   return _mm_cvtsi64x_ss(a, b);
1509
-#endif
1510
-#else
1511
-   simde__m128_private r_;
1512
-   simde__m128_private a_ = simde__m128_to_private(a);
1513
-
1514
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1515
-   r_.neon_f32 = vsetq_lane_f32((simde_float32)b, a_.neon_f32, 0);
1516
-#else
1517
-   r_ = a_;
1518
-   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
1519
-#endif
1520
-
1521
-   return simde__m128_from_private(r_);
1522
-#endif
1523
-}
1524
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1525
-#define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
1526
-#endif
1527
-
1528
-SIMDE_FUNCTION_ATTRIBUTES
1529
-simde_float32 simde_mm_cvtss_f32(simde__m128 a)
1530
-{
1531
-#if defined(SIMDE_X86_SSE_NATIVE)
1532
-   return _mm_cvtss_f32(a);
1533
-#else
1534
-   simde__m128_private a_ = simde__m128_to_private(a);
1535
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1536
-   return vgetq_lane_f32(a_.neon_f32, 0);
1537
-#else
1538
-   return a_.f32[0];
1539
-#endif
1540
-#endif
1541
-}
1542
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1543
-#define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
1544
-#endif
1545
-
1546
-SIMDE_FUNCTION_ATTRIBUTES
1547
-int32_t simde_mm_cvtss_si32(simde__m128 a)
1548
-{
1549
-   return simde_mm_cvt_ss2si(a);
1550
-}
1551
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1552
-#define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
1553
-#endif
1554
-
1555
-SIMDE_FUNCTION_ATTRIBUTES
1556
-int64_t simde_mm_cvtss_si64(simde__m128 a)
1557
-{
1558
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1559
-#if !defined(__PGI)
1560
-   return _mm_cvtss_si64(a);
1561
-#else
1562
-   return _mm_cvtss_si64x(a);
1563
-#endif
1564
-#else
1565
-   simde__m128_private a_ = simde__m128_to_private(a);
1566
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1567
-   return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1568
-#else
1569
-   return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1570
-#endif
1571
-#endif
1572
-}
1573
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1574
-#define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
1575
-#endif
1576
-
1577
-SIMDE_FUNCTION_ATTRIBUTES
1578
-simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
1579
-{
1580
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1581
-   return _mm_cvtt_ps2pi(a);
1582
-#else
1583
-   simde__m64_private r_;
1584
-   simde__m128_private a_ = simde__m128_to_private(a);
1585
-
1586
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1587
-   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1588
-#elif defined(SIMDE_CONVERT_VECTOR_)
1589
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
1590
-#else
1591
-   SIMDE_VECTORIZE
1592
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1593
-       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
1594
-   }
1595
-#endif
1596
-
1597
-   return simde__m64_from_private(r_);
1598
-#endif
1599
-}
1600
-#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
1601
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1602
-#define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
1603
-#define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
1604
-#endif
1605
-
1606
-SIMDE_FUNCTION_ATTRIBUTES
1607
-int32_t simde_mm_cvtt_ss2si(simde__m128 a)
1608
-{
1609
-#if defined(SIMDE_X86_SSE_NATIVE)
1610
-   return _mm_cvtt_ss2si(a);
1611
-#else
1612
-   simde__m128_private a_ = simde__m128_to_private(a);
1613
-
1614
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1615
-   return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
1616
-#else
1617
-   return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
1618
-#endif
1619
-#endif
1620
-}
1621
-#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
1622
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1623
-#define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
1624
-#define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
1625
-#endif
1626
-
1627
-SIMDE_FUNCTION_ATTRIBUTES
1628
-int64_t simde_mm_cvttss_si64(simde__m128 a)
1629
-{
1630
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
1631
-   !defined(_MSC_VER)
1632
-#if defined(__PGI)
1633
-   return _mm_cvttss_si64x(a);
1634
-#else
1635
-   return _mm_cvttss_si64(a);
1636
-#endif
1637
-#else
1638
-   simde__m128_private a_ = simde__m128_to_private(a);
1639
-
1640
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1641
-   return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
1642
-#else
1643
-   return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
1644
-#endif
1645
-#endif
1646
-}
1647
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1648
-#define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
1649
-#endif
1650
-
1651
-SIMDE_FUNCTION_ATTRIBUTES
1652
-simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
1653
-{
1654
-#if defined(SIMDE_X86_SSE_NATIVE)
1655
-   return _mm_cmpord_ss(a, b);
1656
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1657
-   return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
1658
-#else
1659
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1660
-
1661
-#if defined(simde_math_isnanf)
1662
-   r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) ||
1663
-            simde_math_isnanf(simde_mm_cvtss_f32(b)))
1664
-               ? UINT32_C(0)
1665
-               : ~UINT32_C(0);
1666
-   SIMDE_VECTORIZE
1667
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1668
-       r_.u32[i] = a_.u32[i];
1669
-   }
1670
-#else
1671
-   HEDLEY_UNREACHABLE();
1672
-#endif
1673
-
1674
-   return simde__m128_from_private(r_);
1675
-#endif
1676
-}
1677
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1678
-#define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
1679
-#endif
1680
-
1681
-SIMDE_FUNCTION_ATTRIBUTES
1682
-simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
1683
-{
1684
-#if defined(SIMDE_X86_SSE_NATIVE)
1685
-   return _mm_div_ps(a, b);
1686
-#else
1687
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
1688
-               b_ = simde__m128_to_private(b);
1689
-
1690
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1691
-   r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
1692
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1693
-   float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
1694
-   float32x4_t recip1 =
1695
-       vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
1696
-   r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
1697
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1698
-   r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
1699
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1700
-   r_.f32 = a_.f32 / b_.f32;
1701
-#else
1702
-   SIMDE_VECTORIZE
1703
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1704
-       r_.f32[i] = a_.f32[i] / b_.f32[i];
1705
-   }
1706
-#endif
1707
-
1708
-   return simde__m128_from_private(r_);
1709
-#endif
1710
-}
1711
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1712
-#define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
1713
-#endif
1714
-
1715
-SIMDE_FUNCTION_ATTRIBUTES
1716
-simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
1717
-{
1718
-#if defined(SIMDE_X86_SSE_NATIVE)
1719
-   return _mm_div_ss(a, b);
1720
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1721
-   return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
1722
-#else
1723
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
1724
-               b_ = simde__m128_to_private(b);
1725
-
1726
-   r_.f32[0] = a_.f32[0] / b_.f32[0];
1727
-   SIMDE_VECTORIZE
1728
-   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1729
-       r_.f32[i] = a_.f32[i];
1730
-   }
1731
-
1732
-   return simde__m128_from_private(r_);
1733
-#endif
1734
-}
1735
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1736
-#define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
1737
-#endif
1738
-
1739
-SIMDE_FUNCTION_ATTRIBUTES
1740
-int16_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
1741
-   SIMDE_REQUIRE_RANGE(imm8, 0, 3)
1742
-{
1743
-   simde__m64_private a_ = simde__m64_to_private(a);
1744
-   return a_.i16[imm8];
1745
-}
1746
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
1747
-   !defined(HEDLEY_PGI_VERSION)
1748
-#if HEDLEY_HAS_WARNING("-Wvector-conversion")
1749
-/* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1750
-#define simde_mm_extract_pi16(a, imm8)                                      \
1751
-   (HEDLEY_DIAGNOSTIC_PUSH _Pragma(                                    \
1752
-       "clang diagnostic ignored \"-Wvector-conversion\"")         \
1753
-        HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
1754
-            HEDLEY_DIAGNOSTIC_POP)
1755
-#else
1756
-#define simde_mm_extract_pi16(a, imm8) \
1757
-   HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8))
1758
-#endif
1759
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1760
-#define simde_mm_extract_pi16(a, imm8) \
1761
-   vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)
1762
-#endif
1763
-#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
1764
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1765
-#define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8))
1766
-#endif
1767
-
1768
-enum {
1769
-#if defined(SIMDE_X86_SSE_NATIVE)
1770
-   SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
1771
-   SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN,
1772
-   SIMDE_MM_ROUND_UP = _MM_ROUND_UP,
1773
-   SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
1774
-#else
1775
-   SIMDE_MM_ROUND_NEAREST
1776
-#if defined(FE_TONEAREST)
1777
-   = FE_TONEAREST
1778
-#endif
1779
-   ,
1780
-
1781
-   SIMDE_MM_ROUND_DOWN
1782
-#if defined(FE_DOWNWARD)
1783
-   = FE_DOWNWARD
1784
-#endif
1785
-   ,
1786
-
1787
-   SIMDE_MM_ROUND_UP
1788
-#if defined(FE_UPWARD)
1789
-   = FE_UPWARD
1790
-#endif
1791
-   ,
1792
-
1793
-   SIMDE_MM_ROUND_TOWARD_ZERO
1794
-#if defined(FE_TOWARDZERO)
1795
-   = FE_TOWARDZERO
1796
-#endif
1797
-#endif
1798
-};
1799
-
1800
-SIMDE_FUNCTION_ATTRIBUTES
1801
-unsigned int SIMDE_MM_GET_ROUNDING_MODE(void)
1802
-{
1803
-#if defined(SIMDE_X86_SSE_NATIVE)
1804
-   return _MM_GET_ROUNDING_MODE();
1805
-#elif defined(SIMDE_HAVE_FENV_H)
1806
-   return HEDLEY_STATIC_CAST(unsigned int, fegetround());
1807
-#else
1808
-   HEDLEY_UNREACHABLE();
1809
-#endif
1810
-}
1811
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1812
-#define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE()
1813
-#endif
1814
-
1815
-SIMDE_FUNCTION_ATTRIBUTES
1816
-void SIMDE_MM_SET_ROUNDING_MODE(unsigned int a)
1817
-{
1818
-#if defined(SIMDE_X86_SSE_NATIVE)
1819
-   _MM_SET_ROUNDING_MODE(a);
1820
-#elif defined(SIMDE_HAVE_FENV_H)
1821
-   fesetround(HEDLEY_STATIC_CAST(int, a));
1822
-#endif
1823
-}
1824
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1825
-#define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
1826
-#endif
1827
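Taken together, the rounding-mode enum and the SIMDE_MM_GET/SET_ROUNDING_MODE wrappers above simply forward to <fenv.h> when SSE is not available, so a caller can switch rounding modes with the usual save/restore pattern. A minimal usage sketch (illustrative only, not taken from the header):

    unsigned int saved = SIMDE_MM_GET_ROUNDING_MODE();
    SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_TOWARD_ZERO);
    /* ... conversions that should truncate ... */
    SIMDE_MM_SET_ROUNDING_MODE(saved);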
-
1828
-SIMDE_FUNCTION_ATTRIBUTES
1829
-simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
1830
-   SIMDE_REQUIRE_RANGE(imm8, 0, 3)
1831
-{
1832
-   simde__m64_private r_, a_ = simde__m64_to_private(a);
1833
-
1834
-   r_.i64[0] = a_.i64[0];
1835
-   r_.i16[imm8] = i;
1836
-
1837
-   return simde__m64_from_private(r_);
1838
-}
1839
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
1840
-   !defined(__PGI)
1841
-#if HEDLEY_HAS_WARNING("-Wvector-conversion")
1842
-/* https://bugs.llvm.org/show_bug.cgi?id=44589 */
1843
-#define simde_mm_insert_pi16(a, i, imm8)                             \
1844
-   (HEDLEY_DIAGNOSTIC_PUSH _Pragma(                             \
1845
-       "clang diagnostic ignored \"-Wvector-conversion\"")( \
1846
-       _mm_insert_pi16((a), (i), (imm8))) HEDLEY_DIAGNOSTIC_POP)
1847
-#else
1848
-#define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
1849
-#endif
1850
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1851
-#define simde_mm_insert_pi16(a, i, imm8) \
1852
-   simde__m64_from_neon_i16(        \
1853
-       vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8)))
1854
-#endif
1855
-#define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
1856
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1857
-#define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1858
-#define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
1859
-#endif
1860
-
1861
-SIMDE_FUNCTION_ATTRIBUTES
1862
-simde__m128
1863
-simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
1864
-{
1865
-   simde_assert_aligned(16, mem_addr);
1866
-
1867
-#if defined(SIMDE_X86_SSE_NATIVE)
1868
-   return _mm_load_ps(mem_addr);
1869
-#else
1870
-   simde__m128_private r_;
1871
-
1872
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1873
-   r_.neon_f32 = vld1q_f32(mem_addr);
1874
-#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1875
-   r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
1876
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1877
-   r_.altivec_f32 = vec_ld(0, mem_addr);
1878
-#else
1879
-   r_ = *SIMDE_ALIGN_CAST(simde__m128_private const *, mem_addr);
1880
-#endif
1881
-
1882
-   return simde__m128_from_private(r_);
1883
-#endif
1884
-}
1885
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1886
-#define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
1887
-#endif
1888
-
1889
-SIMDE_FUNCTION_ATTRIBUTES
1890
-simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr)
1891
-{
1892
-#if defined(SIMDE_X86_SSE_NATIVE)
1893
-   return _mm_load_ps1(mem_addr);
1894
-#else
1895
-   simde__m128_private r_;
1896
-
1897
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1898
-   r_.neon_f32 = vld1q_dup_f32(mem_addr);
1899
-#else
1900
-   r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
1901
-#endif
1902
-
1903
-   return simde__m128_from_private(r_);
1904
-#endif
1905
-}
1906
-#define simde_mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1907
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1908
-#define _mm_load_ps1(mem_addr) simde_mm_load_ps1(mem_addr)
1909
-#define _mm_load1_ps(mem_addr) simde_mm_load_ps1(mem_addr)
1910
-#endif
1911
-
1912
-SIMDE_FUNCTION_ATTRIBUTES
1913
-simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
1914
-{
1915
-#if defined(SIMDE_X86_SSE_NATIVE)
1916
-   return _mm_load_ss(mem_addr);
1917
-#else
1918
-   simde__m128_private r_;
1919
-
1920
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1921
-   r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1922
-#else
1923
-   r_.f32[0] = *mem_addr;
1924
-   r_.i32[1] = 0;
1925
-   r_.i32[2] = 0;
1926
-   r_.i32[3] = 0;
1927
-#endif
1928
-
1929
-   return simde__m128_from_private(r_);
1930
-#endif
1931
-}
1932
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1933
-#define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
1934
-#endif
1935
-
1936
-SIMDE_FUNCTION_ATTRIBUTES
1937
-simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
1938
-{
1939
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1940
-   return _mm_loadh_pi(a,
1941
-               HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
1942
-#else
1943
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1944
-
1945
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1946
-   r_.neon_f32 = vcombine_f32(
1947
-       vget_low_f32(a_.neon_f32),
1948
-       vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)));
1949
-#else
1950
-   simde__m64_private b_ =
1951
-       *HEDLEY_REINTERPRET_CAST(simde__m64_private const *, mem_addr);
1952
-   r_.f32[0] = a_.f32[0];
1953
-   r_.f32[1] = a_.f32[1];
1954
-   r_.f32[2] = b_.f32[0];
1955
-   r_.f32[3] = b_.f32[1];
1956
-#endif
1957
-
1958
-   return simde__m128_from_private(r_);
1959
-#endif
1960
-}
1961
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1962
-#define _mm_loadh_pi(a, mem_addr) \
1963
-   simde_mm_loadh_pi((a), (simde__m64 const *)(mem_addr))
1964
-#endif
1965
-
1966
-/* The SSE documentation says that there are no alignment requirements
1967
-   for mem_addr.  Unfortunately they used the __m64 type for the argument
1968
-   which is supposed to be 8-byte aligned, so some compilers (like clang
1969
-   with -Wcast-align) will generate a warning if you try to cast, say,
1970
-   a simde_float32* to a simde__m64* for this function.
1971
-
1972
-   I think the choice of argument type is unfortunate, but I do think we
1973
-   need to stick to it here.  If there is demand I can always add something
1974
-   like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
1975
-SIMDE_FUNCTION_ATTRIBUTES
1976
-simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
1977
-{
1978
-#if defined(SIMDE_X86_SSE_NATIVE)
1979
-   return _mm_loadl_pi(a,
1980
-               HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
1981
-#else
1982
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
1983
-
1984
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1985
-   r_.neon_f32 = vcombine_f32(
1986
-       vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)),
1987
-       vget_high_f32(a_.neon_f32));
1988
-#else
1989
-   simde__m64_private b_;
1990
-   simde_memcpy(&b_, mem_addr, sizeof(b_));
1991
-   r_.i32[0] = b_.i32[0];
1992
-   r_.i32[1] = b_.i32[1];
1993
-   r_.i32[2] = a_.i32[2];
1994
-   r_.i32[3] = a_.i32[3];
1995
-#endif
1996
-
1997
-   return simde__m128_from_private(r_);
1998
-#endif
1999
-}
2000
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2001
-#define _mm_loadl_pi(a, mem_addr) \
2002
-   simde_mm_loadl_pi((a), (simde__m64 const *)(mem_addr))
2003
-#endif
2004
-
2005
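The comment above the simde_mm_loadl_pi definition keeps the awkward simde__m64 const * parameter and only mentions a possible simde_x_mm_loadl_f32 helper. As a rough sketch of what such a wrapper could look like (the name is the hypothetical one from that comment and is not part of SIMDe), going through simde_memcpy, as the portable fallback above already does, avoids the -Wcast-align warning for an arbitrary simde_float32 pointer:

static inline simde__m128
simde_x_mm_loadl_f32(simde__m128 a, simde_float32 const mem_addr[2])
{
    /* Copy the two floats into a properly aligned simde__m64 before
       handing them to simde_mm_loadl_pi; memcpy places no alignment
       requirement on its source. */
    simde__m64 tmp;
    simde_memcpy(&tmp, mem_addr, sizeof(tmp));
    return simde_mm_loadl_pi(a, &tmp);
}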
-SIMDE_FUNCTION_ATTRIBUTES
2006
-simde__m128
2007
-simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2008
-{
2009
-   simde_assert_aligned(16, mem_addr);
2010
-
2011
-#if defined(SIMDE_X86_SSE_NATIVE)
2012
-   return _mm_loadr_ps(mem_addr);
2013
-#else
2014
-   simde__m128_private r_,
2015
-       v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
2016
-
2017
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2018
-   r_.neon_f32 = vrev64q_f32(v_.neon_f32);
2019
-   r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
2020
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2021
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
2022
-#else
2023
-   r_.f32[0] = v_.f32[3];
2024
-   r_.f32[1] = v_.f32[2];
2025
-   r_.f32[2] = v_.f32[1];
2026
-   r_.f32[3] = v_.f32[0];
2027
-#endif
2028
-
2029
-   return simde__m128_from_private(r_);
2030
-#endif
2031
-}
2032
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2033
-#define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
2034
-#endif
2035
-
2036
-SIMDE_FUNCTION_ATTRIBUTES
2037
-simde__m128
2038
-simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2039
-{
2040
-#if defined(SIMDE_X86_SSE_NATIVE)
2041
-   return _mm_loadu_ps(mem_addr);
2042
-#else
2043
-   simde__m128_private r_;
2044
-
2045
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2046
-   r_.neon_f32 =
2047
-       vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr));
2048
-#else
2049
-   r_.f32[0] = mem_addr[0];
2050
-   r_.f32[1] = mem_addr[1];
2051
-   r_.f32[2] = mem_addr[2];
2052
-   r_.f32[3] = mem_addr[3];
2053
-#endif
2054
-
2055
-   return simde__m128_from_private(r_);
2056
-#endif
2057
-}
2058
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2059
-#define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
2060
-#endif
2061
-
2062
-SIMDE_FUNCTION_ATTRIBUTES
2063
-void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, int8_t *mem_addr)
2064
-{
2065
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2066
-   _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
2067
-#else
2068
-   simde__m64_private a_ = simde__m64_to_private(a),
2069
-              mask_ = simde__m64_to_private(mask);
2070
-
2071
-   SIMDE_VECTORIZE
2072
-   for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++)
2073
-       if (mask_.i8[i] < 0)
2074
-           mem_addr[i] = a_.i8[i];
2075
-#endif
2076
-}
2077
-#define simde_m_maskmovq(a, mask, mem_addr) \
2078
-   simde_mm_maskmove_si64(a, mask, mem_addr)
2079
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2080
-#define _mm_maskmove_si64(a, mask, mem_addr) \
2081
-   simde_mm_maskmove_si64(              \
2082
-       (a), (mask),                 \
2083
-       SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
2084
-#endif
2085
-
2086
-SIMDE_FUNCTION_ATTRIBUTES
2087
-simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
2088
-{
2089
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2090
-   return _mm_max_pi16(a, b);
2091
-#else
2092
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2093
-                  b_ = simde__m64_to_private(b);
2094
-
2095
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2096
-   r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
2097
-#else
2098
-   SIMDE_VECTORIZE
2099
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2100
-       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2101
-   }
2102
-#endif
2103
-
2104
-   return simde__m64_from_private(r_);
2105
-#endif
2106
-}
2107
-#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2108
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2109
-#define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
2110
-#define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2111
-#endif
2112
-
2113
-SIMDE_FUNCTION_ATTRIBUTES
2114
-simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
2115
-{
2116
-#if defined(SIMDE_X86_SSE_NATIVE)
2117
-   return _mm_max_ps(a, b);
2118
-#else
2119
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2120
-               b_ = simde__m128_to_private(b);
2121
-
2122
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2123
-   r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
2124
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2125
-   r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
2126
-#else
2127
-   SIMDE_VECTORIZE
2128
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2129
-       r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2130
-   }
2131
-#endif
2132
-
2133
-   return simde__m128_from_private(r_);
2134
-#endif
2135
-}
2136
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2137
-#define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
2138
-#endif
2139
-
2140
-SIMDE_FUNCTION_ATTRIBUTES
2141
-simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
2142
-{
2143
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2144
-   return _mm_max_pu8(a, b);
2145
-#else
2146
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2147
-                  b_ = simde__m64_to_private(b);
2148
-
2149
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2150
-   r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
2151
-#else
2152
-   SIMDE_VECTORIZE
2153
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2154
-       r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2155
-   }
2156
-#endif
2157
-
2158
-   return simde__m64_from_private(r_);
2159
-#endif
2160
-}
2161
-#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2162
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2163
-#define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
2164
-#define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2165
-#endif
2166
-
2167
-SIMDE_FUNCTION_ATTRIBUTES
2168
-simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
2169
-{
2170
-#if defined(SIMDE_X86_SSE_NATIVE)
2171
-   return _mm_max_ss(a, b);
2172
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2173
-   return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
2174
-#else
2175
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2176
-               b_ = simde__m128_to_private(b);
2177
-
2178
-   r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2179
-   r_.f32[1] = a_.f32[1];
2180
-   r_.f32[2] = a_.f32[2];
2181
-   r_.f32[3] = a_.f32[3];
2182
-
2183
-   return simde__m128_from_private(r_);
2184
-#endif
2185
-}
2186
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2187
-#define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
2188
-#endif
2189
-
2190
-SIMDE_FUNCTION_ATTRIBUTES
2191
-simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
2192
-{
2193
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2194
-   return _mm_min_pi16(a, b);
2195
-#else
2196
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2197
-                  b_ = simde__m64_to_private(b);
2198
-
2199
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2200
-   r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
2201
-#else
2202
-   SIMDE_VECTORIZE
2203
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2204
-       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2205
-   }
2206
-#endif
2207
-
2208
-   return simde__m64_from_private(r_);
2209
-#endif
2210
-}
2211
-#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
2212
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2213
-#define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
2214
-#define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
2215
-#endif
2216
-
2217
-SIMDE_FUNCTION_ATTRIBUTES
2218
-simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
2219
-{
2220
-#if defined(SIMDE_X86_SSE_NATIVE)
2221
-   return _mm_min_ps(a, b);
2222
-#else
2223
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2224
-               b_ = simde__m128_to_private(b);
2225
-
2226
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2227
-   r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32);
2228
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2229
-   r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
2230
-#else
2231
-   SIMDE_VECTORIZE
2232
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2233
-       r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2234
-   }
2235
-#endif
2236
-
2237
-   return simde__m128_from_private(r_);
2238
-#endif
2239
-}
2240
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2241
-#define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
2242
-#endif
2243
-
2244
-SIMDE_FUNCTION_ATTRIBUTES
2245
-simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
2246
-{
2247
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2248
-   return _mm_min_pu8(a, b);
2249
-#else
2250
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2251
-                  b_ = simde__m64_to_private(b);
2252
-
2253
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2254
-   r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
2255
-#else
2256
-   SIMDE_VECTORIZE
2257
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2258
-       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2259
-   }
2260
-#endif
2261
-
2262
-   return simde__m64_from_private(r_);
2263
-#endif
2264
-}
2265
-#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
2266
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2267
-#define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
2268
-#define _m_pminub(a, b) simde_mm_min_pu8(a, b)
2269
-#endif
2270
-
2271
-SIMDE_FUNCTION_ATTRIBUTES
2272
-simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
2273
-{
2274
-#if defined(SIMDE_X86_SSE_NATIVE)
2275
-   return _mm_min_ss(a, b);
2276
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2277
-   return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
2278
-#else
2279
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2280
-               b_ = simde__m128_to_private(b);
2281
-
2282
-   r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2283
-   r_.f32[1] = a_.f32[1];
2284
-   r_.f32[2] = a_.f32[2];
2285
-   r_.f32[3] = a_.f32[3];
2286
-
2287
-   return simde__m128_from_private(r_);
2288
-#endif
2289
-}
2290
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2291
-#define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
2292
-#endif
2293
-
2294
-SIMDE_FUNCTION_ATTRIBUTES
2295
-simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
2296
-{
2297
-#if defined(SIMDE_X86_SSE_NATIVE)
2298
-   return _mm_movehl_ps(a, b);
2299
-#else
2300
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2301
-               b_ = simde__m128_to_private(b);
2302
-
2303
-#if defined(SIMDE_SHUFFLE_VECTOR_)
2304
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
2305
-#else
2306
-   r_.f32[0] = b_.f32[2];
2307
-   r_.f32[1] = b_.f32[3];
2308
-   r_.f32[2] = a_.f32[2];
2309
-   r_.f32[3] = a_.f32[3];
2310
-#endif
2311
-
2312
-   return simde__m128_from_private(r_);
2313
-#endif
2314
-}
2315
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2316
-#define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
2317
-#endif
2318
-
2319
-SIMDE_FUNCTION_ATTRIBUTES
2320
-simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
2321
-{
2322
-#if defined(SIMDE_X86_SSE_NATIVE)
2323
-   return _mm_movelh_ps(a, b);
2324
-#else
2325
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2326
-               b_ = simde__m128_to_private(b);
2327
-
2328
-#if defined(SIMDE_SHUFFLE_VECTOR_)
2329
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
2330
-#else
2331
-   r_.f32[0] = a_.f32[0];
2332
-   r_.f32[1] = a_.f32[1];
2333
-   r_.f32[2] = b_.f32[0];
2334
-   r_.f32[3] = b_.f32[1];
2335
-#endif
2336
-
2337
-   return simde__m128_from_private(r_);
2338
-#endif
2339
-}
2340
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2341
-#define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
2342
-#endif
2343
-
2344
-SIMDE_FUNCTION_ATTRIBUTES
2345
-int simde_mm_movemask_pi8(simde__m64 a)
2346
-{
2347
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2348
-   return _mm_movemask_pi8(a);
2349
-#else
2350
-   simde__m64_private a_ = simde__m64_to_private(a);
2351
-   int r = 0;
2352
-   const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
2353
-
2354
-   SIMDE_VECTORIZE_REDUCTION(| : r)
2355
-   for (size_t i = 0; i < nmemb; i++) {
2356
-       r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
2357
-   }
2358
-
2359
-   return r;
2360
-#endif
2361
-}
2362
-#define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
2363
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2364
-#define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
2365
-#endif
2366
-
2367
-SIMDE_FUNCTION_ATTRIBUTES
2368
-int simde_mm_movemask_ps(simde__m128 a)
2369
-{
2370
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2371
-   return _mm_movemask_ps(a);
2372
-#else
2373
-   int r = 0;
2374
-   simde__m128_private a_ = simde__m128_to_private(a);
2375
-
2376
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2377
-   /* TODO: check to see if NEON version is faster than the portable version */
2378
-   static const uint32x4_t movemask = {1, 2, 4, 8};
2379
-   static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
2380
-                      0x80000000};
2381
-   uint32x4_t t0 = a_.neon_u32;
2382
-   uint32x4_t t1 = vtstq_u32(t0, highbit);
2383
-   uint32x4_t t2 = vandq_u32(t1, movemask);
2384
-   uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
2385
-   r = vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
2386
-#else
2387
-   SIMDE_VECTORIZE_REDUCTION(| : r)
2388
-   for (size_t i = 0; i < sizeof(a_.u32) / sizeof(a_.u32[0]); i++) {
2389
-       r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
2390
-   }
2391
-#endif
2392
-
2393
-   return r;
2394
-#endif
2395
-}
2396
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2397
-#define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
2398
-#endif
2399
-
2400
-SIMDE_FUNCTION_ATTRIBUTES
2401
-simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
2402
-{
2403
-#if defined(SIMDE_X86_SSE_NATIVE)
2404
-   return _mm_mul_ps(a, b);
2405
-#else
2406
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2407
-               b_ = simde__m128_to_private(b);
2408
-
2409
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2410
-   r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
2411
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2412
-   r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
2413
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2414
-   r_.f32 = a_.f32 * b_.f32;
2415
-#else
2416
-   SIMDE_VECTORIZE
2417
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2418
-       r_.f32[i] = a_.f32[i] * b_.f32[i];
2419
-   }
2420
-#endif
2421
-
2422
-   return simde__m128_from_private(r_);
2423
-#endif
2424
-}
2425
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2426
-#define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
2427
-#endif
2428
-
2429
-SIMDE_FUNCTION_ATTRIBUTES
2430
-simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
2431
-{
2432
-#if defined(SIMDE_X86_SSE_NATIVE)
2433
-   return _mm_mul_ss(a, b);
2434
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2435
-   return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
2436
-#else
2437
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2438
-               b_ = simde__m128_to_private(b);
2439
-
2440
-   r_.f32[0] = a_.f32[0] * b_.f32[0];
2441
-   r_.f32[1] = a_.f32[1];
2442
-   r_.f32[2] = a_.f32[2];
2443
-   r_.f32[3] = a_.f32[3];
2444
-
2445
-   return simde__m128_from_private(r_);
2446
-#endif
2447
-}
2448
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2449
-#define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
2450
-#endif
2451
-
2452
-SIMDE_FUNCTION_ATTRIBUTES
2453
-simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
2454
-{
2455
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2456
-   return _mm_mulhi_pu16(a, b);
2457
-#else
2458
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2459
-                  b_ = simde__m64_to_private(b);
2460
-
2461
-   SIMDE_VECTORIZE
2462
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
2463
-       r_.u16[i] = HEDLEY_STATIC_CAST(
2464
-           uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
2465
-                   HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >>
2466
-                  UINT32_C(16)));
2467
-   }
2468
-
2469
-   return simde__m64_from_private(r_);
2470
-#endif
2471
-}
2472
-#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
2473
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2474
-#define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
2475
-#endif
2476
-
2477
-SIMDE_FUNCTION_ATTRIBUTES
2478
-simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
2479
-{
2480
-#if defined(SIMDE_X86_SSE_NATIVE)
2481
-   return _mm_or_ps(a, b);
2482
-#else
2483
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2484
-               b_ = simde__m128_to_private(b);
2485
-
2486
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2487
-   r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
2488
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2489
-   r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
2490
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2491
-   r_.i32f = a_.i32f | b_.i32f;
2492
-#else
2493
-   SIMDE_VECTORIZE
2494
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
2495
-       r_.u32[i] = a_.u32[i] | b_.u32[i];
2496
-   }
2497
-#endif
2498
-
2499
-   return simde__m128_from_private(r_);
2500
-#endif
2501
-}
2502
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2503
-#define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
2504
-#endif
2505
-
2506
-SIMDE_FUNCTION_ATTRIBUTES
2507
-void simde_mm_prefetch(char const *p, int i)
2508
-{
2509
-   (void)p;
2510
-   (void)i;
2511
-}
2512
-#if defined(SIMDE_X86_SSE_NATIVE)
2513
-#define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
2514
-#endif
2515
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2516
-#define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
2517
-#endif
2518
-
2519
-SIMDE_FUNCTION_ATTRIBUTES
2520
-simde__m128 simde_mm_rcp_ps(simde__m128 a)
2521
-{
2522
-#if defined(SIMDE_X86_SSE_NATIVE)
2523
-   return _mm_rcp_ps(a);
2524
-#else
2525
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2526
-
2527
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2528
-   float32x4_t recip = vrecpeq_f32(a_.neon_f32);
2529
-
2530
-#if SIMDE_ACCURACY_PREFERENCE > 0
2531
-   for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE; ++i) {
2532
-       recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
2533
-   }
2534
-#endif
2535
-
2536
-   r_.neon_f32 = recip;
2537
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2538
-   r_.altivec_f32 = vec_re(a_.altivec_f32);
2539
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
2540
-   r_.f32 = 1.0f / a_.f32;
2541
-#else
2542
-   SIMDE_VECTORIZE
2543
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2544
-       r_.f32[i] = 1.0f / a_.f32[i];
2545
-   }
2546
-#endif
2547
-
2548
-   return simde__m128_from_private(r_);
2549
-#endif
2550
-}
2551
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2552
-#define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
2553
-#endif
2554
-
2555
-SIMDE_FUNCTION_ATTRIBUTES
2556
-simde__m128 simde_mm_rcp_ss(simde__m128 a)
2557
-{
2558
-#if defined(SIMDE_X86_SSE_NATIVE)
2559
-   return _mm_rcp_ss(a);
2560
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2561
-   return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
2562
-#else
2563
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2564
-
2565
-   r_.f32[0] = 1.0f / a_.f32[0];
2566
-   r_.f32[1] = a_.f32[1];
2567
-   r_.f32[2] = a_.f32[2];
2568
-   r_.f32[3] = a_.f32[3];
2569
-
2570
-   return simde__m128_from_private(r_);
2571
-#endif
2572
-}
2573
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2574
-#define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
2575
-#endif
2576
-
2577
-SIMDE_FUNCTION_ATTRIBUTES
2578
-simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
2579
-{
2580
-#if defined(SIMDE_X86_SSE_NATIVE)
2581
-   return _mm_rsqrt_ps(a);
2582
-#else
2583
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2584
-
2585
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2586
-   r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
2587
-#elif defined(__STDC_IEC_559__)
2588
-   /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
2589
-     Pages 100 - 103 */
2590
-   SIMDE_VECTORIZE
2591
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2592
-#if SIMDE_ACCURACY_PREFERENCE <= 0
2593
-       r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1);
2594
-#else
2595
-       simde_float32 x = a_.f32[i];
2596
-       simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
2597
-       int32_t ix;
2598
-
2599
-       simde_memcpy(&ix, &x, sizeof(ix));
2600
-
2601
-#if SIMDE_ACCURACY_PREFERENCE == 1
2602
-       ix = INT32_C(0x5F375A82) - (ix >> 1);
2603
-#else
2604
-       ix = INT32_C(0x5F37599E) - (ix >> 1);
2605
-#endif
2606
-
2607
-       simde_memcpy(&x, &ix, sizeof(x));
2608
-
2609
-#if SIMDE_ACCURACY_PREFERENCE >= 2
2610
-       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
2611
-#endif
2612
-       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
2613
-
2614
-       r_.f32[i] = x;
2615
-#endif
2616
-   }
2617
-#elif defined(simde_math_sqrtf)
2618
-   SIMDE_VECTORIZE
2619
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2620
-       r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
2621
-   }
2622
-#else
2623
-   HEDLEY_UNREACHABLE();
2624
-#endif
2625
-
2626
-   return simde__m128_from_private(r_);
2627
-#endif
2628
-}
2629
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2630
-#define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
2631
-#endif
2632
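For reference, the SIMDE_ACCURACY_PREFERENCE == 1 path of the fallback above is the classic fast inverse square root: reinterpret the float's bits as an integer, form an initial guess i = 0x5F375A82 - (i >> 1), then refine with y = y * (1.5008909 - 0.5 * a * y * y). A scalar sketch of that single-lane computation, reusing simde_memcpy and int32_t as the header already does (illustrative only, not part of SIMDe):

static inline float rsqrt1_sketch(float a)
{
    float x = a;
    float xhalf = 0.5f * a;
    int32_t i;
    simde_memcpy(&i, &x, sizeof(i));     /* bit-cast float -> int32 */
    i = INT32_C(0x5F375A82) - (i >> 1);  /* initial guess */
    simde_memcpy(&x, &i, sizeof(x));     /* bit-cast back */
    return x * (1.5008909f - xhalf * x * x); /* one refinement step */
}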
-
2633
-SIMDE_FUNCTION_ATTRIBUTES
2634
-simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
2635
-{
2636
-#if defined(SIMDE_X86_SSE_NATIVE)
2637
-   return _mm_rsqrt_ss(a);
2638
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2639
-   return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
2640
-#else
2641
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2642
-
2643
-#if defined(__STDC_IEC_559__)
2644
-   {
2645
-#if SIMDE_ACCURACY_PREFERENCE <= 0
2646
-       r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1);
2647
-#else
2648
-       simde_float32 x = a_.f32[0];
2649
-       simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
2650
-       int32_t ix;
2651
-
2652
-       simde_memcpy(&ix, &x, sizeof(ix));
2653
-
2654
-#if SIMDE_ACCURACY_PREFERENCE == 1
2655
-       ix = INT32_C(0x5F375A82) - (ix >> 1);
2656
-#else
2657
-       ix = INT32_C(0x5F37599E) - (ix >> 1);
2658
-#endif
2659
-
2660
-       simde_memcpy(&x, &ix, sizeof(x));
2661
-
2662
-#if SIMDE_ACCURACY_PREFERENCE >= 2
2663
-       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
2664
-#endif
2665
-       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
2666
-
2667
-       r_.f32[0] = x;
2668
-#endif
2669
-   }
2670
-   r_.f32[1] = a_.f32[1];
2671
-   r_.f32[2] = a_.f32[2];
2672
-   r_.f32[3] = a_.f32[3];
2673
-#elif defined(simde_math_sqrtf)
2674
-   r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]);
2675
-   r_.f32[1] = a_.f32[1];
2676
-   r_.f32[2] = a_.f32[2];
2677
-   r_.f32[3] = a_.f32[3];
2678
-#else
2679
-   HEDLEY_UNREACHABLE();
2680
-#endif
2681
-
2682
-   return simde__m128_from_private(r_);
2683
-#endif
2684
-}
2685
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2686
-#define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
2687
-#endif
2688
-
2689
-SIMDE_FUNCTION_ATTRIBUTES
2690
-simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
2691
-{
2692
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2693
-   return _mm_sad_pu8(a, b);
2694
-#else
2695
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
2696
-                  b_ = simde__m64_to_private(b);
2697
-   uint16_t sum = 0;
2698
-
2699
-#if defined(SIMDE_HAVE_STDLIB_H)
2700
-   SIMDE_VECTORIZE_REDUCTION(+ : sum)
2701
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2702
-       sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i]));
2703
-   }
2704
-
2705
-   r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum);
2706
-   r_.i16[1] = 0;
2707
-   r_.i16[2] = 0;
2708
-   r_.i16[3] = 0;
2709
-#else
2710
-   HEDLEY_UNREACHABLE();
2711
-#endif
2712
-
2713
-   return simde__m64_from_private(r_);
2714
-#endif
2715
-}
2716
-#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2717
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2718
-#define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
2719
-#define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
2720
-#endif
2721
-
2722
-SIMDE_FUNCTION_ATTRIBUTES
2723
-simde__m128 simde_mm_set_ss(simde_float32 a)
2724
-{
2725
-#if defined(SIMDE_X86_SSE_NATIVE)
2726
-   return _mm_set_ss(a);
2727
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2728
-   return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
2729
-#else
2730
-   return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0),
2731
-                  SIMDE_FLOAT32_C(0.0), a);
2732
-#endif
2733
-}
2734
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2735
-#define _mm_set_ss(a) simde_mm_set_ss(a)
2736
-#endif
2737
-
2738
-SIMDE_FUNCTION_ATTRIBUTES
2739
-simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
2740
-                simde_float32 e1, simde_float32 e0)
2741
-{
2742
-#if defined(SIMDE_X86_SSE_NATIVE)
2743
-   return _mm_setr_ps(e3, e2, e1, e0);
2744
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2745
-   SIMDE_ALIGN(16) simde_float32 data[4] = {e3, e2, e1, e0};
2746
-   return vld1q_f32(data);
2747
-#else
2748
-   return simde_mm_set_ps(e0, e1, e2, e3);
2749
-#endif
2750
-}
2751
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2752
-#define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
2753
-#endif
2754
-
2755
-SIMDE_FUNCTION_ATTRIBUTES
2756
-simde__m128 simde_mm_setzero_ps(void)
2757
-{
2758
-#if defined(SIMDE_X86_SSE_NATIVE)
2759
-   return _mm_setzero_ps();
2760
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2761
-   return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
2762
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2763
-   return vec_splats((float)0);
2764
-#else
2765
-   simde__m128 r;
2766
-   simde_memset(&r, 0, sizeof(r));
2767
-   return r;
2768
-#endif
2769
-}
2770
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2771
-#define _mm_setzero_ps() simde_mm_setzero_ps()
2772
-#endif
2773
-
2774
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2775
-HEDLEY_DIAGNOSTIC_PUSH
2776
-SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
2777
-#endif
2778
-
2779
-SIMDE_FUNCTION_ATTRIBUTES
2780
-simde__m128 simde_mm_undefined_ps(void)
2781
-{
2782
-   simde__m128_private r_;
2783
-
2784
-#if defined(SIMDE_HAVE_UNDEFINED128)
2785
-   r_.n = _mm_undefined_ps();
2786
-#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2787
-   r_ = simde__m128_to_private(simde_mm_setzero_ps());
2788
-#endif
2789
-
2790
-   return simde__m128_from_private(r_);
2791
-}
2792
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2793
-#define _mm_undefined_ps() simde_mm_undefined_ps()
2794
-#endif
2795
-
2796
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
2797
-HEDLEY_DIAGNOSTIC_POP
2798
-#endif
2799
-
2800
-SIMDE_FUNCTION_ATTRIBUTES
2801
-simde__m128 simde_x_mm_setone_ps(void)
2802
-{
2803
-   simde__m128 t = simde_mm_setzero_ps();
2804
-   return simde_mm_cmpeq_ps(t, t);
2805
-}
2806
-
2807
-SIMDE_FUNCTION_ATTRIBUTES
2808
-void simde_mm_sfence(void)
2809
-{
2810
-   /* TODO: Use Hedley. */
2811
-#if defined(SIMDE_X86_SSE_NATIVE)
2812
-   _mm_sfence();
2813
-#elif defined(__GNUC__) && \
2814
-   ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2815
-   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2816
-#elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
2817
-   (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
2818
-#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
2819
-   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2820
-#else
2821
-   atomic_thread_fence(memory_order_seq_cst);
2822
-#endif
2823
-#elif defined(_MSC_VER)
2824
-   MemoryBarrier();
2825
-#elif HEDLEY_HAS_EXTENSION(c_atomic)
2826
-   __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
2827
-#elif defined(__GNUC__) && \
2828
-   ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
2829
-   __sync_synchronize();
2830
-#elif defined(_OPENMP)
2831
-#pragma omp critical(simde_mm_sfence_)
2832
-   {
2833
-   }
2834
-#endif
2835
-}
2836
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2837
-#define _mm_sfence() simde_mm_sfence()
2838
-#endif
2839
-
2840
-#define SIMDE_MM_SHUFFLE(z, y, x, w) \
2841
-   (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
2842
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2843
-#define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
2844
-#endif
2845
-
2846
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
2847
-   !defined(__PGI)
2848
-#define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
2849
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2850
-#define simde_mm_shuffle_pi16(a, imm8)                                    \
2851
-   (__extension__({                                                  \
2852
-       const simde__m64_private simde__tmp_a_ =                  \
2853
-           simde__m64_to_private(a);                         \
2854
-       simde__m64_from_private((simde__m64_private){             \
2855
-           .i16 = SIMDE_SHUFFLE_VECTOR_(                     \
2856
-               16, 8, (simde__tmp_a_).i16,               \
2857
-               (simde__tmp_a_).i16, (((imm8)) & 3),      \
2858
-               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
2859
-               (((imm8) >> 6) & 3))});                   \
2860
-   }))
2861
-#else
2862
-SIMDE_FUNCTION_ATTRIBUTES
2863
-simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
2864
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
2865
-{
2866
-   simde__m64_private r_;
2867
-   simde__m64_private a_ = simde__m64_to_private(a);
2868
-
2869
-   for (size_t i = 0; i < sizeof(r_.i16) / sizeof(r_.i16[0]); i++) {
2870
-       r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
2871
-   }
2872
-
2873
-   HEDLEY_DIAGNOSTIC_PUSH
2874
-#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
2875
-#pragma clang diagnostic ignored "-Wconditional-uninitialized"
2876
-#endif
2877
-   return simde__m64_from_private(r_);
2878
-   HEDLEY_DIAGNOSTIC_POP
2879
-}
2880
-#endif
2881
-#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
2882
-#define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
2883
-#else
2884
-#define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2885
-#endif
2886
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2887
-#define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2888
-#define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2889
-#endif
2890
-
2891
-#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
2892
-#define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
2893
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
2894
-#define simde_mm_shuffle_ps(a, b, imm8)                                        \
2895
-   (__extension__({                                                       \
2896
-       simde__m128_from_private((simde__m128_private){                \
2897
-           .f32 = SIMDE_SHUFFLE_VECTOR_(                          \
2898
-               32, 16, simde__m128_to_private(a).f32,         \
2899
-               simde__m128_to_private(b).f32, (((imm8)) & 3), \
2900
-               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3) + 4,  \
2901
-               (((imm8) >> 6) & 3) + 4)});                    \
2902
-   }))
2903
-#else
2904
-SIMDE_FUNCTION_ATTRIBUTES
2905
-simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
2906
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
2907
-{
2908
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
2909
-               b_ = simde__m128_to_private(b);
2910
-
2911
-   r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
2912
-   r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
2913
-   r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
2914
-   r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
2915
-
2916
-   return simde__m128_from_private(r_);
2917
-}
2918
-#endif
2919
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2920
-#define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
2921
-#endif
2922
-
2923
-SIMDE_FUNCTION_ATTRIBUTES
2924
-simde__m128 simde_mm_sqrt_ps(simde__m128 a)
2925
-{
2926
-#if defined(SIMDE_X86_SSE_NATIVE)
2927
-   return _mm_sqrt_ps(a);
2928
-#else
2929
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2930
-
2931
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2932
-   float32x4_t recipsq = vrsqrteq_f32(a_.neon_f32);
2933
-   float32x4_t sq = vrecpeq_f32(recipsq);
2934
-   /* ??? use step versions of both sqrt and recip for better accuracy? */
2935
-   r_.neon_f32 = sq;
2936
-#elif defined(simde_math_sqrt)
2937
-   SIMDE_VECTORIZE
2938
-   for (size_t i = 0; i < sizeof(r_.f32) / sizeof(r_.f32[0]); i++) {
2939
-       r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
2940
-   }
2941
-#else
2942
-   HEDLEY_UNREACHABLE();
2943
-#endif
2944
-
2945
-   return simde__m128_from_private(r_);
2946
-#endif
2947
-}
2948
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2949
-#define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
2950
-#endif
2951
-
2952
-SIMDE_FUNCTION_ATTRIBUTES
2953
-simde__m128 simde_mm_sqrt_ss(simde__m128 a)
2954
-{
2955
-#if defined(SIMDE_X86_SSE_NATIVE)
2956
-   return _mm_sqrt_ss(a);
2957
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2958
-   return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
2959
-#else
2960
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2961
-
2962
-#if defined(simde_math_sqrtf)
2963
-   r_.f32[0] = simde_math_sqrtf(a_.f32[0]);
2964
-   r_.f32[1] = a_.f32[1];
2965
-   r_.f32[2] = a_.f32[2];
2966
-   r_.f32[3] = a_.f32[3];
2967
-#else
2968
-   HEDLEY_UNREACHABLE();
2969
-#endif
2970
-
2971
-   return simde__m128_from_private(r_);
2972
-#endif
2973
-}
2974
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2975
-#define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
2976
-#endif
2977
-
2978
-SIMDE_FUNCTION_ATTRIBUTES
2979
-void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
2980
-{
2981
-   simde_assert_aligned(16, mem_addr);
2982
-
2983
-#if defined(SIMDE_X86_SSE_NATIVE)
2984
-   _mm_store_ps(mem_addr, a);
2985
-#else
2986
-   simde__m128_private a_ = simde__m128_to_private(a);
2987
-
2988
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2989
-   vst1q_f32(mem_addr, a_.neon_f32);
2990
-#elif defined(SIMDE_POWER_ALTIVE_P7_NATIVE)
2991
-   vec_vsx_st(a_.altivec_32, 0, mem_addr);
2992
-#elif defined(SIMDE_POWER_ALTIVE_P5_NATIVE)
2993
-   vec_st(a_.altivec_32, 0, mem_addr);
2994
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2995
-   wasm_v128_store(mem_addr, a_.wasm_v128);
2996
-#else
2997
-   SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
2998
-   for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
2999
-       mem_addr[i] = a_.f32[i];
3000
-   }
3001
-#endif
3002
-#endif
3003
-}
3004
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3005
-#define _mm_store_ps(mem_addr, a)                                      \
3006
-   simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3007
-                 float *, simde_float32 *, mem_addr), \
3008
-             (a))
3009
-#endif
3010
-
3011
-SIMDE_FUNCTION_ATTRIBUTES
3012
-void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a)
3013
-{
3014
-   simde_assert_aligned(16, mem_addr);
3015
-
3016
-#if defined(SIMDE_X86_SSE_NATIVE)
3017
-   _mm_store_ps1(mem_addr, a);
3018
-#else
3019
-   simde__m128_private a_ = simde__m128_to_private(a);
3020
-
3021
-   SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
3022
-   for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3023
-       mem_addr[i] = a_.f32[0];
3024
-   }
3025
-#endif
3026
-}
3027
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3028
-#define _mm_store_ps1(mem_addr, a)                                      \
3029
-   simde_mm_store_ps1(SIMDE_CHECKED_REINTERPRET_CAST(              \
3030
-                  float *, simde_float32 *, mem_addr), \
3031
-              (a))
3032
-#endif
3033
-
3034
-SIMDE_FUNCTION_ATTRIBUTES
3035
-void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
3036
-{
3037
-#if defined(SIMDE_X86_SSE_NATIVE)
3038
-   _mm_store_ss(mem_addr, a);
3039
-#else
3040
-   simde__m128_private a_ = simde__m128_to_private(a);
3041
-
3042
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3043
-   vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
3044
-#else
3045
-   *mem_addr = a_.f32[0];
3046
-#endif
3047
-#endif
3048
-}
3049
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3050
-#define _mm_store_ss(mem_addr, a)                                      \
3051
-   simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST(              \
3052
-                 float *, simde_float32 *, mem_addr), \
3053
-             (a))
3054
-#endif
3055
-
3056
-SIMDE_FUNCTION_ATTRIBUTES
3057
-void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
3058
-{
3059
-   simde_assert_aligned(16, mem_addr);
3060
-
3061
-#if defined(SIMDE_X86_SSE_NATIVE)
3062
-   _mm_store1_ps(mem_addr, a);
3063
-#else
3064
-   simde_mm_store_ps1(mem_addr, a);
3065
-#endif
3066
-}
3067
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3068
-#define _mm_store1_ps(mem_addr, a)                                      \
3069
-   simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3070
-                  float *, simde_float32 *, mem_addr), \
3071
-              (a))
3072
-#endif
3073
-
3074
-SIMDE_FUNCTION_ATTRIBUTES
3075
-void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
3076
-{
3077
-#if defined(SIMDE_X86_SSE_NATIVE)
3078
-   _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3079
-#else
3080
-   simde__m64_private *dest_ =
3081
-       HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr);
3082
-   simde__m128_private a_ = simde__m128_to_private(a);
3083
-
3084
-   dest_->f32[0] = a_.f32[2];
3085
-   dest_->f32[1] = a_.f32[3];
3086
-#endif
3087
-}
3088
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3089
-#define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
3090
-#endif
3091
-
3092
-SIMDE_FUNCTION_ATTRIBUTES
3093
-void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
3094
-{
3095
-#if defined(SIMDE_X86_SSE_NATIVE)
3096
-   _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3097
-#else
3098
-   simde__m64_private *dest_ =
3099
-       HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr);
3100
-   simde__m128_private a_ = simde__m128_to_private(a);
3101
-
3102
-   dest_->f32[0] = a_.f32[0];
3103
-   dest_->f32[1] = a_.f32[1];
3104
-#endif
3105
-}
3106
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3107
-#define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
3108
-#endif
3109
-
3110
-SIMDE_FUNCTION_ATTRIBUTES
3111
-void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
3112
-{
3113
-   simde_assert_aligned(16, mem_addr);
3114
-
3115
-#if defined(SIMDE_X86_SSE_NATIVE)
3116
-   _mm_storer_ps(mem_addr, a);
3117
-#else
3118
-   simde__m128_private a_ = simde__m128_to_private(a);
3119
-
3120
-#if defined(SIMDE_SHUFFLE_VECTOR_)
3121
-   a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
3122
-   simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
3123
-#else
3124
-   SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
3125
-   for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3126
-       mem_addr[i] =
3127
-           a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
3128
-   }
3129
-#endif
3130
-#endif
3131
-}
3132
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3133
-#define _mm_storer_ps(mem_addr, a)                                      \
3134
-   simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3135
-                  float *, simde_float32 *, mem_addr), \
3136
-              (a))
3137
-#endif
3138
-
3139
-SIMDE_FUNCTION_ATTRIBUTES
3140
-void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
3141
-{
3142
-#if defined(SIMDE_X86_SSE_NATIVE)
3143
-   _mm_storeu_ps(mem_addr, a);
3144
-#else
3145
-   simde__m128_private a_ = simde__m128_to_private(a);
3146
-
3147
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3148
-   vst1q_f32(mem_addr, a_.neon_f32);
3149
-#else
3150
-   simde_memcpy(mem_addr, &a_, sizeof(a_));
3151
-#endif
3152
-#endif
3153
-}
3154
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3155
-#define _mm_storeu_ps(mem_addr, a)                                      \
3156
-   simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3157
-                  float *, simde_float32 *, mem_addr), \
3158
-              (a))
3159
-#endif
3160
-
3161
-SIMDE_FUNCTION_ATTRIBUTES
3162
-simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
3163
-{
3164
-#if defined(SIMDE_X86_SSE_NATIVE)
3165
-   return _mm_sub_ps(a, b);
3166
-#else
3167
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
3168
-               b_ = simde__m128_to_private(b);
3169
-
3170
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3171
-   r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
3172
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3173
-   r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
3174
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3175
-   r_.f32 = a_.f32 - b_.f32;
3176
-#else
3177
-   SIMDE_VECTORIZE
3178
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3179
-       r_.f32[i] = a_.f32[i] - b_.f32[i];
3180
-   }
3181
-#endif
3182
-
3183
-   return simde__m128_from_private(r_);
3184
-#endif
3185
-}
3186
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3187
-#define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
3188
-#endif
3189
-
3190
-SIMDE_FUNCTION_ATTRIBUTES
3191
-simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
3192
-{
3193
-#if defined(SIMDE_X86_SSE_NATIVE)
3194
-   return _mm_sub_ss(a, b);
3195
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
3196
-   return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
3197
-#else
3198
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
3199
-               b_ = simde__m128_to_private(b);
3200
-
3201
-   r_.f32[0] = a_.f32[0] - b_.f32[0];
3202
-   r_.f32[1] = a_.f32[1];
3203
-   r_.f32[2] = a_.f32[2];
3204
-   r_.f32[3] = a_.f32[3];
3205
-
3206
-   return simde__m128_from_private(r_);
3207
-#endif
3208
-}
3209
-
3210
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3211
-#define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
3212
-#endif
3213
-
3214
-SIMDE_FUNCTION_ATTRIBUTES
3215
-int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
3216
-{
3217
-#if defined(SIMDE_X86_SSE_NATIVE)
3218
-   return _mm_ucomieq_ss(a, b);
3219
-#else
3220
-   simde__m128_private a_ = simde__m128_to_private(a),
3221
-               b_ = simde__m128_to_private(b);
3222
-   int r;
3223
-
3224
-#if defined(SIMDE_HAVE_FENV_H)
3225
-   fenv_t envp;
3226
-   int x = feholdexcept(&envp);
3227
-   r = a_.f32[0] == b_.f32[0];
3228
-   if (HEDLEY_LIKELY(x == 0))
3229
-       fesetenv(&envp);
3230
-#else
3231
-   r = a_.f32[0] == b_.f32[0];
3232
-#endif
3233
-
3234
-   return r;
3235
-#endif
3236
-}
3237
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3238
-#define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
3239
-#endif
3240
-
3241
-SIMDE_FUNCTION_ATTRIBUTES
3242
-int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
3243
-{
3244
-#if defined(SIMDE_X86_SSE_NATIVE)
3245
-   return _mm_ucomige_ss(a, b);
3246
-#else
3247
-   simde__m128_private a_ = simde__m128_to_private(a),
3248
-               b_ = simde__m128_to_private(b);
3249
-   int r;
3250
-
3251
-#if defined(SIMDE_HAVE_FENV_H)
3252
-   fenv_t envp;
3253
-   int x = feholdexcept(&envp);
3254
-   r = a_.f32[0] >= b_.f32[0];
3255
-   if (HEDLEY_LIKELY(x == 0))
3256
-       fesetenv(&envp);
3257
-#else
3258
-   r = a_.f32[0] >= b_.f32[0];
3259
-#endif
3260
-
3261
-   return r;
3262
-#endif
3263
-}
3264
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3265
-#define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
3266
-#endif
3267
-
3268
-SIMDE_FUNCTION_ATTRIBUTES
3269
-int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
3270
-{
3271
-#if defined(SIMDE_X86_SSE_NATIVE)
3272
-   return _mm_ucomigt_ss(a, b);
3273
-#else
3274
-   simde__m128_private a_ = simde__m128_to_private(a),
3275
-               b_ = simde__m128_to_private(b);
3276
-   int r;
3277
-
3278
-#if defined(SIMDE_HAVE_FENV_H)
3279
-   fenv_t envp;
3280
-   int x = feholdexcept(&envp);
3281
-   r = a_.f32[0] > b_.f32[0];
3282
-   if (HEDLEY_LIKELY(x == 0))
3283
-       fesetenv(&envp);
3284
-#else
3285
-   r = a_.f32[0] > b_.f32[0];
3286
-#endif
3287
-
3288
-   return r;
3289
-#endif
3290
-}
3291
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3292
-#define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
3293
-#endif
3294
-
3295
-SIMDE_FUNCTION_ATTRIBUTES
3296
-int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
3297
-{
3298
-#if defined(SIMDE_X86_SSE_NATIVE)
3299
-   return _mm_ucomile_ss(a, b);
3300
-#else
3301
-   simde__m128_private a_ = simde__m128_to_private(a),
3302
-               b_ = simde__m128_to_private(b);
3303
-   int r;
3304
-
3305
-#if defined(SIMDE_HAVE_FENV_H)
3306
-   fenv_t envp;
3307
-   int x = feholdexcept(&envp);
3308
-   r = a_.f32[0] <= b_.f32[0];
3309
-   if (HEDLEY_LIKELY(x == 0))
3310
-       fesetenv(&envp);
3311
-#else
3312
-   r = a_.f32[0] <= b_.f32[0];
3313
-#endif
3314
-
3315
-   return r;
3316
-#endif
3317
-}
3318
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3319
-#define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
3320
-#endif
3321
-
3322
-SIMDE_FUNCTION_ATTRIBUTES
3323
-int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
3324
-{
3325
-#if defined(SIMDE_X86_SSE_NATIVE)
3326
-   return _mm_ucomilt_ss(a, b);
3327
-#else
3328
-   simde__m128_private a_ = simde__m128_to_private(a),
3329
-               b_ = simde__m128_to_private(b);
3330
-   int r;
3331
-
3332
-#if defined(SIMDE_HAVE_FENV_H)
3333
-   fenv_t envp;
3334
-   int x = feholdexcept(&envp);
3335
-   r = a_.f32[0] < b_.f32[0];
3336
-   if (HEDLEY_LIKELY(x == 0))
3337
-       fesetenv(&envp);
3338
-#else
3339
-   r = a_.f32[0] < b_.f32[0];
3340
-#endif
3341
-
3342
-   return r;
3343
-#endif
3344
-}
3345
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3346
-#define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
3347
-#endif
3348
-
3349
-SIMDE_FUNCTION_ATTRIBUTES
3350
-int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
3351
-{
3352
-#if defined(SIMDE_X86_SSE_NATIVE)
3353
-   return _mm_ucomineq_ss(a, b);
3354
-#else
3355
-   simde__m128_private a_ = simde__m128_to_private(a),
3356
-               b_ = simde__m128_to_private(b);
3357
-   int r;
3358
-
3359
-#if defined(SIMDE_HAVE_FENV_H)
3360
-   fenv_t envp;
3361
-   int x = feholdexcept(&envp);
3362
-   r = a_.f32[0] != b_.f32[0];
3363
-   if (HEDLEY_LIKELY(x == 0))
3364
-       fesetenv(&envp);
3365
-#else
3366
-   r = a_.f32[0] != b_.f32[0];
3367
-#endif
3368
-
3369
-   return r;
3370
-#endif
3371
-}
3372
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3373
-#define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
3374
-#endif
3375
-
3376
-#if defined(SIMDE_X86_SSE_NATIVE)
3377
-#if defined(__has_builtin)
3378
-#if __has_builtin(__builtin_ia32_undef128)
3379
-#define SIMDE_HAVE_UNDEFINED128
3380
-#endif
3381
-#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && \
3382
-   !defined(_MSC_VER)
3383
-#define SIMDE_HAVE_UNDEFINED128
3384
-#endif
3385
-#endif
3386
-
3387
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3388
-HEDLEY_DIAGNOSTIC_PUSH
3389
-SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
3390
-#endif
3391
-
3392
-SIMDE_FUNCTION_ATTRIBUTES
3393
-simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
3394
-{
3395
-#if defined(SIMDE_X86_SSE_NATIVE)
3396
-   return _mm_unpackhi_ps(a, b);
3397
-#else
3398
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
3399
-               b_ = simde__m128_to_private(b);
3400
-
3401
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3402
-   float32x2_t a1 = vget_high_f32(a_.neon_f32);
3403
-   float32x2_t b1 = vget_high_f32(b_.neon_f32);
3404
-   float32x2x2_t result = vzip_f32(a1, b1);
3405
-   r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3406
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
3407
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
3408
-#else
3409
-   r_.f32[0] = a_.f32[2];
3410
-   r_.f32[1] = b_.f32[2];
3411
-   r_.f32[2] = a_.f32[3];
3412
-   r_.f32[3] = b_.f32[3];
3413
-#endif
3414
-
3415
-   return simde__m128_from_private(r_);
3416
-#endif
3417
-}
3418
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3419
-#define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
3420
-#endif
3421
-
3422
-SIMDE_FUNCTION_ATTRIBUTES
3423
-simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
3424
-{
3425
-#if defined(SIMDE_X86_SSE_NATIVE)
3426
-   return _mm_unpacklo_ps(a, b);
3427
-#else
3428
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
3429
-               b_ = simde__m128_to_private(b);
3430
-
3431
-#if defined(SIMDE_SHUFFLE_VECTOR_)
3432
-   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
3433
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3434
-   float32x2_t a1 = vget_low_f32(a_.neon_f32);
3435
-   float32x2_t b1 = vget_low_f32(b_.neon_f32);
3436
-   float32x2x2_t result = vzip_f32(a1, b1);
3437
-   r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
3438
-#else
3439
-   r_.f32[0] = a_.f32[0];
3440
-   r_.f32[1] = b_.f32[0];
3441
-   r_.f32[2] = a_.f32[1];
3442
-   r_.f32[3] = b_.f32[1];
3443
-#endif
3444
-
3445
-   return simde__m128_from_private(r_);
3446
-#endif
3447
-}
3448
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3449
-#define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
3450
-#endif
3451
-
3452
-SIMDE_FUNCTION_ATTRIBUTES
3453
-simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
3454
-{
3455
-#if defined(SIMDE_X86_SSE_NATIVE)
3456
-   return _mm_xor_ps(a, b);
3457
-#else
3458
-   simde__m128_private r_, a_ = simde__m128_to_private(a),
3459
-               b_ = simde__m128_to_private(b);
3460
-
3461
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3462
-   r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
3463
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3464
-   r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
3465
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3466
-   r_.i32f = a_.i32f ^ b_.i32f;
3467
-#else
3468
-   SIMDE_VECTORIZE
3469
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
3470
-       r_.u32[i] = a_.u32[i] ^ b_.u32[i];
3471
-   }
3472
-#endif
3473
-
3474
-   return simde__m128_from_private(r_);
3475
-#endif
3476
-}
3477
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3478
-#define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
3479
-#endif
3480
-
3481
-SIMDE_FUNCTION_ATTRIBUTES
3482
-void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
3483
-{
3484
-#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3485
-   _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3486
-#else
3487
-   simde__m64_private *dest = HEDLEY_REINTERPRET_CAST(simde__m64_private *,
3488
-                              mem_addr),
3489
-              a_ = simde__m64_to_private(a);
3490
-
3491
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3492
-   dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
3493
-#else
3494
-   dest->i64[0] = a_.i64[0];
3495
-#endif
3496
-#endif
3497
-}
3498
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3499
-#define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
3500
-#endif
3501
-
3502
-SIMDE_FUNCTION_ATTRIBUTES
3503
-void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
3504
-{
3505
-   simde_assert_aligned(16, mem_addr);
3506
-
3507
-#if defined(SIMDE_X86_SSE_NATIVE)
3508
-   _mm_stream_ps(mem_addr, a);
3509
-#else
3510
-   simde__m128_private a_ = simde__m128_to_private(a);
3511
-
3512
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3513
-   vst1q_f32(SIMDE_ASSUME_ALIGNED(16, mem_addr), a_.neon_f32);
3514
-#else
3515
-   simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
3516
-#endif
3517
-#endif
3518
-}
3519
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3520
-#define _mm_stream_ps(mem_addr, a)                                      \
3521
-   simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3522
-                  float *, simde_float32 *, mem_addr), \
3523
-              (a))
3524
-#endif
3525
-
3526
-SIMDE_FUNCTION_ATTRIBUTES
3527
-uint32_t simde_mm_getcsr(void)
3528
-{
3529
-#if defined(SIMDE_X86_SSE_NATIVE)
3530
-   return _mm_getcsr();
3531
-#else
3532
-   uint32_t r = 0;
3533
-
3534
-#if defined(SIMDE_HAVE_FENV_H)
3535
-   int rounding_mode = fegetround();
3536
-
3537
-   switch (rounding_mode) {
3538
-#if defined(FE_TONEAREST)
3539
-   case FE_TONEAREST:
3540
-       break;
3541
-#endif
3542
-#if defined(FE_UPWARD)
3543
-   case FE_UPWARD:
3544
-       r |= 2 << 13;
3545
-       break;
3546
-#endif
3547
-#if defined(FE_DOWNWARD)
3548
-   case FE_DOWNWARD:
3549
-       r |= 1 << 13;
3550
-       break;
3551
-#endif
3552
-#if defined(FE_TOWARDZERO)
3553
-   case FE_TOWARDZERO:
3554
-       r = 3 << 13;
3555
-       break;
3556
-#endif
3557
-   }
3558
-#else
3559
-   HEDLEY_UNREACHABLE();
3560
-#endif
3561
-
3562
-   return r;
3563
-#endif
3564
-}
3565
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3566
-#define _mm_getcsr() simde_mm_getcsr()
3567
-#endif
3568
-
3569
-SIMDE_FUNCTION_ATTRIBUTES
3570
-void simde_mm_setcsr(uint32_t a)
3571
-{
3572
-#if defined(SIMDE_X86_SSE_NATIVE)
3573
-   _mm_setcsr(a);
3574
-#else
3575
-   switch ((a >> 13) & 3) {
3576
-#if defined(FE_TONEAREST)
3577
-   case 0:
3578
-       fesetround(FE_TONEAREST);
3579
-#endif
3580
-#if defined(FE_DOWNWARD)
3581
-       break;
3582
-   case 1:
3583
-       fesetround(FE_DOWNWARD);
3584
-#endif
3585
-#if defined(FE_UPWARD)
3586
-       break;
3587
-   case 2:
3588
-       fesetround(FE_UPWARD);
3589
-#endif
3590
-#if defined(FE_TOWARDZERO)
3591
-       break;
3592
-   case 3:
3593
-       fesetround(FE_TOWARDZERO);
3594
-       break;
3595
-#endif
3596
-   }
3597
-#endif
3598
-}
3599
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3600
-#define _mm_setcsr(a) simde_mm_setcsr(a)
3601
-#endif
3602
-
3603
-#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)       \
3604
-   do {                                                 \
3605
-       simde__m128 tmp3, tmp2, tmp1, tmp0;          \
3606
-       tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
3607
-       tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
3608
-       tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
3609
-       tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
3610
-       row0 = simde_mm_movelh_ps(tmp0, tmp2);       \
3611
-       row1 = simde_mm_movehl_ps(tmp2, tmp0);       \
3612
-       row2 = simde_mm_movelh_ps(tmp1, tmp3);       \
3613
-       row3 = simde_mm_movehl_ps(tmp3, tmp1);       \
3614
-   } while (0)
3615
-
3616
-#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3617
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
3618
-   SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
3619
-#endif
3620
-
3621
-#if defined(_MM_EXCEPT_INVALID)
3622
-#define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
3623
-#else
3624
-#define SIMDE_MM_EXCEPT_INVALID (0x0001)
3625
-#endif
3626
-#if defined(_MM_EXCEPT_DENORM)
3627
-#define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
3628
-#else
3629
-#define SIMDE_MM_EXCEPT_DENORM (0x0002)
3630
-#endif
3631
-#if defined(_MM_EXCEPT_DIV_ZERO)
3632
-#define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
3633
-#else
3634
-#define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
3635
-#endif
3636
-#if defined(_MM_EXCEPT_OVERFLOW)
3637
-#define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
3638
-#else
3639
-#define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
3640
-#endif
3641
-#if defined(_MM_EXCEPT_UNDERFLOW)
3642
-#define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
3643
-#else
3644
-#define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
3645
-#endif
3646
-#if defined(_MM_EXCEPT_INEXACT)
3647
-#define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
3648
-#else
3649
-#define SIMDE_MM_EXCEPT_INEXACT (0x0020)
3650
-#endif
3651
-#if defined(_MM_EXCEPT_MASK)
3652
-#define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
3653
-#else
3654
-#define SIMDE_MM_EXCEPT_MASK                                   \
3655
-   (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM |    \
3656
-    SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
3657
-    SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
3658
-#endif
3659
-
3660
-#if defined(_MM_MASK_INVALID)
3661
-#define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
3662
-#else
3663
-#define SIMDE_MM_MASK_INVALID (0x0080)
3664
-#endif
3665
-#if defined(_MM_MASK_DENORM)
3666
-#define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
3667
-#else
3668
-#define SIMDE_MM_MASK_DENORM (0x0100)
3669
-#endif
3670
-#if defined(_MM_MASK_DIV_ZERO)
3671
-#define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
3672
-#else
3673
-#define SIMDE_MM_MASK_DIV_ZERO (0x0200)
3674
-#endif
3675
-#if defined(_MM_MASK_OVERFLOW)
3676
-#define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
3677
-#else
3678
-#define SIMDE_MM_MASK_OVERFLOW (0x0400)
3679
-#endif
3680
-#if defined(_MM_MASK_UNDERFLOW)
3681
-#define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
3682
-#else
3683
-#define SIMDE_MM_MASK_UNDERFLOW (0x0800)
3684
-#endif
3685
-#if defined(_MM_MASK_INEXACT)
3686
-#define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
3687
-#else
3688
-#define SIMDE_MM_MASK_INEXACT (0x1000)
3689
-#endif
3690
-#if defined(_MM_MASK_MASK)
3691
-#define SIMDE_MM_MASK_MASK _MM_MASK_MASK
3692
-#else
3693
-#define SIMDE_MM_MASK_MASK                                 \
3694
-   (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM |    \
3695
-    SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
3696
-    SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
3697
-#endif
3698
-
3699
-#if defined(_MM_FLUSH_ZERO_MASK)
3700
-#define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
3701
-#else
3702
-#define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
3703
-#endif
3704
-#if defined(_MM_FLUSH_ZERO_ON)
3705
-#define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
3706
-#else
3707
-#define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
3708
-#endif
3709
-#if defined(_MM_FLUSH_ZERO_OFF)
3710
-#define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
3711
-#else
3712
-#define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
3713
-#endif
3714
-
3715
-SIMDE_END_DECLS_
3716
-
3717
-HEDLEY_DIAGNOSTIC_POP
3718
-
3719
-#endif /* !defined(SIMDE_X86_SSE_H) */
3720
obs-studio-26.1.0.tar.xz/libobs/util/simde/sse2.h Deleted
6216
 
1
@@ -1,6214 +0,0 @@
2
-/* SPDX-License-Identifier: MIT
3
- *
4
- * Permission is hereby granted, free of charge, to any person
5
- * obtaining a copy of this software and associated documentation
6
- * files (the "Software"), to deal in the Software without
7
- * restriction, including without limitation the rights to use, copy,
8
- * modify, merge, publish, distribute, sublicense, and/or sell copies
9
- * of the Software, and to permit persons to whom the Software is
10
- * furnished to do so, subject to the following conditions:
11
- *
12
- * The above copyright notice and this permission notice shall be
13
- * included in all copies or substantial portions of the Software.
14
- *
15
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
- * SOFTWARE.
23
- *
24
- * Copyright:
25
- *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
- *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
27
- *   2015      Brandon Rowlett <browlett@nvidia.com>
28
- *   2015      Ken Fast <kfast@gdeb.com>
29
- *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
30
- *   2018      Jeff Daily <jeff.daily@amd.com>
31
- */
32
-
33
-#if !defined(SIMDE_X86_SSE2_H)
34
-#define SIMDE_X86_SSE2_H
35
-
36
-#include "sse.h"
37
-
38
-#if !defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
39
-#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
40
-#endif
41
-
42
-HEDLEY_DIAGNOSTIC_PUSH
43
-SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
44
-SIMDE_BEGIN_DECLS_
45
-
46
-typedef union {
47
-#if defined(SIMDE_VECTOR_SUBSCRIPT)
48
-   SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49
-   SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50
-   SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51
-   SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
52
-   SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53
-   SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54
-   SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
55
-   SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56
-#if defined(SIMDE_HAVE_INT128_)
57
-   SIMDE_ALIGN(16) simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
58
-   SIMDE_ALIGN(16) simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59
-#endif
60
-   SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
61
-   SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
62
-
63
-   SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
64
-   SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
65
-#else
66
-   SIMDE_ALIGN(16) int8_t i8[16];
67
-   SIMDE_ALIGN(16) int16_t i16[8];
68
-   SIMDE_ALIGN(16) int32_t i32[4];
69
-   SIMDE_ALIGN(16) int64_t i64[2];
70
-   SIMDE_ALIGN(16) uint8_t u8[16];
71
-   SIMDE_ALIGN(16) uint16_t u16[8];
72
-   SIMDE_ALIGN(16) uint32_t u32[4];
73
-   SIMDE_ALIGN(16) uint64_t u64[2];
74
-#if defined(SIMDE_HAVE_INT128_)
75
-   SIMDE_ALIGN(16) simde_int128 i128[1];
76
-   SIMDE_ALIGN(16) simde_uint128 u128[1];
77
-#endif
78
-   SIMDE_ALIGN(16) simde_float32 f32[4];
79
-   SIMDE_ALIGN(16) simde_float64 f64[2];
80
-
81
-   SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
82
-   SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
83
-#endif
84
-
85
-   SIMDE_ALIGN(16) simde__m64_private m64_private[2];
86
-   SIMDE_ALIGN(16) simde__m64 m64[2];
87
-
88
-#if defined(SIMDE_X86_SSE2_NATIVE)
89
-   SIMDE_ALIGN(16) __m128i n;
90
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
91
-   SIMDE_ALIGN(16) int8x16_t neon_i8;
92
-   SIMDE_ALIGN(16) int16x8_t neon_i16;
93
-   SIMDE_ALIGN(16) int32x4_t neon_i32;
94
-   SIMDE_ALIGN(16) int64x2_t neon_i64;
95
-   SIMDE_ALIGN(16) uint8x16_t neon_u8;
96
-   SIMDE_ALIGN(16) uint16x8_t neon_u16;
97
-   SIMDE_ALIGN(16) uint32x4_t neon_u32;
98
-   SIMDE_ALIGN(16) uint64x2_t neon_u64;
99
-   SIMDE_ALIGN(16) float32x4_t neon_f32;
100
-#if defined(SIMDE_ARCH_AARCH64)
101
-   SIMDE_ALIGN(16) float64x2_t neon_f64;
102
-#endif
103
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
104
-   SIMDE_ALIGN(16) v128_t wasm_v128;
105
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
106
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
107
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
108
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
109
-#if defined(__UINT_FAST32_TYPE__)
110
-   SIMDE_ALIGN(16)
111
-   SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
112
-#else
113
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
114
-#endif
115
-   SIMDE_ALIGN(16)
116
-   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
117
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
118
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
119
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
120
-#if defined(__UINT_FAST32_TYPE__)
121
-   SIMDE_ALIGN(16) vector __UINT_FAST32_TYPE__ altivec_u32f;
122
-#else
123
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
124
-#endif
125
-   SIMDE_ALIGN(16)
126
-   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
127
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
128
-#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
129
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
130
-#endif
131
-#endif
132
-} simde__m128i_private;
133
-
134
-typedef union {
135
-#if defined(SIMDE_VECTOR_SUBSCRIPT)
136
-   SIMDE_ALIGN(16) int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137
-   SIMDE_ALIGN(16) int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138
-   SIMDE_ALIGN(16) int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139
-   SIMDE_ALIGN(16) int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140
-   SIMDE_ALIGN(16) uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
141
-   SIMDE_ALIGN(16) uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
142
-   SIMDE_ALIGN(16) uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
143
-   SIMDE_ALIGN(16) uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
144
-   SIMDE_ALIGN(16) simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
145
-   SIMDE_ALIGN(16) simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
146
-   SIMDE_ALIGN(16) int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
147
-   SIMDE_ALIGN(16) uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
148
-#else
149
-   SIMDE_ALIGN(16) int8_t i8[16];
150
-   SIMDE_ALIGN(16) int16_t i16[8];
151
-   SIMDE_ALIGN(16) int32_t i32[4];
152
-   SIMDE_ALIGN(16) int64_t i64[2];
153
-   SIMDE_ALIGN(16) uint8_t u8[16];
154
-   SIMDE_ALIGN(16) uint16_t u16[8];
155
-   SIMDE_ALIGN(16) uint32_t u32[4];
156
-   SIMDE_ALIGN(16) uint64_t u64[2];
157
-   SIMDE_ALIGN(16) simde_float32 f32[4];
158
-   SIMDE_ALIGN(16) simde_float64 f64[2];
159
-   SIMDE_ALIGN(16) int_fast32_t i32f[16 / sizeof(int_fast32_t)];
160
-   SIMDE_ALIGN(16) uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
161
-#endif
162
-
163
-   SIMDE_ALIGN(16) simde__m64_private m64_private[2];
164
-   SIMDE_ALIGN(16) simde__m64 m64[2];
165
-
166
-#if defined(SIMDE_X86_SSE2_NATIVE)
167
-   SIMDE_ALIGN(16) __m128d n;
168
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
169
-   SIMDE_ALIGN(16) int8x16_t neon_i8;
170
-   SIMDE_ALIGN(16) int16x8_t neon_i16;
171
-   SIMDE_ALIGN(16) int32x4_t neon_i32;
172
-   SIMDE_ALIGN(16) int64x2_t neon_i64;
173
-   SIMDE_ALIGN(16) uint8x16_t neon_u8;
174
-   SIMDE_ALIGN(16) uint16x8_t neon_u16;
175
-   SIMDE_ALIGN(16) uint32x4_t neon_u32;
176
-   SIMDE_ALIGN(16) uint64x2_t neon_u64;
177
-   SIMDE_ALIGN(16) float32x4_t neon_f32;
178
-#if defined(SIMDE_ARCH_AARCH64)
179
-   SIMDE_ALIGN(16) float64x2_t neon_f64;
180
-#endif
181
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
182
-   SIMDE_ALIGN(16) v128_t wasm_v128;
183
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
184
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
185
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
186
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
187
-#if defined(__INT_FAST32_TYPE__)
188
-   SIMDE_ALIGN(16)
189
-   SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
190
-#else
191
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
192
-#endif
193
-   SIMDE_ALIGN(16)
194
-   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
195
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
196
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
197
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
198
-#if defined(__UINT_FAST32_TYPE__)
199
-   SIMDE_ALIGN(16) vector __UINT_FAST32_TYPE__ altivec_u32f;
200
-#else
201
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
202
-#endif
203
-   SIMDE_ALIGN(16)
204
-   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
205
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
206
-#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
207
-   SIMDE_ALIGN(16) SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
208
-#endif
209
-#endif
210
-} simde__m128d_private;
211
-
212
-#if defined(SIMDE_X86_SSE2_NATIVE)
213
-typedef __m128i simde__m128i;
214
-typedef __m128d simde__m128d;
215
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
216
-typedef int64x2_t simde__m128i;
217
-#if defined(SIMDE_ARCH_AARCH64)
218
-typedef float64x2_t simde__m128d;
219
-#elif defined(SIMDE_VECTOR_SUBSCRIPT)
220
-typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
221
-#else
222
-typedef simde__m128d_private simde__m128d;
223
-#endif
224
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
225
-typedef v128_t simde__m128i;
226
-typedef v128_t simde__m128d;
227
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
228
-typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
229
-typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
230
-#elif defined(SIMDE_VECTOR_SUBSCRIPT)
231
-typedef int_fast32_t simde__m128i SIMDE_ALIGN(16)
232
-   SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
233
-typedef simde_float64 simde__m128d SIMDE_ALIGN(16)
234
-   SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
235
-#else
236
-typedef simde__m128i_private simde__m128i;
237
-typedef simde__m128d_private simde__m128d;
238
-#endif
239
-
240
-#if !defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
241
-#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
242
-typedef simde__m128i __m128i;
243
-typedef simde__m128d __m128d;
244
-#endif
245
-
246
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
247
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private),
248
-            "simde__m128i_private size incorrect");
249
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
250
-HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private),
251
-            "simde__m128d_private size incorrect");
252
-#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
253
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16,
254
-            "simde__m128i is not 16-byte aligned");
255
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16,
256
-            "simde__m128i_private is not 16-byte aligned");
257
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16,
258
-            "simde__m128d is not 16-byte aligned");
259
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16,
260
-            "simde__m128d_private is not 16-byte aligned");
261
-#endif
262
-
263
-SIMDE_FUNCTION_ATTRIBUTES
264
-simde__m128i simde__m128i_from_private(simde__m128i_private v)
265
-{
266
-   simde__m128i r;
267
-   simde_memcpy(&r, &v, sizeof(r));
268
-   return r;
269
-}
270
-
271
-SIMDE_FUNCTION_ATTRIBUTES
272
-simde__m128i_private simde__m128i_to_private(simde__m128i v)
273
-{
274
-   simde__m128i_private r;
275
-   simde_memcpy(&r, &v, sizeof(r));
276
-   return r;
277
-}
278
-
279
-SIMDE_FUNCTION_ATTRIBUTES
280
-simde__m128d simde__m128d_from_private(simde__m128d_private v)
281
-{
282
-   simde__m128d r;
283
-   simde_memcpy(&r, &v, sizeof(r));
284
-   return r;
285
-}
286
-
287
-SIMDE_FUNCTION_ATTRIBUTES
288
-simde__m128d_private simde__m128d_to_private(simde__m128d v)
289
-{
290
-   simde__m128d_private r;
291
-   simde_memcpy(&r, &v, sizeof(r));
292
-   return r;
293
-}
294
-
295
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
296
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
297
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
298
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
299
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
300
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
301
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
302
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
303
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
304
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
305
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
306
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
307
-#endif
308
-#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
309
-
310
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
311
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
312
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
313
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
314
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
315
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
316
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
317
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
318
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
319
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
320
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
321
-SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
322
-#endif
323
-#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
324
-
325
-SIMDE_FUNCTION_ATTRIBUTES
326
-simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
327
-{
328
-#if defined(SIMDE_X86_SSE2_NATIVE)
329
-   return _mm_add_epi8(a, b);
330
-#else
331
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
332
-                b_ = simde__m128i_to_private(b);
333
-
334
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
335
-   r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
336
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
337
-   r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
338
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
339
-   r_.i8 = a_.i8 + b_.i8;
340
-#else
341
-   SIMDE_VECTORIZE
342
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
343
-       r_.i8[i] = a_.i8[i] + b_.i8[i];
344
-   }
345
-#endif
346
-
347
-   return simde__m128i_from_private(r_);
348
-#endif
349
-}
350
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
351
-#define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
352
-#endif
353
-
354
-SIMDE_FUNCTION_ATTRIBUTES
355
-simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
356
-{
357
-#if defined(SIMDE_X86_SSE2_NATIVE)
358
-   return _mm_add_epi16(a, b);
359
-#else
360
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
361
-                b_ = simde__m128i_to_private(b);
362
-
363
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
364
-   r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
365
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
366
-   r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
367
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
368
-   r_.i16 = a_.i16 + b_.i16;
369
-#else
370
-   SIMDE_VECTORIZE
371
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
372
-       r_.i16[i] = a_.i16[i] + b_.i16[i];
373
-   }
374
-#endif
375
-
376
-   return simde__m128i_from_private(r_);
377
-#endif
378
-}
379
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
380
-#define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
381
-#endif
382
-
383
-SIMDE_FUNCTION_ATTRIBUTES
384
-simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
385
-{
386
-#if defined(SIMDE_X86_SSE2_NATIVE)
387
-   return _mm_add_epi32(a, b);
388
-#else
389
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
390
-                b_ = simde__m128i_to_private(b);
391
-
392
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
393
-   r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
394
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
395
-   r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
396
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
397
-   r_.i32 = a_.i32 + b_.i32;
398
-#else
399
-   SIMDE_VECTORIZE
400
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
401
-       r_.i32[i] = a_.i32[i] + b_.i32[i];
402
-   }
403
-#endif
404
-
405
-   return simde__m128i_from_private(r_);
406
-#endif
407
-}
408
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
409
-#define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
410
-#endif
411
-
412
-SIMDE_FUNCTION_ATTRIBUTES
413
-simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
414
-{
415
-#if defined(SIMDE_X86_SSE2_NATIVE)
416
-   return _mm_add_epi64(a, b);
417
-#else
418
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
419
-                b_ = simde__m128i_to_private(b);
420
-
421
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
422
-   r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
423
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
424
-   r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
425
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
426
-   r_.i64 = a_.i64 + b_.i64;
427
-#else
428
-   SIMDE_VECTORIZE
429
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
430
-       r_.i64[i] = a_.i64[i] + b_.i64[i];
431
-   }
432
-#endif
433
-
434
-   return simde__m128i_from_private(r_);
435
-#endif
436
-}
437
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
438
-#define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
439
-#endif
440
-
441
-SIMDE_FUNCTION_ATTRIBUTES
442
-simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
443
-{
444
-#if defined(SIMDE_X86_SSE2_NATIVE)
445
-   return _mm_add_pd(a, b);
446
-#else
447
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
448
-                b_ = simde__m128d_to_private(b);
449
-
450
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
451
-   r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
452
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
453
-   r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
454
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
455
-   r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
456
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
457
-   r_.f64 = a_.f64 + b_.f64;
458
-#else
459
-   SIMDE_VECTORIZE
460
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
461
-       r_.f64[i] = a_.f64[i] + b_.f64[i];
462
-   }
463
-#endif
464
-
465
-   return simde__m128d_from_private(r_);
466
-#endif
467
-}
468
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
469
-#define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
470
-#endif
471
-
472
-SIMDE_FUNCTION_ATTRIBUTES
473
-simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
474
-{
475
-#if defined(SIMDE_X86_SSE2_NATIVE)
476
-   return _mm_move_sd(a, b);
477
-#else
478
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
479
-                b_ = simde__m128d_to_private(b);
480
-
481
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
482
-   r_.neon_f64 =
483
-       vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
484
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
485
-   SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
486
-   m = {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15};
487
-   r_.altivec_f64 = vec_perm(a_.altivec_f64, b_.altivec_f64, m);
488
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
489
-   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
490
-#else
491
-   r_.f64[0] = b_.f64[0];
492
-   r_.f64[1] = a_.f64[1];
493
-#endif
494
-
495
-   return simde__m128d_from_private(r_);
496
-#endif
497
-}
498
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
499
-#define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
500
-#endif
501
-
502
-SIMDE_FUNCTION_ATTRIBUTES
503
-simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
504
-{
505
-#if defined(SIMDE_X86_SSE2_NATIVE)
506
-   return _mm_add_sd(a, b);
507
-#else
508
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
509
-                b_ = simde__m128d_to_private(b);
510
-
511
-   r_.f64[0] = a_.f64[0] + b_.f64[0];
512
-   r_.f64[1] = a_.f64[1];
513
-
514
-#if defined(SIMDE_ASSUME_VECTORIZATION)
515
-   return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
516
-#else
517
-   r_.f64[0] = a_.f64[0] + b_.f64[0];
518
-   r_.f64[1] = a_.f64[1];
519
-#endif
520
-
521
-   return simde__m128d_from_private(r_);
522
-#endif
523
-}
524
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
525
-#define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
526
-#endif
527
-
528
-SIMDE_FUNCTION_ATTRIBUTES
529
-simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
530
-{
531
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
532
-   return _mm_add_si64(a, b);
533
-#else
534
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
535
-                  b_ = simde__m64_to_private(b);
536
-
537
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
538
-   r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
539
-#else
540
-   r_.i64[0] = a_.i64[0] + b_.i64[0];
541
-#endif
542
-
543
-   return simde__m64_from_private(r_);
544
-#endif
545
-}
546
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
547
-#define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
548
-#endif
549
-
550
-SIMDE_FUNCTION_ATTRIBUTES
551
-simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
552
-{
553
-#if defined(SIMDE_X86_SSE2_NATIVE)
554
-   return _mm_adds_epi8(a, b);
555
-#else
556
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
557
-                b_ = simde__m128i_to_private(b);
558
-
559
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
560
-   r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
561
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
562
-   r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
563
-#else
564
-   SIMDE_VECTORIZE
565
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
566
-       const int32_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.i8[i]) +
567
-                   HEDLEY_STATIC_CAST(int16_t, b_.i8[i]);
568
-       r_.i8[i] = HEDLEY_STATIC_CAST(
569
-           int8_t,
570
-           ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN)
571
-                     : INT8_MAX));
572
-   }
573
-#endif
574
-
575
-   return simde__m128i_from_private(r_);
576
-#endif
577
-}
578
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
579
-#define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
580
-#endif
581
-
582
-SIMDE_FUNCTION_ATTRIBUTES
583
-simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
584
-{
585
-#if defined(SIMDE_X86_SSE2_NATIVE)
586
-   return _mm_adds_epi16(a, b);
587
-#else
588
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
589
-                b_ = simde__m128i_to_private(b);
590
-
591
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
592
-   r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
593
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
594
-   r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
595
-#else
596
-   SIMDE_VECTORIZE
597
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
598
-       const int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) +
599
-                   HEDLEY_STATIC_CAST(int32_t, b_.i16[i]);
600
-       r_.i16[i] = HEDLEY_STATIC_CAST(
601
-           int16_t,
602
-           ((tmp < INT16_MAX)
603
-                ? ((tmp > INT16_MIN) ? tmp : INT16_MIN)
604
-                : INT16_MAX));
605
-   }
606
-#endif
607
-
608
-   return simde__m128i_from_private(r_);
609
-#endif
610
-}
611
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
612
-#define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
613
-#endif
614
-
615
-SIMDE_FUNCTION_ATTRIBUTES
616
-simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
617
-{
618
-#if defined(SIMDE_X86_SSE2_NATIVE)
619
-   return _mm_adds_epu8(a, b);
620
-#else
621
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
622
-                b_ = simde__m128i_to_private(b);
623
-
624
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
625
-   r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
626
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
627
-   r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
628
-#else
629
-   SIMDE_VECTORIZE
630
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
631
-       r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i])
632
-                  ? (a_.u8[i] + b_.u8[i])
633
-                  : UINT8_MAX;
634
-   }
635
-#endif
636
-
637
-   return simde__m128i_from_private(r_);
638
-#endif
639
-}
640
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
641
-#define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
642
-#endif
643
-
644
-SIMDE_FUNCTION_ATTRIBUTES
645
-simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
646
-{
647
-#if defined(SIMDE_X86_SSE2_NATIVE)
648
-   return _mm_adds_epu16(a, b);
649
-#else
650
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
651
-                b_ = simde__m128i_to_private(b);
652
-
653
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
654
-   r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
655
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
656
-   r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
657
-#else
658
-   SIMDE_VECTORIZE
659
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
660
-       r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i])
661
-                   ? (a_.u16[i] + b_.u16[i])
662
-                   : UINT16_MAX;
663
-   }
664
-#endif
665
-
666
-   return simde__m128i_from_private(r_);
667
-#endif
668
-}
669
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
670
-#define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
671
-#endif
672
-
673
-SIMDE_FUNCTION_ATTRIBUTES
674
-simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
675
-{
676
-#if defined(SIMDE_X86_SSE2_NATIVE)
677
-   return _mm_and_pd(a, b);
678
-#else
679
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
680
-                b_ = simde__m128d_to_private(b);
681
-
682
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
683
-   r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
684
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
685
-   r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
686
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
687
-   r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
688
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
689
-   r_.i32f = a_.i32f & b_.i32f;
690
-#else
691
-   SIMDE_VECTORIZE
692
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
693
-       r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
694
-   }
695
-#endif
696
-
697
-   return simde__m128d_from_private(r_);
698
-#endif
699
-}
700
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
701
-#define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
702
-#endif
703
-
704
-SIMDE_FUNCTION_ATTRIBUTES
705
-simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
706
-{
707
-#if defined(SIMDE_X86_SSE2_NATIVE)
708
-   return _mm_and_si128(a, b);
709
-#else
710
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
711
-                b_ = simde__m128i_to_private(b);
712
-
713
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
714
-   r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
715
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
716
-   r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
717
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
718
-   r_.i32f = a_.i32f & b_.i32f;
719
-#else
720
-   SIMDE_VECTORIZE
721
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
722
-       r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
723
-   }
724
-#endif
725
-
726
-   return simde__m128i_from_private(r_);
727
-#endif
728
-}
729
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
730
-#define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
731
-#endif
732
-
733
-SIMDE_FUNCTION_ATTRIBUTES
734
-simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
735
-{
736
-#if defined(SIMDE_X86_SSE2_NATIVE)
737
-   return _mm_andnot_pd(a, b);
738
-#else
739
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
740
-                b_ = simde__m128d_to_private(b);
741
-
742
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
743
-   r_.neon_i32 = vbicq_s32(a_.neon_i32, b_.neon_i32);
744
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
745
-   r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
746
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
747
-   r_.altivec_i32f = vec_andc(a_.altivec_i32f, b_.altivec_i32f);
748
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
749
-   r_.i32f = ~a_.i32f & b_.i32f;
750
-#else
751
-   SIMDE_VECTORIZE
752
-   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
753
-       r_.u64[i] = ~a_.u64[i] & b_.u64[i];
754
-   }
755
-#endif
756
-
757
-   return simde__m128d_from_private(r_);
758
-#endif
759
-}
760
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
761
-#define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
762
-#endif
763
-
764
-SIMDE_FUNCTION_ATTRIBUTES
765
-simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
766
-{
767
-#if defined(SIMDE_X86_SSE2_NATIVE)
768
-   return _mm_andnot_si128(a, b);
769
-#else
770
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
771
-                b_ = simde__m128i_to_private(b);
772
-
773
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
774
-   r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
775
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
776
-   r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
777
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
778
-   r_.i32f = ~a_.i32f & b_.i32f;
779
-#else
780
-   SIMDE_VECTORIZE
781
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
782
-       r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
783
-   }
784
-#endif
785
-
786
-   return simde__m128i_from_private(r_);
787
-#endif
788
-}
789
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
790
-#define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
791
-#endif
792
-
793
-SIMDE_FUNCTION_ATTRIBUTES
794
-simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
795
-{
796
-#if defined(SIMDE_X86_SSE2_NATIVE)
797
-   return _mm_avg_epu8(a, b);
798
-#else
799
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
800
-                b_ = simde__m128i_to_private(b);
801
-
802
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
803
-   r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
804
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
805
-   r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
806
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
807
-   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
808
-   defined(SIMDE_CONVERT_VECTOR_)
809
-   uint16_t wa SIMDE_VECTOR(32);
810
-   uint16_t wb SIMDE_VECTOR(32);
811
-   uint16_t wr SIMDE_VECTOR(32);
812
-   SIMDE_CONVERT_VECTOR_(wa, a_.u8);
813
-   SIMDE_CONVERT_VECTOR_(wb, b_.u8);
814
-   wr = (wa + wb + 1) >> 1;
815
-   SIMDE_CONVERT_VECTOR_(r_.u8, wr);
816
-#else
817
-   SIMDE_VECTORIZE
818
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
819
-       r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
820
-   }
821
-#endif
822
-
823
-   return simde__m128i_from_private(r_);
824
-#endif
825
-}
826
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
827
-#define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
828
-#endif
829
-
830
-SIMDE_FUNCTION_ATTRIBUTES
831
-simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
832
-{
833
-#if defined(SIMDE_X86_SSE2_NATIVE)
834
-   return _mm_avg_epu16(a, b);
835
-#else
836
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
837
-                b_ = simde__m128i_to_private(b);
838
-
839
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
840
-   r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
841
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
842
-   r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
843
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
844
-   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
845
-   defined(SIMDE_CONVERT_VECTOR_)
846
-   uint32_t wa SIMDE_VECTOR(32);
847
-   uint32_t wb SIMDE_VECTOR(32);
848
-   uint32_t wr SIMDE_VECTOR(32);
849
-   SIMDE_CONVERT_VECTOR_(wa, a_.u16);
850
-   SIMDE_CONVERT_VECTOR_(wb, b_.u16);
851
-   wr = (wa + wb + 1) >> 1;
852
-   SIMDE_CONVERT_VECTOR_(r_.u16, wr);
853
-#else
854
-   SIMDE_VECTORIZE
855
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
856
-       r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
857
-   }
858
-#endif
859
-
860
-   return simde__m128i_from_private(r_);
861
-#endif
862
-}
863
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
864
-#define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
865
-#endif
866
-
867
-SIMDE_FUNCTION_ATTRIBUTES
868
-simde__m128i simde_mm_setzero_si128(void)
869
-{
870
-#if defined(SIMDE_X86_SSE2_NATIVE)
871
-   return _mm_setzero_si128();
872
-#else
873
-   simde__m128i_private r_;
874
-
875
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
876
-   r_.neon_i32 = vdupq_n_s32(0);
877
-#else
878
-   SIMDE_VECTORIZE
879
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
880
-       r_.i32f[i] = 0;
881
-   }
882
-#endif
883
-
884
-   return simde__m128i_from_private(r_);
885
-#endif
886
-}
887
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
888
-#define _mm_setzero_si128() (simde_mm_setzero_si128())
889
-#endif
890
-
891
-SIMDE_FUNCTION_ATTRIBUTES
892
-simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
893
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
894
-{
895
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
896
-
897
-   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
898
-       return simde_mm_setzero_si128();
899
-   }
900
-
901
-#if defined(SIMDE_HAVE_INT128_) && defined(__BYTE_ORDER__) && \
902
-   (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && 0
903
-   r_.u128[0] = a_.u128[0] << s;
904
-#else
905
-   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
906
-   for (int i = imm8;
907
-        i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0]));
908
-        i++) {
909
-       r_.i8[i] = a_.i8[i - imm8];
910
-   }
911
-#endif
912
-
913
-   return simde__m128i_from_private(r_);
914
-}
915
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
916
-#define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
917
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
918
-#define simde_mm_bslli_si128(a, imm8)                                      \
919
-   simde__m128i_from_neon_i8(                                         \
920
-       ((imm8) <= 0)                                              \
921
-           ? simde__m128i_to_neon_i8(a)                       \
922
-           : (((imm8) > 15)                                   \
923
-                  ? (vdupq_n_s8(0))                       \
924
-                  : (vextq_s8(vdupq_n_s8(0),              \
925
-                          simde__m128i_to_neon_i8(a), \
926
-                          16 - (imm8)))))
927
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
928
-#define simde_mm_bslli_si128(a, imm8)                                          \
929
-   (__extension__({                                                       \
930
-       const simde__m128i_private simde__tmp_a_ =                     \
931
-           simde__m128i_to_private(a);                            \
932
-       const simde__m128i_private simde__tmp_z_ =                     \
933
-           simde__m128i_to_private(simde_mm_setzero_si128());     \
934
-       simde__m128i_private simde__tmp_r_;                            \
935
-       if (HEDLEY_UNLIKELY(imm8 > 15)) {                              \
936
-           simde__tmp_r_ = simde__m128i_to_private(               \
937
-               simde_mm_setzero_si128());                     \
938
-       } else {                                                       \
939
-           simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_(              \
940
-               8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8,   \
941
-               HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31),  \
942
-               HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31),  \
943
-               HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31),  \
944
-               HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31),  \
945
-               HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31),  \
946
-               HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31),  \
947
-               HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31),  \
948
-               HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31),  \
949
-               HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31),  \
950
-               HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31),  \
951
-               HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31),  \
952
-               HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31),  \
953
-               HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31),  \
954
-               HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31),  \
955
-               HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31),  \
956
-               HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
957
-       }                                                              \
958
-       simde__m128i_from_private(simde__tmp_r_);                      \
959
-   }))
960
-#endif
961
-#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
962
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
963
-#define _mm_bslli_si128(a, b) simde_mm_bslli_si128(a, b)
964
-#define _mm_slli_si128(a, b) simde_mm_bslli_si128(a, b)
965
-#endif
966
-
967
-SIMDE_FUNCTION_ATTRIBUTES
968
-simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
969
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
970
-{
971
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
972
-
973
-   SIMDE_VECTORIZE
974
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
975
-       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
976
-       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
977
-   }
978
-
979
-   return simde__m128i_from_private(r_);
980
-}
981
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
982
-#define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
983
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
984
-#define simde_mm_bsrli_si128(a, imm8)                                   \
985
-   simde__m128i_from_neon_i8(                                      \
986
-       ((imm8 < 0) || (imm8 > 15))                             \
987
-           ? vdupq_n_s8(0)                                 \
988
-           : (vextq_s8(simde__m128i_to_private(a).neon_i8, \
989
-                   vdupq_n_s8(0),                      \
990
-                   ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
991
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
992
-#define simde_mm_bsrli_si128(a, imm8)                                          \
993
-   (__extension__({                                                       \
994
-       const simde__m128i_private simde__tmp_a_ =                     \
995
-           simde__m128i_to_private(a);                            \
996
-       const simde__m128i_private simde__tmp_z_ =                     \
997
-           simde__m128i_to_private(simde_mm_setzero_si128());     \
998
-       simde__m128i_private simde__tmp_r_ =                           \
999
-           simde__m128i_to_private(a);                            \
1000
-       if (HEDLEY_UNLIKELY(imm8 > 15)) {                              \
1001
-           simde__tmp_r_ = simde__m128i_to_private(               \
1002
-               simde_mm_setzero_si128());                     \
1003
-       } else {                                                       \
1004
-           simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_(              \
1005
-               8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8,   \
1006
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31),  \
1007
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31),  \
1008
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31),  \
1009
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31),  \
1010
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31),  \
1011
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31),  \
1012
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31),  \
1013
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31),  \
1014
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31),  \
1015
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31),  \
1016
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31),  \
1017
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31),  \
1018
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31),  \
1019
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31),  \
1020
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31),  \
1021
-               HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
1022
-       }                                                              \
1023
-       simde__m128i_from_private(simde__tmp_r_);                      \
1024
-   }))
1025
-#endif
1026
-#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1027
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1028
-#define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1029
-#define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1030
-#endif
1031
-
1032
-SIMDE_FUNCTION_ATTRIBUTES
1033
-void simde_mm_clflush(void const *p)
1034
-{
1035
-#if defined(SIMDE_X86_SSE2_NATIVE)
1036
-   _mm_clflush(p);
1037
-#else
1038
-   (void)p;
1039
-#endif
1040
-}
1041
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1042
-#define _mm_clflush(a, b) simde_mm_clflush()
1043
-#endif
1044
-
1045
-SIMDE_FUNCTION_ATTRIBUTES
1046
-int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
1047
-{
1048
-#if defined(SIMDE_X86_SSE2_NATIVE)
1049
-   return _mm_comieq_sd(a, b);
1050
-#else
1051
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1052
-                b_ = simde__m128d_to_private(b);
1053
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1054
-   return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1055
-#else
1056
-   return a_.f64[0] == b_.f64[0];
1057
-#endif
1058
-#endif
1059
-}
1060
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1061
-#define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1062
-#endif
1063
-
1064
-SIMDE_FUNCTION_ATTRIBUTES
1065
-int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
1066
-{
1067
-#if defined(SIMDE_X86_SSE2_NATIVE)
1068
-   return _mm_comige_sd(a, b);
1069
-#else
1070
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1071
-                b_ = simde__m128d_to_private(b);
1072
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1073
-   return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1074
-#else
1075
-   return a_.f64[0] >= b_.f64[0];
1076
-#endif
1077
-#endif
1078
-}
1079
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1080
-#define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1081
-#endif
1082
-
1083
-SIMDE_FUNCTION_ATTRIBUTES
1084
-int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
1085
-{
1086
-#if defined(SIMDE_X86_SSE2_NATIVE)
1087
-   return _mm_comigt_sd(a, b);
1088
-#else
1089
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1090
-                b_ = simde__m128d_to_private(b);
1091
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1092
-   return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1093
-#else
1094
-   return a_.f64[0] > b_.f64[0];
1095
-#endif
1096
-#endif
1097
-}
1098
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1099
-#define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1100
-#endif
1101
-
1102
-SIMDE_FUNCTION_ATTRIBUTES
1103
-int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
1104
-{
1105
-#if defined(SIMDE_X86_SSE2_NATIVE)
1106
-   return _mm_comile_sd(a, b);
1107
-#else
1108
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1109
-                b_ = simde__m128d_to_private(b);
1110
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1111
-   return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1112
-#else
1113
-   return a_.f64[0] <= b_.f64[0];
1114
-#endif
1115
-#endif
1116
-}
1117
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1118
-#define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1119
-#endif
1120
-
1121
-SIMDE_FUNCTION_ATTRIBUTES
1122
-int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
1123
-{
1124
-#if defined(SIMDE_X86_SSE2_NATIVE)
1125
-   return _mm_comilt_sd(a, b);
1126
-#else
1127
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1128
-                b_ = simde__m128d_to_private(b);
1129
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1130
-   return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1131
-#else
1132
-   return a_.f64[0] < b_.f64[0];
1133
-#endif
1134
-#endif
1135
-}
1136
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1137
-#define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1138
-#endif
1139
-
1140
-SIMDE_FUNCTION_ATTRIBUTES
1141
-int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
1142
-{
1143
-#if defined(SIMDE_X86_SSE2_NATIVE)
1144
-   return _mm_comineq_sd(a, b);
1145
-#else
1146
-   simde__m128d_private a_ = simde__m128d_to_private(a),
1147
-                b_ = simde__m128d_to_private(b);
1148
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1149
-   return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1150
-#else
1151
-   return a_.f64[0] != b_.f64[0];
1152
-#endif
1153
-#endif
1154
-}
1155
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1156
-#define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1157
-#endif
1158
-
1159
-SIMDE_FUNCTION_ATTRIBUTES
1160
-simde__m128 simde_mm_castpd_ps(simde__m128d a)
1161
-{
1162
-#if defined(SIMDE_X86_SSE2_NATIVE)
1163
-   return _mm_castpd_ps(a);
1164
-#else
1165
-   simde__m128 r;
1166
-   simde_memcpy(&r, &a, sizeof(a));
1167
-   return r;
1168
-#endif
1169
-}
1170
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1171
-#define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1172
-#endif
1173
-
1174
-SIMDE_FUNCTION_ATTRIBUTES
1175
-simde__m128i simde_mm_castpd_si128(simde__m128d a)
1176
-{
1177
-#if defined(SIMDE_X86_SSE2_NATIVE)
1178
-   return _mm_castpd_si128(a);
1179
-#else
1180
-   simde__m128i r;
1181
-   simde_memcpy(&r, &a, sizeof(a));
1182
-   return r;
1183
-#endif
1184
-}
1185
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1186
-#define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1187
-#endif
1188
-
1189
-SIMDE_FUNCTION_ATTRIBUTES
1190
-simde__m128d simde_mm_castps_pd(simde__m128 a)
1191
-{
1192
-#if defined(SIMDE_X86_SSE2_NATIVE)
1193
-   return _mm_castps_pd(a);
1194
-#else
1195
-   simde__m128d r;
1196
-   simde_memcpy(&r, &a, sizeof(a));
1197
-   return r;
1198
-#endif
1199
-}
1200
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1201
-#define _mm_castps_pd(a) simde_mm_castps_pd(a)
1202
-#endif
1203
-
1204
-SIMDE_FUNCTION_ATTRIBUTES
1205
-simde__m128i simde_mm_castps_si128(simde__m128 a)
1206
-{
1207
-#if defined(SIMDE_X86_SSE2_NATIVE)
1208
-   return _mm_castps_si128(a);
1209
-#else
1210
-   simde__m128i r;
1211
-   simde_memcpy(&r, &a, sizeof(a));
1212
-   return r;
1213
-#endif
1214
-}
1215
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1216
-#define _mm_castps_si128(a) simde_mm_castps_si128(a)
1217
-#endif
1218
-
1219
-SIMDE_FUNCTION_ATTRIBUTES
1220
-simde__m128d simde_mm_castsi128_pd(simde__m128i a)
1221
-{
1222
-#if defined(SIMDE_X86_SSE2_NATIVE)
1223
-   return _mm_castsi128_pd(a);
1224
-#else
1225
-   simde__m128d r;
1226
-   simde_memcpy(&r, &a, sizeof(a));
1227
-   return r;
1228
-#endif
1229
-}
1230
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1231
-#define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1232
-#endif
1233
-
1234
-SIMDE_FUNCTION_ATTRIBUTES
1235
-simde__m128 simde_mm_castsi128_ps(simde__m128i a)
1236
-{
1237
-#if defined(SIMDE_X86_SSE2_NATIVE)
1238
-   return _mm_castsi128_ps(a);
1239
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1240
-   return a;
1241
-#else
1242
-   simde__m128 r;
1243
-   simde_memcpy(&r, &a, sizeof(a));
1244
-   return r;
1245
-#endif
1246
-}
1247
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1248
-#define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1249
-#endif
1250
-
1251
-SIMDE_FUNCTION_ATTRIBUTES
1252
-simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
1253
-{
1254
-#if defined(SIMDE_X86_SSE2_NATIVE)
1255
-   return _mm_cmpeq_epi8(a, b);
1256
-#else
1257
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1258
-                b_ = simde__m128i_to_private(b);
1259
-
1260
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1261
-   r_.neon_i8 = vreinterpretq_s8_u8(vceqq_s8(b_.neon_i8, a_.neon_i8));
1262
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1263
-   r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1264
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1265
-   r_.altivec_i8 = (SIMDE_POWER_ALTIVEC_VECTOR(signed char))vec_cmpeq(
1266
-       a_.altivec_i8, b_.altivec_i8);
1267
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1268
-   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1269
-#else
1270
-   SIMDE_VECTORIZE
1271
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1272
-       r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1273
-   }
1274
-#endif
1275
-
1276
-   return simde__m128i_from_private(r_);
1277
-#endif
1278
-}
1279
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1280
-#define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1281
-#endif
1282
-
1283
-SIMDE_FUNCTION_ATTRIBUTES
1284
-simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
1285
-{
1286
-#if defined(SIMDE_X86_SSE2_NATIVE)
1287
-   return _mm_cmpeq_epi16(a, b);
1288
-#else
1289
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1290
-                b_ = simde__m128i_to_private(b);
1291
-
1292
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1293
-   r_.neon_i16 =
1294
-       vreinterpretq_s16_u16(vceqq_s16(b_.neon_i16, a_.neon_i16));
1295
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1296
-   r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1297
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1298
-   r_.altivec_i16 = (SIMDE_POWER_ALTIVEC_VECTOR(signed short))vec_cmpeq(
1299
-       a_.altivec_i16, b_.altivec_i16);
1300
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1301
-   r_.i16 = (a_.i16 == b_.i16);
1302
-#else
1303
-   SIMDE_VECTORIZE
1304
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1305
-       r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1306
-   }
1307
-#endif
1308
-
1309
-   return simde__m128i_from_private(r_);
1310
-#endif
1311
-}
1312
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1313
-#define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1314
-#endif
1315
-
1316
-SIMDE_FUNCTION_ATTRIBUTES
1317
-simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
1318
-{
1319
-#if defined(SIMDE_X86_SSE2_NATIVE)
1320
-   return _mm_cmpeq_epi32(a, b);
1321
-#else
1322
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1323
-                b_ = simde__m128i_to_private(b);
1324
-
1325
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1326
-   r_.neon_i32 =
1327
-       vreinterpretq_s32_u32(vceqq_s32(b_.neon_i32, a_.neon_i32));
1328
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1329
-   r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1330
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1331
-   r_.altivec_i32 = (SIMDE_POWER_ALTIVEC_VECTOR(signed int))vec_cmpeq(
1332
-       a_.altivec_i32, b_.altivec_i32);
1333
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1334
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1335
-#else
1336
-   SIMDE_VECTORIZE
1337
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1338
-       r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1339
-   }
1340
-#endif
1341
-
1342
-   return simde__m128i_from_private(r_);
1343
-#endif
1344
-}
1345
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1346
-#define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1347
-#endif
1348
-
1349
-SIMDE_FUNCTION_ATTRIBUTES
1350
-simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
1351
-{
1352
-#if defined(SIMDE_X86_SSE2_NATIVE)
1353
-   return _mm_cmpeq_pd(a, b);
1354
-#else
1355
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1356
-                b_ = simde__m128d_to_private(b);
1357
-
1358
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1359
-   r_.neon_i64 = vreinterpretq_s64_u64(
1360
-       vceqq_s64(vreinterpretq_s64_f64(b_.neon_f64),
1361
-             vreinterpretq_s64_f64(a_.neon_f64)));
1362
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1363
-   r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1364
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1365
-   r_.altivec_f64 = (SIMDE_POWER_ALTIVEC_VECTOR(double))vec_cmpeq(
1366
-       a_.altivec_f64, b_.altivec_f64);
1367
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1368
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1369
-#else
1370
-   SIMDE_VECTORIZE
1371
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1372
-       r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0)
1373
-                            : UINT64_C(0);
1374
-   }
1375
-#endif
1376
-
1377
-   return simde__m128d_from_private(r_);
1378
-#endif
1379
-}
1380
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1381
-#define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1382
-#endif
1383
-
1384
-SIMDE_FUNCTION_ATTRIBUTES
1385
-simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
1386
-{
1387
-#if defined(SIMDE_X86_SSE2_NATIVE)
1388
-   return _mm_cmpeq_sd(a, b);
1389
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1390
-   return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1391
-#else
1392
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1393
-                b_ = simde__m128d_to_private(b);
1394
-
1395
-   r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1396
-   r_.u64[1] = a_.u64[1];
1397
-
1398
-   return simde__m128d_from_private(r_);
1399
-#endif
1400
-}
1401
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1402
-#define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1403
-#endif
1404
-
1405
-SIMDE_FUNCTION_ATTRIBUTES
1406
-simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
1407
-{
1408
-#if defined(SIMDE_X86_SSE2_NATIVE)
1409
-   return _mm_cmpneq_pd(a, b);
1410
-#else
1411
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1412
-                b_ = simde__m128d_to_private(b);
1413
-
1414
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1415
-   r_.neon_f32 = vreinterpretq_f32_u16(
1416
-       vmvnq_u16(vceqq_s16(b_.neon_i16, a_.neon_i16)));
1417
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1418
-   r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1419
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1420
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1421
-#else
1422
-   SIMDE_VECTORIZE
1423
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1424
-       r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0)
1425
-                            : UINT64_C(0);
1426
-   }
1427
-#endif
1428
-
1429
-   return simde__m128d_from_private(r_);
1430
-#endif
1431
-}
1432
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1433
-#define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1434
-#endif
1435
-
1436
-SIMDE_FUNCTION_ATTRIBUTES
1437
-simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
1438
-{
1439
-#if defined(SIMDE_X86_SSE2_NATIVE)
1440
-   return _mm_cmpneq_sd(a, b);
1441
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1442
-   return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1443
-#else
1444
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1445
-                b_ = simde__m128d_to_private(b);
1446
-
1447
-   r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1448
-   r_.u64[1] = a_.u64[1];
1449
-
1450
-   return simde__m128d_from_private(r_);
1451
-#endif
1452
-}
1453
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1454
-#define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1455
-#endif
1456
-
1457
-SIMDE_FUNCTION_ATTRIBUTES
1458
-simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
1459
-{
1460
-#if defined(SIMDE_X86_SSE2_NATIVE)
1461
-   return _mm_cmplt_epi8(a, b);
1462
-#else
1463
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1464
-                b_ = simde__m128i_to_private(b);
1465
-
1466
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1467
-   r_.neon_i8 = vreinterpretq_s8_u8(vcltq_s8(a_.neon_i8, b_.neon_i8));
1468
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1469
-   r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(
1470
-       SIMDE_POWER_ALTIVEC_VECTOR(signed char),
1471
-       vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1472
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1473
-   r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1474
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1475
-   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1476
-#else
1477
-   SIMDE_VECTORIZE
1478
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1479
-       r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1480
-   }
1481
-#endif
1482
-
1483
-   return simde__m128i_from_private(r_);
1484
-#endif
1485
-}
1486
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1487
-#define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1488
-#endif
1489
-
1490
-SIMDE_FUNCTION_ATTRIBUTES
1491
-simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
1492
-{
1493
-#if defined(SIMDE_X86_SSE2_NATIVE)
1494
-   return _mm_cmplt_epi16(a, b);
1495
-#else
1496
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1497
-                b_ = simde__m128i_to_private(b);
1498
-
1499
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1500
-   r_.neon_i16 =
1501
-       vreinterpretq_s16_u16(vcltq_s16(a_.neon_i16, b_.neon_i16));
1502
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1503
-   r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(
1504
-       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
1505
-       vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1506
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1507
-   r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1508
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1509
-   r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1510
-#else
1511
-   SIMDE_VECTORIZE
1512
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1513
-       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1514
-   }
1515
-#endif
1516
-
1517
-   return simde__m128i_from_private(r_);
1518
-#endif
1519
-}
1520
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1521
-#define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1522
-#endif
1523
-
1524
-SIMDE_FUNCTION_ATTRIBUTES
1525
-simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
1526
-{
1527
-#if defined(SIMDE_X86_SSE2_NATIVE)
1528
-   return _mm_cmplt_epi32(a, b);
1529
-#else
1530
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1531
-                b_ = simde__m128i_to_private(b);
1532
-
1533
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1534
-   r_.neon_i32 =
1535
-       vreinterpretq_s32_u32(vcltq_s32(a_.neon_i32, b_.neon_i32));
1536
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1537
-   r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(
1538
-       SIMDE_POWER_ALTIVEC_VECTOR(signed int),
1539
-       vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1540
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1541
-   r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1542
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1543
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1544
-#else
1545
-   SIMDE_VECTORIZE
1546
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1547
-       r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1548
-   }
1549
-#endif
1550
-
1551
-   return simde__m128i_from_private(r_);
1552
-#endif
1553
-}
1554
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1555
-#define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1556
-#endif
1557
-
1558
-SIMDE_FUNCTION_ATTRIBUTES
1559
-simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
1560
-{
1561
-#if defined(SIMDE_X86_SSE2_NATIVE)
1562
-   return _mm_cmplt_pd(a, b);
1563
-#else
1564
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1565
-                b_ = simde__m128d_to_private(b);
1566
-
1567
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1568
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1569
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1570
-   r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1571
-#else
1572
-   SIMDE_VECTORIZE
1573
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1574
-       r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0)
1575
-                           : UINT64_C(0);
1576
-   }
1577
-#endif
1578
-
1579
-   return simde__m128d_from_private(r_);
1580
-#endif
1581
-}
1582
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1583
-#define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1584
-#endif
1585
-
1586
-SIMDE_FUNCTION_ATTRIBUTES
1587
-simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
1588
-{
1589
-#if defined(SIMDE_X86_SSE2_NATIVE)
1590
-   return _mm_cmplt_sd(a, b);
1591
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1592
-   return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1593
-#else
1594
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1595
-                b_ = simde__m128d_to_private(b);
1596
-
1597
-   r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1598
-   r_.u64[1] = a_.u64[1];
1599
-
1600
-   return simde__m128d_from_private(r_);
1601
-#endif
1602
-}
1603
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1604
-#define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1605
-#endif
1606
-
1607
-SIMDE_FUNCTION_ATTRIBUTES
1608
-simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
1609
-{
1610
-#if defined(SIMDE_X86_SSE2_NATIVE)
1611
-   return _mm_cmple_pd(a, b);
1612
-#else
1613
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1614
-                b_ = simde__m128d_to_private(b);
1615
-
1616
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1617
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1618
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1619
-   r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1620
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1621
-   r_.altivec_f64 = (SIMDE_POWER_ALTIVEC_VECTOR(double))vec_cmple(
1622
-       a_.altivec_f64, b_.altivec_f64);
1623
-#else
1624
-   SIMDE_VECTORIZE
1625
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1626
-       r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0)
1627
-                            : UINT64_C(0);
1628
-   }
1629
-#endif
1630
-
1631
-   return simde__m128d_from_private(r_);
1632
-#endif
1633
-}
1634
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1635
-#define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
1636
-#endif
1637
-
1638
-SIMDE_FUNCTION_ATTRIBUTES
1639
-simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
1640
-{
1641
-#if defined(SIMDE_X86_SSE2_NATIVE)
1642
-   return _mm_cmple_sd(a, b);
1643
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1644
-   return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
1645
-#else
1646
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1647
-                b_ = simde__m128d_to_private(b);
1648
-
1649
-   r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1650
-   r_.u64[1] = a_.u64[1];
1651
-
1652
-   return simde__m128d_from_private(r_);
1653
-#endif
1654
-}
1655
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1656
-#define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
1657
-#endif
1658
-
1659
-SIMDE_FUNCTION_ATTRIBUTES
1660
-simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
1661
-{
1662
-#if defined(SIMDE_X86_SSE2_NATIVE)
1663
-   return _mm_cmpgt_epi8(a, b);
1664
-#else
1665
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1666
-                b_ = simde__m128i_to_private(b);
1667
-
1668
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1669
-   r_.neon_i8 = vreinterpretq_s8_u8(vcgtq_s8(a_.neon_i8, b_.neon_i8));
1670
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1671
-   r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
1672
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1673
-   r_.altivec_i8 = (SIMDE_POWER_ALTIVEC_VECTOR(signed char))vec_cmpgt(
1674
-       a_.altivec_i8, b_.altivec_i8);
1675
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1676
-   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
1677
-#else
1678
-   SIMDE_VECTORIZE
1679
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1680
-       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1681
-   }
1682
-#endif
1683
-
1684
-   return simde__m128i_from_private(r_);
1685
-#endif
1686
-}
1687
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1688
-#define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
1689
-#endif
1690
-
1691
-SIMDE_FUNCTION_ATTRIBUTES
1692
-simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
1693
-{
1694
-#if defined(SIMDE_X86_SSE2_NATIVE)
1695
-   return _mm_cmpgt_epi16(a, b);
1696
-#else
1697
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1698
-                b_ = simde__m128i_to_private(b);
1699
-
1700
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1701
-   r_.neon_i16 =
1702
-       vreinterpretq_s16_u16(vcgtq_s16(a_.neon_i16, b_.neon_i16));
1703
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1704
-   r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
1705
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1706
-   r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(
1707
-       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
1708
-       vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
1709
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1710
-   r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
1711
-#else
1712
-   SIMDE_VECTORIZE
1713
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1714
-       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1715
-   }
1716
-#endif
1717
-
1718
-   return simde__m128i_from_private(r_);
1719
-#endif
1720
-}
1721
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1722
-#define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
1723
-#endif
1724
-
1725
-SIMDE_FUNCTION_ATTRIBUTES
1726
-simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
1727
-{
1728
-#if defined(SIMDE_X86_SSE2_NATIVE)
1729
-   return _mm_cmpgt_epi32(a, b);
1730
-#else
1731
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1732
-                b_ = simde__m128i_to_private(b);
1733
-
1734
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1735
-   r_.neon_i32 =
1736
-       vreinterpretq_s32_u32(vcgtq_s32(a_.neon_i32, b_.neon_i32));
1737
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1738
-   r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
1739
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1740
-   r_.altivec_i32 = (SIMDE_POWER_ALTIVEC_VECTOR(signed int))vec_cmpgt(
1741
-       a_.altivec_i32, b_.altivec_i32);
1742
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1743
-   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
1744
-#else
1745
-   SIMDE_VECTORIZE
1746
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1747
-       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1748
-   }
1749
-#endif
1750
-
1751
-   return simde__m128i_from_private(r_);
1752
-#endif
1753
-}
1754
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1755
-#define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
1756
-#endif
1757
-
1758
-SIMDE_FUNCTION_ATTRIBUTES
1759
-simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
1760
-{
1761
-#if defined(SIMDE_X86_SSE2_NATIVE)
1762
-   return _mm_cmpgt_pd(a, b);
1763
-#else
1764
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1765
-                b_ = simde__m128d_to_private(b);
1766
-
1767
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1768
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
1769
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1770
-   r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
1771
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1772
-   r_.altivec_f64 =
1773
-       HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double),
1774
-                  vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
1775
-#else
1776
-   SIMDE_VECTORIZE
1777
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1778
-       r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0)
1779
-                           : UINT64_C(0);
1780
-   }
1781
-#endif
1782
-
1783
-   return simde__m128d_from_private(r_);
1784
-#endif
1785
-}
1786
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1787
-#define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
1788
-#endif
1789
-
1790
-SIMDE_FUNCTION_ATTRIBUTES
1791
-simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
1792
-{
1793
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1794
-   return _mm_cmpgt_sd(a, b);
1795
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1796
-   return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
1797
-#else
1798
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1799
-                b_ = simde__m128d_to_private(b);
1800
-
1801
-   r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1802
-   r_.u64[1] = a_.u64[1];
1803
-
1804
-   return simde__m128d_from_private(r_);
1805
-#endif
1806
-}
1807
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1808
-#define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
1809
-#endif
1810
-
1811
-SIMDE_FUNCTION_ATTRIBUTES
1812
-simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
1813
-{
1814
-#if defined(SIMDE_X86_SSE2_NATIVE)
1815
-   return _mm_cmpge_pd(a, b);
1816
-#else
1817
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1818
-                b_ = simde__m128d_to_private(b);
1819
-
1820
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1821
-   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
1822
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1823
-   r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
1824
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
1825
-   r_.altivec_f64 =
1826
-       HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double),
1827
-                  vec_cmpge(a_.altivec_f64, b_.altivec_f64));
1828
-#else
1829
-   SIMDE_VECTORIZE
1830
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1831
-       r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0)
1832
-                            : UINT64_C(0);
1833
-   }
1834
-#endif
1835
-
1836
-   return simde__m128d_from_private(r_);
1837
-#endif
1838
-}
1839
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1840
-#define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
1841
-#endif
1842
-
1843
-SIMDE_FUNCTION_ATTRIBUTES
1844
-simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
1845
-{
1846
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1847
-   return _mm_cmpge_sd(a, b);
1848
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1849
-   return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
1850
-#else
1851
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1852
-                b_ = simde__m128d_to_private(b);
1853
-
1854
-   r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1855
-   r_.u64[1] = a_.u64[1];
1856
-
1857
-   return simde__m128d_from_private(r_);
1858
-#endif
1859
-}
1860
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1861
-#define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
1862
-#endif
1863
-
1864
-SIMDE_FUNCTION_ATTRIBUTES
1865
-simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
1866
-{
1867
-#if defined(SIMDE_X86_SSE2_NATIVE)
1868
-   return _mm_cmpnge_pd(a, b);
1869
-#else
1870
-   return simde_mm_cmplt_pd(a, b);
1871
-#endif
1872
-}
1873
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1874
-#define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
1875
-#endif
1876
-
1877
-SIMDE_FUNCTION_ATTRIBUTES
1878
-simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
1879
-{
1880
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1881
-   return _mm_cmpnge_sd(a, b);
1882
-#else
1883
-   return simde_mm_cmplt_sd(a, b);
1884
-#endif
1885
-}
1886
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1887
-#define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
1888
-#endif
1889
-
1890
-SIMDE_FUNCTION_ATTRIBUTES
1891
-simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
1892
-{
1893
-#if defined(SIMDE_X86_SSE2_NATIVE)
1894
-   return _mm_cmpnlt_pd(a, b);
1895
-#else
1896
-   return simde_mm_cmpge_pd(a, b);
1897
-#endif
1898
-}
1899
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1900
-#define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
1901
-#endif
1902
-
1903
-SIMDE_FUNCTION_ATTRIBUTES
1904
-simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
1905
-{
1906
-#if defined(SIMDE_X86_SSE2_NATIVE)
1907
-   return _mm_cmpnlt_sd(a, b);
1908
-#else
1909
-   return simde_mm_cmpge_sd(a, b);
1910
-#endif
1911
-}
1912
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1913
-#define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
1914
-#endif
1915
-
1916
-SIMDE_FUNCTION_ATTRIBUTES
1917
-simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
1918
-{
1919
-#if defined(SIMDE_X86_SSE2_NATIVE)
1920
-   return _mm_cmpnle_pd(a, b);
1921
-#else
1922
-   return simde_mm_cmpgt_pd(a, b);
1923
-#endif
1924
-}
1925
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1926
-#define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
1927
-#endif
1928
-
1929
-SIMDE_FUNCTION_ATTRIBUTES
1930
-simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
1931
-{
1932
-#if defined(SIMDE_X86_SSE2_NATIVE)
1933
-   return _mm_cmpnle_sd(a, b);
1934
-#else
1935
-   return simde_mm_cmpgt_sd(a, b);
1936
-#endif
1937
-}
1938
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1939
-#define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
1940
-#endif
1941
-
1942
-SIMDE_FUNCTION_ATTRIBUTES
1943
-simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
1944
-{
1945
-#if defined(SIMDE_X86_SSE2_NATIVE)
1946
-   return _mm_cmpord_pd(a, b);
1947
-#else
1948
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1949
-                b_ = simde__m128d_to_private(b);
1950
-
1951
-#if defined(simde_math_isnan)
1952
-   SIMDE_VECTORIZE
1953
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1954
-       r_.u64[i] = (!simde_math_isnan(a_.f64[i]) &&
1955
-                !simde_math_isnan(b_.f64[i]))
1956
-                   ? ~UINT64_C(0)
1957
-                   : UINT64_C(0);
1958
-   }
1959
-#else
1960
-   HEDLEY_UNREACHABLE();
1961
-#endif
1962
-
1963
-   return simde__m128d_from_private(r_);
1964
-#endif
1965
-}
1966
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1967
-#define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
1968
-#endif
1969
-
1970
-SIMDE_FUNCTION_ATTRIBUTES
1971
-simde_float64 simde_mm_cvtsd_f64(simde__m128d a)
1972
-{
1973
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1974
-   return _mm_cvtsd_f64(a);
1975
-#else
1976
-   simde__m128d_private a_ = simde__m128d_to_private(a);
1977
-   return a_.f64[0];
1978
-#endif
1979
-}
1980
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1981
-#define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
1982
-#endif
1983
-
1984
-SIMDE_FUNCTION_ATTRIBUTES
1985
-simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
1986
-{
1987
-#if defined(SIMDE_X86_SSE2_NATIVE)
1988
-   return _mm_cmpord_sd(a, b);
1989
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
1990
-   return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
1991
-#else
1992
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1993
-                b_ = simde__m128d_to_private(b);
1994
-
1995
-#if defined(simde_math_isnan)
1996
-   r_.u64[0] =
1997
-       (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0]))
1998
-           ? ~UINT64_C(0)
1999
-           : UINT64_C(0);
2000
-   r_.u64[1] = a_.u64[1];
2001
-#else
2002
-   HEDLEY_UNREACHABLE();
2003
-#endif
2004
-
2005
-   return simde__m128d_from_private(r_);
2006
-#endif
2007
-}
2008
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2009
-#define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2010
-#endif
2011
-
2012
-SIMDE_FUNCTION_ATTRIBUTES
2013
-simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
2014
-{
2015
-#if defined(SIMDE_X86_SSE2_NATIVE)
2016
-   return _mm_cmpunord_pd(a, b);
2017
-#else
2018
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2019
-                b_ = simde__m128d_to_private(b);
2020
-
2021
-#if defined(simde_math_isnan)
2022
-   SIMDE_VECTORIZE
2023
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2024
-       r_.u64[i] = (simde_math_isnan(a_.f64[i]) ||
2025
-                simde_math_isnan(b_.f64[i]))
2026
-                   ? ~UINT64_C(0)
2027
-                   : UINT64_C(0);
2028
-   }
2029
-#else
2030
-   HEDLEY_UNREACHABLE();
2031
-#endif
2032
-
2033
-   return simde__m128d_from_private(r_);
2034
-#endif
2035
-}
2036
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2037
-#define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2038
-#endif
2039
-
2040
-SIMDE_FUNCTION_ATTRIBUTES
2041
-simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
2042
-{
2043
-#if defined(SIMDE_X86_SSE2_NATIVE)
2044
-   return _mm_cmpunord_sd(a, b);
2045
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2046
-   return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2047
-#else
2048
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2049
-                b_ = simde__m128d_to_private(b);
2050
-
2051
-#if defined(simde_math_isnan)
2052
-   r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0]))
2053
-               ? ~UINT64_C(0)
2054
-               : UINT64_C(0);
2055
-   r_.u64[1] = a_.u64[1];
2056
-
2057
-#else
2058
-   HEDLEY_UNREACHABLE();
2059
-#endif
2060
-
2061
-   return simde__m128d_from_private(r_);
2062
-#endif
2063
-}
2064
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2065
-#define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2066
-#endif
2067
-
2068
-SIMDE_FUNCTION_ATTRIBUTES
2069
-simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
2070
-{
2071
-#if defined(SIMDE_X86_SSE2_NATIVE)
2072
-   return _mm_cvtepi32_pd(a);
2073
-#else
2074
-   simde__m128d_private r_;
2075
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2076
-
2077
-#if defined(SIMDE_CONVERT_VECTOR_)
2078
-   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2079
-#else
2080
-   SIMDE_VECTORIZE
2081
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2082
-       r_.f64[i] = (simde_float64)a_.i32[i];
2083
-   }
2084
-#endif
2085
-
2086
-   return simde__m128d_from_private(r_);
2087
-#endif
2088
-}
2089
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2090
-#define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2091
-#endif
2092
-
2093
-SIMDE_FUNCTION_ATTRIBUTES
2094
-simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
2095
-{
2096
-#if defined(SIMDE_X86_SSE2_NATIVE)
2097
-   return _mm_cvtepi32_ps(a);
2098
-#else
2099
-   simde__m128_private r_;
2100
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2101
-
2102
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2103
-   r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2104
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2105
-   r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2106
-#elif defined(SIMDE_CONVERT_VECTOR_)
2107
-   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2108
-#else
2109
-   SIMDE_VECTORIZE
2110
-   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2111
-       r_.f32[i] = (simde_float32)a_.i32[i];
2112
-   }
2113
-#endif
2114
-
2115
-   return simde__m128_from_private(r_);
2116
-#endif
2117
-}
2118
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2119
-#define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2120
-#endif
2121
-
2122
-SIMDE_FUNCTION_ATTRIBUTES
2123
-simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
2124
-{
2125
-#if defined(SIMDE_X86_SSE2_NATIVE)
2126
-   return _mm_cvtpd_epi32(a);
2127
-#else
2128
-   simde__m128i_private r_;
2129
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2130
-
2131
-#if defined(SIMDE_CONVERT_VECTOR_)
2132
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].i32, a_.f64);
2133
-   r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2134
-#else
2135
-   SIMDE_VECTORIZE
2136
-   for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) {
2137
-       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f64[i]);
2138
-   }
2139
-   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2140
-#endif
2141
-
2142
-   return simde__m128i_from_private(r_);
2143
-#endif
2144
-}
2145
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2146
-#define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2147
-#endif
2148
-
2149
-SIMDE_FUNCTION_ATTRIBUTES
2150
-simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
2151
-{
2152
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2153
-   return _mm_cvtpd_pi32(a);
2154
-#else
2155
-   simde__m64_private r_;
2156
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2157
-
2158
-#if defined(SIMDE_CONVERT_VECTOR_)
2159
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
2160
-#else
2161
-   SIMDE_VECTORIZE
2162
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2163
-       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f64[i]);
2164
-   }
2165
-#endif
2166
-
2167
-   return simde__m64_from_private(r_);
2168
-#endif
2169
-}
2170
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2171
-#define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2172
-#endif
2173
-
2174
-SIMDE_FUNCTION_ATTRIBUTES
2175
-simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
2176
-{
2177
-#if defined(SIMDE_X86_SSE2_NATIVE)
2178
-   return _mm_cvtpd_ps(a);
2179
-#else
2180
-   simde__m128_private r_;
2181
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2182
-
2183
-#if defined(SIMDE_CONVERT_VECTOR_)
2184
-   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2185
-   r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2186
-#else
2187
-   SIMDE_VECTORIZE
2188
-   for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) {
2189
-       r_.f32[i] = (simde_float32)a_.f64[i];
2190
-   }
2191
-   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2192
-#endif
2193
-
2194
-   return simde__m128_from_private(r_);
2195
-#endif
2196
-}
2197
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2198
-#define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2199
-#endif
2200
-
2201
-SIMDE_FUNCTION_ATTRIBUTES
2202
-simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
2203
-{
2204
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2205
-   return _mm_cvtpi32_pd(a);
2206
-#else
2207
-   simde__m128d_private r_;
2208
-   simde__m64_private a_ = simde__m64_to_private(a);
2209
-
2210
-#if defined(SIMDE_CONVERT_VECTOR_)
2211
-   SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2212
-#else
2213
-   SIMDE_VECTORIZE
2214
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2215
-       r_.f64[i] = (simde_float64)a_.i32[i];
2216
-   }
2217
-#endif
2218
-
2219
-   return simde__m128d_from_private(r_);
2220
-#endif
2221
-}
2222
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2223
-#define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2224
-#endif
2225
-
2226
-SIMDE_FUNCTION_ATTRIBUTES
2227
-simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
2228
-{
2229
-#if defined(SIMDE_X86_SSE2_NATIVE)
2230
-   return _mm_cvtps_epi32(a);
2231
-#else
2232
-   simde__m128i_private r_;
2233
-   simde__m128_private a_ = simde__m128_to_private(a);
2234
-
2235
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2236
-/* The default rounding mode on SSE is 'round to even', which ArmV7
2237
-     does not support!  It is supported on ARMv8 however. */
2238
-#if defined(SIMDE_ARCH_AARCH64)
2239
-   r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2240
-#else
2241
-   uint32x4_t signmask = vdupq_n_u32(0x80000000);
2242
-   float32x4_t half = vbslq_f32(signmask, a_.neon_f32,
2243
-                    vdupq_n_f32(0.5f)); /* +/- 0.5 */
2244
-   int32x4_t r_normal = vcvtq_s32_f32(
2245
-       vaddq_f32(a_.neon_f32, half)); /* round to integer: [a + 0.5]*/
2246
-   int32x4_t r_trunc =
2247
-       vcvtq_s32_f32(a_.neon_f32); /* truncate to integer: [a] */
2248
-   int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
2249
-   int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
2250
-                    vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
2251
-   float32x4_t delta = vsubq_f32(
2252
-       a_.neon_f32,
2253
-       vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
2254
-   uint32x4_t is_delta_half =
2255
-       vceqq_f32(delta, half); /* delta == +/- 0.5 */
2256
-   r_.neon_i32 = vbslq_s32(is_delta_half, r_even, r_normal);
2257
-#endif
2258
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2259
-   r_.altivec_i32 = vec_cts(a_.altivec_f32, 0);
2260
-#elif defined(SIMDE_CONVERT_VECTOR_)
2261
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
2262
-#else
2263
-   SIMDE_VECTORIZE
2264
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2265
-       r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.f32[i]);
2266
-   }
2267
-#endif
2268
-
2269
-   return simde__m128i_from_private(r_);
2270
-#endif
2271
-}
2272
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2273
-#define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2274
-#endif
2275
-
2276
-SIMDE_FUNCTION_ATTRIBUTES
2277
-simde__m128d simde_mm_cvtps_pd(simde__m128 a)
2278
-{
2279
-#if defined(SIMDE_X86_SSE2_NATIVE)
2280
-   return _mm_cvtps_pd(a);
2281
-#else
2282
-   simde__m128d_private r_;
2283
-   simde__m128_private a_ = simde__m128_to_private(a);
2284
-
2285
-#if defined(SIMDE_CONVERT_VECTOR_)
2286
-   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2287
-#else
2288
-   SIMDE_VECTORIZE
2289
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2290
-       r_.f64[i] = a_.f32[i];
2291
-   }
2292
-#endif
2293
-
2294
-   return simde__m128d_from_private(r_);
2295
-#endif
2296
-}
2297
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2298
-#define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2299
-#endif
2300
-
2301
-SIMDE_FUNCTION_ATTRIBUTES
2302
-int32_t simde_mm_cvtsd_si32(simde__m128d a)
2303
-{
2304
-#if defined(SIMDE_X86_SSE2_NATIVE)
2305
-   return _mm_cvtsd_si32(a);
2306
-#else
2307
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2308
-   return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
2309
-#endif
2310
-}
2311
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2312
-#define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2313
-#endif
2314
-
2315
-SIMDE_FUNCTION_ATTRIBUTES
2316
-int64_t simde_mm_cvtsd_si64(simde__m128d a)
2317
-{
2318
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2319
-#if defined(__PGI)
2320
-   return _mm_cvtsd_si64x(a);
2321
-#else
2322
-   return _mm_cvtsd_si64(a);
2323
-#endif
2324
-#else
2325
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2326
-   return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
2327
-#endif
2328
-}
2329
-#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2330
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2331
-#define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2332
-#define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2333
-#endif
2334
-
2335
-SIMDE_FUNCTION_ATTRIBUTES
2336
-simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
2337
-{
2338
-#if defined(SIMDE_X86_SSE2_NATIVE)
2339
-   return _mm_cvtsd_ss(a, b);
2340
-#else
2341
-   simde__m128_private r_, a_ = simde__m128_to_private(a);
2342
-   simde__m128d_private b_ = simde__m128d_to_private(b);
2343
-
2344
-   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2345
-
2346
-   SIMDE_VECTORIZE
2347
-   for (size_t i = 1; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) {
2348
-       r_.i32[i] = a_.i32[i];
2349
-   }
2350
-
2351
-   return simde__m128_from_private(r_);
2352
-#endif
2353
-}
2354
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2355
-#define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2356
-#endif
2357
-
2358
-SIMDE_FUNCTION_ATTRIBUTES
2359
-int32_t simde_mm_cvtsi128_si32(simde__m128i a)
2360
-{
2361
-#if defined(SIMDE_X86_SSE2_NATIVE)
2362
-   return _mm_cvtsi128_si32(a);
2363
-#else
2364
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2365
-
2366
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2367
-   return vgetq_lane_s32(a_.neon_i32, 0);
2368
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2369
-#if defined(SIMDE_BUG_GCC_95227)
2370
-   (void)a_;
2371
-#endif
2372
-   return vec_extract(a_.altivec_i32, 0);
2373
-#else
2374
-   return a_.i32[0];
2375
-#endif
2376
-#endif
2377
-}
2378
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2379
-#define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2380
-#endif
2381
-
2382
-SIMDE_FUNCTION_ATTRIBUTES
2383
-int64_t simde_mm_cvtsi128_si64(simde__m128i a)
2384
-{
2385
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2386
-#if defined(__PGI)
2387
-   return _mm_cvtsi128_si64x(a);
2388
-#else
2389
-   return _mm_cvtsi128_si64(a);
2390
-#endif
2391
-#else
2392
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2393
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2394
-   return vec_extract(a_.i64, 0);
2395
-#endif
2396
-   return a_.i64[0];
2397
-#endif
2398
-}
2399
-#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2400
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2401
-#define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2402
-#define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2403
-#endif
2404
-
2405
-SIMDE_FUNCTION_ATTRIBUTES
2406
-simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
2407
-{
2408
-
2409
-#if defined(SIMDE_X86_SSE2_NATIVE)
2410
-   return _mm_cvtsi32_sd(a, b);
2411
-#else
2412
-   simde__m128d_private r_;
2413
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2414
-
2415
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64)
2416
-   r_.neon_f64 = vsetq_lane_f64((simde_float64)b, a_.neon_f64, 0);
2417
-#else
2418
-   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2419
-   r_.i64[1] = a_.i64[1];
2420
-#endif
2421
-
2422
-   return simde__m128d_from_private(r_);
2423
-#endif
2424
-}
2425
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2426
-#define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2427
-#endif
2428
-
2429
-SIMDE_FUNCTION_ATTRIBUTES
2430
-simde__m128i simde_mm_cvtsi32_si128(int32_t a)
2431
-{
2432
-#if defined(SIMDE_X86_SSE2_NATIVE)
2433
-   return _mm_cvtsi32_si128(a);
2434
-#else
2435
-   simde__m128i_private r_;
2436
-
2437
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2438
-   r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2439
-#else
2440
-   r_.i32[0] = a;
2441
-   r_.i32[1] = 0;
2442
-   r_.i32[2] = 0;
2443
-   r_.i32[3] = 0;
2444
-#endif
2445
-
2446
-   return simde__m128i_from_private(r_);
2447
-#endif
2448
-}
2449
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2450
-#define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2451
-#endif
2452
-
2453
-SIMDE_FUNCTION_ATTRIBUTES
2454
-simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
2455
-{
2456
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2457
-#if !defined(__PGI)
2458
-   return _mm_cvtsi64_sd(a, b);
2459
-#else
2460
-   return _mm_cvtsi64x_sd(a, b);
2461
-#endif
2462
-#else
2463
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
2464
-
2465
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2466
-   r_.neon_f64 = vsetq_lane_f64((simde_float64)b, a_.neon_f64, 0);
2467
-#else
2468
-   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2469
-   r_.f64[1] = a_.f64[1];
2470
-#endif
2471
-
2472
-   return simde__m128d_from_private(r_);
2473
-#endif
2474
-}
2475
-#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2476
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2477
-#define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2478
-#define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2479
-#endif
2480
-
2481
-SIMDE_FUNCTION_ATTRIBUTES
2482
-simde__m128i simde_mm_cvtsi64_si128(int64_t a)
2483
-{
2484
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2485
-#if !defined(__PGI)
2486
-   return _mm_cvtsi64_si128(a);
2487
-#else
2488
-   return _mm_cvtsi64x_si128(a);
2489
-#endif
2490
-#else
2491
-   simde__m128i_private r_;
2492
-
2493
-   r_.i64[0] = a;
2494
-   r_.i64[1] = 0;
2495
-
2496
-   return simde__m128i_from_private(r_);
2497
-#endif
2498
-}
2499
-#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
2500
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2501
-#define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
2502
-#define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
2503
-#endif
2504
-
2505
-SIMDE_FUNCTION_ATTRIBUTES
2506
-simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
2507
-{
2508
-#if defined(SIMDE_X86_SSE2_NATIVE)
2509
-   return _mm_cvtss_sd(a, b);
2510
-#else
2511
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2512
-   simde__m128_private b_ = simde__m128_to_private(b);
2513
-
2514
-   a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
2515
-
2516
-   return simde__m128d_from_private(a_);
2517
-#endif
2518
-}
2519
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2520
-#define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
2521
-#endif
2522
-
2523
-SIMDE_FUNCTION_ATTRIBUTES
2524
-simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
2525
-{
2526
-#if defined(SIMDE_X86_SSE2_NATIVE)
2527
-   return _mm_cvttpd_epi32(a);
2528
-#else
2529
-   simde__m128i_private r_;
2530
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2531
-
2532
-   for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) {
2533
-       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2534
-   }
2535
-   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2536
-
2537
-   return simde__m128i_from_private(r_);
2538
-#endif
2539
-}
2540
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2541
-#define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
2542
-#endif
2543
-
2544
-SIMDE_FUNCTION_ATTRIBUTES
2545
-simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
2546
-{
2547
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2548
-   return _mm_cvttpd_pi32(a);
2549
-#else
2550
-   simde__m64_private r_;
2551
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2552
-
2553
-#if defined(SIMDE_CONVERT_VECTOR_)
2554
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
2555
-#else
2556
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2557
-       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f64[i]);
2558
-   }
2559
-#endif
2560
-
2561
-   return simde__m64_from_private(r_);
2562
-#endif
2563
-}
2564
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2565
-#define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
2566
-#endif
2567
-
2568
-SIMDE_FUNCTION_ATTRIBUTES
2569
-simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
2570
-{
2571
-#if defined(SIMDE_X86_SSE2_NATIVE)
2572
-   return _mm_cvttps_epi32(a);
2573
-#else
2574
-   simde__m128i_private r_;
2575
-   simde__m128_private a_ = simde__m128_to_private(a);
2576
-
2577
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2578
-   r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
2579
-#elif defined(SIMDE_CONVERT_VECTOR_)
2580
-   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
2581
-#else
2582
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2583
-       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, a_.f32[i]);
2584
-   }
2585
-#endif
2586
-
2587
-   return simde__m128i_from_private(r_);
2588
-#endif
2589
-}
2590
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2591
-#define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
2592
-#endif
2593
-
2594
-SIMDE_FUNCTION_ATTRIBUTES
2595
-int32_t simde_mm_cvttsd_si32(simde__m128d a)
2596
-{
2597
-#if defined(SIMDE_X86_SSE2_NATIVE)
2598
-   return _mm_cvttsd_si32(a);
2599
-#else
2600
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2601
-   return SIMDE_CONVERT_FTOI(int32_t, a_.f64[0]);
2602
-#endif
2603
-}
2604
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2605
-#define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
2606
-#endif
2607
-
2608
-SIMDE_FUNCTION_ATTRIBUTES
2609
-int64_t simde_mm_cvttsd_si64(simde__m128d a)
2610
-{
2611
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2612
-#if !defined(__PGI)
2613
-   return _mm_cvttsd_si64(a);
2614
-#else
2615
-   return _mm_cvttsd_si64x(a);
2616
-#endif
2617
-#else
2618
-   simde__m128d_private a_ = simde__m128d_to_private(a);
2619
-   return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
2620
-#endif
2621
-}
2622
-#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
2623
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2624
-#define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
2625
-#define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
2626
-#endif
2627
-
2628
-SIMDE_FUNCTION_ATTRIBUTES
2629
-simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
2630
-{
2631
-#if defined(SIMDE_X86_SSE2_NATIVE)
2632
-   return _mm_div_pd(a, b);
2633
-#else
2634
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2635
-                b_ = simde__m128d_to_private(b);
2636
-
2637
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2638
-   r_.f64 = a_.f64 / b_.f64;
2639
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2640
-   r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
2641
-#else
2642
-   SIMDE_VECTORIZE
2643
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2644
-       r_.f64[i] = a_.f64[i] / b_.f64[i];
2645
-   }
2646
-#endif
2647
-
2648
-   return simde__m128d_from_private(r_);
2649
-#endif
2650
-}
2651
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2652
-#define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
2653
-#endif
2654
-
2655
-SIMDE_FUNCTION_ATTRIBUTES
2656
-simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
2657
-{
2658
-#if defined(SIMDE_X86_SSE2_NATIVE)
2659
-   return _mm_div_sd(a, b);
2660
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
2661
-   return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
2662
-#else
2663
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2664
-                b_ = simde__m128d_to_private(b);
2665
-
2666
-   r_.f64[0] = a_.f64[0] / b_.f64[0];
2667
-   r_.f64[1] = a_.f64[1];
2668
-
2669
-   return simde__m128d_from_private(r_);
2670
-#endif
2671
-}
2672
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2673
-#define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
2674
-#endif
2675
-
2676
-SIMDE_FUNCTION_ATTRIBUTES
2677
-int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
2678
-   SIMDE_REQUIRE_RANGE(imm8, 0, 7)
2679
-{
2680
-   uint16_t r;
2681
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2682
-
2683
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2684
-#if defined(SIMDE_BUG_GCC_95227)
2685
-   (void)a_;
2686
-   (void)imm8;
2687
-#endif
2688
-   r = vec_extract(a_.altivec_i16, imm8);
2689
-#else
2690
-   r = a_.u16[imm8 & 7];
2691
-#endif
2692
-
2693
-   return HEDLEY_STATIC_CAST(int32_t, r);
2694
-}
2695
-#if defined(SIMDE_X86_SSE2_NATIVE) && \
2696
-   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
2697
-#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
2698
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2699
-#define simde_mm_extract_epi16(a, imm8)                                        \
2700
-   HEDLEY_STATIC_CAST(int32_t,                                            \
2701
-              vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, \
2702
-                     (imm8)) &                            \
2703
-                  (UINT32_C(0x0000ffff)))
2704
-#endif
2705
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2706
-#define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
2707
-#endif
2708
-
2709
-SIMDE_FUNCTION_ATTRIBUTES
2710
-simde__m128i simde_mm_insert_epi16(simde__m128i a, int16_t i, const int imm8)
2711
-   SIMDE_REQUIRE_RANGE(imm8, 0, 7)
2712
-{
2713
-   simde__m128i_private a_ = simde__m128i_to_private(a);
2714
-   a_.i16[imm8 & 7] = i;
2715
-   return simde__m128i_from_private(a_);
2716
-}
2717
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2718
-#define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
2719
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2720
-#define simde_mm_insert_epi16(a, i, imm8) \
2721
-   simde__m128i_from_neon_i16(       \
2722
-       vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
2723
-#endif
2724
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2725
-#define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
2726
-#endif
2727
-
2728
-SIMDE_FUNCTION_ATTRIBUTES
2729
-simde__m128d
2730
-simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
2731
-{
2732
-   simde_assert_aligned(16, mem_addr);
2733
-
2734
-#if defined(SIMDE_X86_SSE2_NATIVE)
2735
-   return _mm_load_pd(mem_addr);
2736
-#else
2737
-   simde__m128d_private r_;
2738
-
2739
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2740
-   r_.neon_u32 =
2741
-       vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const *, mem_addr));
2742
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2743
-   r_.altivec_f64 = vec_ld(
2744
-       0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double)
2745
-                          const *,
2746
-                      mem_addr));
2747
-#else
2748
-   r_ = *SIMDE_ALIGN_CAST(simde__m128d_private const *, mem_addr);
2749
-#endif
2750
-
2751
-   return simde__m128d_from_private(r_);
2752
-#endif
2753
-}
2754
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2755
-#define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
2756
-#endif
2757
-
2758
-SIMDE_FUNCTION_ATTRIBUTES
2759
-simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
2760
-{
2761
-#if defined(SIMDE_X86_SSE2_NATIVE)
2762
-   return _mm_load1_pd(mem_addr);
2763
-#else
2764
-   simde__m128d_private r_;
2765
-
2766
-   r_.f64[0] = *mem_addr;
2767
-   r_.f64[1] = *mem_addr;
2768
-
2769
-   return simde__m128d_from_private(r_);
2770
-#endif
2771
-}
2772
-#define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
2773
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2774
-#define _mm_load_pd1(mem_addr) simde_mm_load_pd1(mem_addr)
2775
-#define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
2776
-#endif
2777
-
2778
-SIMDE_FUNCTION_ATTRIBUTES
2779
-simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
2780
-{
2781
-#if defined(SIMDE_X86_SSE2_NATIVE)
2782
-   return _mm_load_sd(mem_addr);
2783
-#else
2784
-   simde__m128d_private r_;
2785
-
2786
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2787
-   r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
2788
-#else
2789
-   r_.f64[0] = *mem_addr;
2790
-   r_.u64[1] = UINT64_C(0);
2791
-#endif
2792
-
2793
-   return simde__m128d_from_private(r_);
2794
-#endif
2795
-}
2796
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2797
-#define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
2798
-#endif
2799
-
2800
-SIMDE_FUNCTION_ATTRIBUTES
2801
-simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
2802
-{
2803
-   simde_assert_aligned(16, mem_addr);
2804
-
2805
-#if defined(SIMDE_X86_SSE2_NATIVE)
2806
-   return _mm_load_si128(
2807
-       HEDLEY_REINTERPRET_CAST(__m128i const *, mem_addr));
2808
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2809
-   simde__m128i_private r_;
2810
-
2811
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
2812
-   r_.altivec_i32 = vec_ld(
2813
-       0, HEDLEY_REINTERPRET_CAST(
2814
-              SIMDE_POWER_ALTIVEC_VECTOR(int) const *, mem_addr));
2815
-#else
2816
-   r_.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
2817
-#endif
2818
-
2819
-   return simde__m128i_from_private(r_);
2820
-#else
2821
-   return *mem_addr;
2822
-#endif
2823
-}
2824
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2825
-#define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
2826
-#endif
2827
-
2828
-SIMDE_FUNCTION_ATTRIBUTES
2829
-simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
2830
-{
2831
-#if defined(SIMDE_X86_SSE2_NATIVE)
2832
-   return _mm_loadh_pd(a, mem_addr);
2833
-#else
2834
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
2835
-   simde_float64 t;
2836
-
2837
-   simde_memcpy(&t, mem_addr, sizeof(t));
2838
-   r_.f64[0] = a_.f64[0];
2839
-   r_.f64[1] = t;
2840
-
2841
-   return simde__m128d_from_private(r_);
2842
-#endif
2843
-}
2844
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2845
-#define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
2846
-#endif
2847
-
2848
-SIMDE_FUNCTION_ATTRIBUTES
2849
-simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
2850
-{
2851
-   simde_assert_aligned(16, mem_addr);
2852
-
2853
-#if defined(SIMDE_X86_SSE2_NATIVE)
2854
-   return _mm_loadl_epi64(
2855
-       HEDLEY_REINTERPRET_CAST(__m128i const *, mem_addr));
2856
-#else
2857
-   simde__m128i_private r_;
2858
-
2859
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2860
-   r_.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
2861
-                  vcreate_s32(0));
2862
-#else
2863
-   r_.i64[0] = *HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr);
2864
-   r_.i64[1] = 0;
2865
-#endif
2866
-
2867
-   return simde__m128i_from_private(r_);
2868
-#endif
2869
-}
2870
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2871
-#define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
2872
-#endif
2873
-
2874
-SIMDE_FUNCTION_ATTRIBUTES
2875
-simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
2876
-{
2877
-#if defined(SIMDE_X86_SSE2_NATIVE)
2878
-   return _mm_loadl_pd(a, mem_addr);
2879
-#else
2880
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
2881
-
2882
-   r_.f64[0] = *mem_addr;
2883
-   r_.u64[1] = a_.u64[1];
2884
-
2885
-   return simde__m128d_from_private(r_);
2886
-#endif
2887
-}
2888
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2889
-#define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
2890
-#endif
2891
-
2892
-SIMDE_FUNCTION_ATTRIBUTES
2893
-simde__m128d
2894
-simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
2895
-{
2896
-   simde_assert_aligned(16, mem_addr);
2897
-
2898
-#if defined(SIMDE_X86_SSE2_NATIVE)
2899
-   return _mm_loadr_pd(mem_addr);
2900
-#else
2901
-   simde__m128d_private r_;
2902
-
2903
-   r_.f64[0] = mem_addr[1];
2904
-   r_.f64[1] = mem_addr[0];
2905
-
2906
-   return simde__m128d_from_private(r_);
2907
-#endif
2908
-}
2909
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2910
-#define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
2911
-#endif
2912
-
2913
-SIMDE_FUNCTION_ATTRIBUTES
2914
-simde__m128d
2915
-simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
2916
-{
2917
-#if defined(SIMDE_X86_SSE2_NATIVE)
2918
-   return _mm_loadu_pd(mem_addr);
2919
-#else
2920
-   simde__m128d_private r_;
2921
-
2922
-   simde_memcpy(&r_, mem_addr, sizeof(r_));
2923
-
2924
-   return simde__m128d_from_private(r_);
2925
-#endif
2926
-}
2927
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2928
-#define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
2929
-#endif
2930
-
2931
-SIMDE_FUNCTION_ATTRIBUTES
2932
-simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
2933
-{
2934
-#if defined(SIMDE_X86_SSE2_NATIVE)
2935
-   return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const *, mem_addr));
2936
-#else
2937
-   simde__m128i_private r_;
2938
-
2939
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2940
-   r_.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
2941
-#else
2942
-   simde_memcpy(&r_, mem_addr, sizeof(r_));
2943
-#endif
2944
-
2945
-   return simde__m128i_from_private(r_);
2946
-#endif
2947
-}
2948
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2949
-#define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
2950
-#endif
2951
-
2952
-SIMDE_FUNCTION_ATTRIBUTES
2953
-simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
2954
-{
2955
-#if defined(SIMDE_X86_SSE2_NATIVE)
2956
-   return _mm_madd_epi16(a, b);
2957
-#else
2958
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
2959
-                b_ = simde__m128i_to_private(b);
2960
-
2961
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2962
-   int32x4_t pl =
2963
-       vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
2964
-   int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16),
2965
-                vget_high_s16(b_.neon_i16));
2966
-   int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
2967
-   int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
2968
-   r_.neon_i32 = vcombine_s32(rl, rh);
2969
-#else
2970
-   SIMDE_VECTORIZE
2971
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i += 2) {
2972
-       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
2973
-               (a_.i16[i + 1] * b_.i16[i + 1]);
2974
-   }
2975
-#endif
2976
-
2977
-   return simde__m128i_from_private(r_);
2978
-#endif
2979
-}
2980
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2981
-#define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
2982
-#endif
2983
-
2984
-SIMDE_FUNCTION_ATTRIBUTES
2985
-void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
2986
-                 int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
2987
-{
2988
-#if defined(SIMDE_X86_SSE2_NATIVE)
2989
-   _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
2990
-#else
2991
-   simde__m128i_private a_ = simde__m128i_to_private(a),
2992
-                mask_ = simde__m128i_to_private(mask);
2993
-
2994
-   for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++) {
2995
-       if (mask_.u8[i] & 0x80) {
2996
-           mem_addr[i] = a_.i8[i];
2997
-       }
2998
-   }
2999
-#endif
3000
-}
3001
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3002
-#define _mm_maskmoveu_si128(a, mask, mem_addr) \
3003
-   simde_mm_maskmoveu_si128(              \
3004
-       (a), (mask),                   \
3005
-       SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
3006
-#endif
3007
-
3008
-SIMDE_FUNCTION_ATTRIBUTES
3009
-int32_t simde_mm_movemask_epi8(simde__m128i a)
3010
-{
3011
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3012
-   /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3013
-   return _mm_movemask_epi8(a);
3014
-#else
3015
-   int32_t r = 0;
3016
-   simde__m128i_private a_ = simde__m128i_to_private(a);
3017
-
3018
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3019
-   uint8x16_t input = a_.neon_u8;
3020
-   SIMDE_ALIGN_AS(16, int8x8_t)
3021
-   static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
3022
-   uint8x8_t mask_and = vdup_n_u8(0x80);
3023
-   int8x8_t mask_shift = vld1_s8(xr);
3024
-
3025
-   uint8x8_t lo = vget_low_u8(input);
3026
-   uint8x8_t hi = vget_high_u8(input);
3027
-
3028
-   lo = vand_u8(lo, mask_and);
3029
-   lo = vshl_u8(lo, mask_shift);
3030
-
3031
-   hi = vand_u8(hi, mask_and);
3032
-   hi = vshl_u8(hi, mask_shift);
3033
-
3034
-   lo = vpadd_u8(lo, lo);
3035
-   lo = vpadd_u8(lo, lo);
3036
-   lo = vpadd_u8(lo, lo);
3037
-
3038
-   hi = vpadd_u8(hi, hi);
3039
-   hi = vpadd_u8(hi, hi);
3040
-   hi = vpadd_u8(hi, hi);
3041
-
3042
-   r = ((hi[0] << 8) | (lo[0] & 0xFF));
3043
-#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION)
3044
-   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
3045
-       perm = {120, 112, 104, 96, 88, 80, 72, 64,
3046
-           56,  48,  40,  32, 24, 16, 8,  0};
3047
-   r = HEDLEY_STATIC_CAST(
3048
-       int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3049
-#else
3050
-   SIMDE_VECTORIZE_REDUCTION(| : r)
3051
-   for (size_t i = 0; i < (sizeof(a_.u8) / sizeof(a_.u8[0])); i++) {
3052
-       r |= (a_.u8[15 - i] >> 7) << (15 - i);
3053
-   }
3054
-#endif
3055
-
3056
-   return r;
3057
-#endif
3058
-}
3059
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3060
-#define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3061
-#endif
3062
-
3063
-SIMDE_FUNCTION_ATTRIBUTES
3064
-int32_t simde_mm_movemask_pd(simde__m128d a)
3065
-{
3066
-#if defined(SIMDE_X86_SSE2_NATIVE)
3067
-   return _mm_movemask_pd(a);
3068
-#else
3069
-   int32_t r = 0;
3070
-   simde__m128d_private a_ = simde__m128d_to_private(a);
3071
-
3072
-   SIMDE_VECTORIZE
3073
-   for (size_t i = 0; i < (sizeof(a_.u64) / sizeof(a_.u64[0])); i++) {
3074
-       r |= (a_.u64[i] >> 63) << i;
3075
-   }
3076
-
3077
-   return r;
3078
-#endif
3079
-}
3080
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3081
-#define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3082
-#endif
3083
-
3084
-SIMDE_FUNCTION_ATTRIBUTES
3085
-simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
3086
-{
3087
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3088
-   return _mm_movepi64_pi64(a);
3089
-#else
3090
-   simde__m64_private r_;
3091
-   simde__m128i_private a_ = simde__m128i_to_private(a);
3092
-
3093
-   r_.i64[0] = a_.i64[0];
3094
-
3095
-   return simde__m64_from_private(r_);
3096
-#endif
3097
-}
3098
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3099
-#define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3100
-#endif
3101
-
3102
-SIMDE_FUNCTION_ATTRIBUTES
3103
-simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
3104
-{
3105
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3106
-   return _mm_movpi64_epi64(a);
3107
-#else
3108
-   simde__m128i_private r_;
3109
-   simde__m64_private a_ = simde__m64_to_private(a);
3110
-
3111
-   r_.i64[0] = a_.i64[0];
3112
-   r_.i64[1] = 0;
3113
-
3114
-   return simde__m128i_from_private(r_);
3115
-#endif
3116
-}
3117
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3118
-#define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3119
-#endif
3120
-
3121
-SIMDE_FUNCTION_ATTRIBUTES
3122
-simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
3123
-{
3124
-#if defined(SIMDE_X86_SSE2_NATIVE)
3125
-   return _mm_min_epi16(a, b);
3126
-#else
3127
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3128
-                b_ = simde__m128i_to_private(b);
3129
-
3130
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3131
-   r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3132
-#else
3133
-   SIMDE_VECTORIZE
3134
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3135
-       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3136
-   }
3137
-#endif
3138
-
3139
-   return simde__m128i_from_private(r_);
3140
-#endif
3141
-}
3142
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3143
-#define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3144
-#endif
3145
-
3146
-SIMDE_FUNCTION_ATTRIBUTES
3147
-simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
3148
-{
3149
-#if defined(SIMDE_X86_SSE2_NATIVE)
3150
-   return _mm_min_epu8(a, b);
3151
-#else
3152
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3153
-                b_ = simde__m128i_to_private(b);
3154
-
3155
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3156
-   r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3157
-#else
3158
-   SIMDE_VECTORIZE
3159
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
3160
-       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3161
-   }
3162
-#endif
3163
-
3164
-   return simde__m128i_from_private(r_);
3165
-#endif
3166
-}
3167
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3168
-#define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3169
-#endif
3170
-
3171
-SIMDE_FUNCTION_ATTRIBUTES
3172
-simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
3173
-{
3174
-#if defined(SIMDE_X86_SSE2_NATIVE)
3175
-   return _mm_min_pd(a, b);
3176
-#else
3177
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3178
-                b_ = simde__m128d_to_private(b);
3179
-
3180
-   SIMDE_VECTORIZE
3181
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
3182
-       r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3183
-   }
3184
-
3185
-   return simde__m128d_from_private(r_);
3186
-#endif
3187
-}
3188
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3189
-#define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3190
-#endif
3191
-
3192
-SIMDE_FUNCTION_ATTRIBUTES
3193
-simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
3194
-{
3195
-#if defined(SIMDE_X86_SSE2_NATIVE)
3196
-   return _mm_min_sd(a, b);
3197
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
3198
-   return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3199
-#else
3200
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3201
-                b_ = simde__m128d_to_private(b);
3202
-
3203
-   r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3204
-   r_.f64[1] = a_.f64[1];
3205
-
3206
-   return simde__m128d_from_private(r_);
3207
-#endif
3208
-}
3209
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3210
-#define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3211
-#endif
3212
-
3213
-SIMDE_FUNCTION_ATTRIBUTES
3214
-simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
3215
-{
3216
-#if defined(SIMDE_X86_SSE2_NATIVE)
3217
-   return _mm_max_epi16(a, b);
3218
-#else
3219
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3220
-                b_ = simde__m128i_to_private(b);
3221
-
3222
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3223
-   r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3224
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3225
-   r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3226
-#else
3227
-   SIMDE_VECTORIZE
3228
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3229
-       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3230
-   }
3231
-#endif
3232
-
3233
-   return simde__m128i_from_private(r_);
3234
-#endif
3235
-}
3236
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3237
-#define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
3238
-#endif
3239
-
3240
-SIMDE_FUNCTION_ATTRIBUTES
3241
-simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
3242
-{
3243
-#if defined(SIMDE_X86_SSE2_NATIVE)
3244
-   return _mm_max_epu8(a, b);
3245
-#else
3246
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3247
-                b_ = simde__m128i_to_private(b);
3248
-
3249
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3250
-   r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
3251
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3252
-   r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
3253
-#else
3254
-   SIMDE_VECTORIZE
3255
-   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
3256
-       r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3257
-   }
3258
-#endif
3259
-
3260
-   return simde__m128i_from_private(r_);
3261
-#endif
3262
-}
3263
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3264
-#define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
3265
-#endif
3266
-
3267
-SIMDE_FUNCTION_ATTRIBUTES
3268
-simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
3269
-{
3270
-#if defined(SIMDE_X86_SSE2_NATIVE)
3271
-   return _mm_max_pd(a, b);
3272
-#else
3273
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3274
-                b_ = simde__m128d_to_private(b);
3275
-
3276
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3277
-   r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
3278
-#else
3279
-   SIMDE_VECTORIZE
3280
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
3281
-       r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3282
-   }
3283
-#endif
3284
-
3285
-   return simde__m128d_from_private(r_);
3286
-#endif
3287
-}
3288
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3289
-#define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
3290
-#endif
3291
-
3292
-SIMDE_FUNCTION_ATTRIBUTES
3293
-simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
3294
-{
3295
-#if defined(SIMDE_X86_SSE2_NATIVE)
3296
-   return _mm_max_sd(a, b);
3297
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
3298
-   return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
3299
-#else
3300
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3301
-                b_ = simde__m128d_to_private(b);
3302
-
3303
-   r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3304
-   r_.f64[1] = a_.f64[1];
3305
-
3306
-   return simde__m128d_from_private(r_);
3307
-#endif
3308
-}
3309
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3310
-#define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
3311
-#endif
3312
-
3313
-SIMDE_FUNCTION_ATTRIBUTES
3314
-simde__m128i simde_mm_move_epi64(simde__m128i a)
3315
-{
3316
-#if defined(SIMDE_X86_SSE2_NATIVE)
3317
-   return _mm_move_epi64(a);
3318
-#else
3319
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
3320
-
3321
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3322
-   r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
3323
-#else
3324
-   r_.i64[0] = a_.i64[0];
3325
-   r_.i64[1] = 0;
3326
-#endif
3327
-
3328
-   return simde__m128i_from_private(r_);
3329
-#endif
3330
-}
3331
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3332
-#define _mm_move_epi64(a) simde_mm_move_epi64(a)
3333
-#endif
3334
-
3335
-SIMDE_FUNCTION_ATTRIBUTES
3336
-simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
3337
-{
3338
-#if defined(SIMDE_X86_SSE2_NATIVE)
3339
-   return _mm_mul_epu32(a, b);
3340
-#else
3341
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3342
-                b_ = simde__m128i_to_private(b);
3343
-
3344
-   SIMDE_VECTORIZE
3345
-   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
3346
-       r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) *
3347
-               HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
3348
-   }
3349
-
3350
-   return simde__m128i_from_private(r_);
3351
-#endif
3352
-}
3353
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3354
-#define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
3355
-#endif
3356
-
3357
-SIMDE_FUNCTION_ATTRIBUTES
3358
-simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
3359
-{
3360
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3361
-                b_ = simde__m128i_to_private(b);
3362
-
3363
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3364
-   r_.i64 = a_.i64 * b_.i64;
3365
-#else
3366
-   SIMDE_VECTORIZE
3367
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
3368
-       r_.i64[i] = a_.i64[i] * b_.i64[i];
3369
-   }
3370
-#endif
3371
-
3372
-   return simde__m128i_from_private(r_);
3373
-}
3374
-
3375
-SIMDE_FUNCTION_ATTRIBUTES
3376
-simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
3377
-{
3378
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3379
-                b_ = simde__m128i_to_private(b);
3380
-
3381
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3382
-   r_.i64 = a_.i64 % b_.i64;
3383
-#else
3384
-   SIMDE_VECTORIZE
3385
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
3386
-       r_.i64[i] = a_.i64[i] % b_.i64[i];
3387
-   }
3388
-#endif
3389
-
3390
-   return simde__m128i_from_private(r_);
3391
-}
3392
-
3393
-SIMDE_FUNCTION_ATTRIBUTES
3394
-simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
3395
-{
3396
-#if defined(SIMDE_X86_SSE2_NATIVE)
3397
-   return _mm_mul_pd(a, b);
3398
-#else
3399
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3400
-                b_ = simde__m128d_to_private(b);
3401
-
3402
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3403
-   r_.f64 = a_.f64 * b_.f64;
3404
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3405
-   r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
3406
-#else
3407
-   SIMDE_VECTORIZE
3408
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
3409
-       r_.f64[i] = a_.f64[i] * b_.f64[i];
3410
-   }
3411
-#endif
3412
-
3413
-   return simde__m128d_from_private(r_);
3414
-#endif
3415
-}
3416
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3417
-#define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
3418
-#endif
3419
-
3420
-SIMDE_FUNCTION_ATTRIBUTES
3421
-simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
3422
-{
3423
-#if defined(SIMDE_X86_SSE2_NATIVE)
3424
-   return _mm_mul_sd(a, b);
3425
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
3426
-   return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
3427
-#else
3428
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3429
-                b_ = simde__m128d_to_private(b);
3430
-
3431
-   r_.f64[0] = a_.f64[0] * b_.f64[0];
3432
-   r_.f64[1] = a_.f64[1];
3433
-
3434
-   return simde__m128d_from_private(r_);
3435
-#endif
3436
-}
3437
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3438
-#define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
3439
-#endif
3440
-
3441
-SIMDE_FUNCTION_ATTRIBUTES
3442
-simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
3443
-{
3444
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
3445
-   !defined(__PGI)
3446
-   return _mm_mul_su32(a, b);
3447
-#else
3448
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
3449
-                  b_ = simde__m64_to_private(b);
3450
-
3451
-   r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) *
3452
-           HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
3453
-
3454
-   return simde__m64_from_private(r_);
3455
-#endif
3456
-}
3457
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3458
-#define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
3459
-#endif
3460
-
3461
-SIMDE_FUNCTION_ATTRIBUTES
3462
-simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
3463
-{
3464
-#if defined(SIMDE_X86_SSE2_NATIVE)
3465
-   return _mm_mulhi_epi16(a, b);
3466
-#else
3467
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3468
-                b_ = simde__m128i_to_private(b);
3469
-
3470
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3471
-   int16x4_t a3210 = vget_low_s16(a_.neon_i16);
3472
-   int16x4_t b3210 = vget_low_s16(b_.neon_i16);
3473
-   int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
3474
-   int16x4_t a7654 = vget_high_s16(a_.neon_i16);
3475
-   int16x4_t b7654 = vget_high_s16(b_.neon_i16);
3476
-   int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
3477
-   uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
3478
-                   vreinterpretq_u16_s32(ab7654));
3479
-   r_.neon_u16 = rv.val[1];
3480
-#else
3481
-   SIMDE_VECTORIZE
3482
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3483
-       r_.u16[i] = HEDLEY_STATIC_CAST(
3484
-           uint16_t,
3485
-           (HEDLEY_STATIC_CAST(
3486
-                uint32_t,
3487
-                HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) *
3488
-                    HEDLEY_STATIC_CAST(int32_t,
3489
-                               b_.i16[i])) >>
3490
-            16));
3491
-   }
3492
-#endif
3493
-
3494
-   return simde__m128i_from_private(r_);
3495
-#endif
3496
-}
3497
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3498
-#define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
3499
-#endif
3500
-
3501
-SIMDE_FUNCTION_ATTRIBUTES
3502
-simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
3503
-{
3504
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3505
-   return _mm_mulhi_epu16(a, b);
3506
-#else
3507
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3508
-                b_ = simde__m128i_to_private(b);
3509
-
3510
-   SIMDE_VECTORIZE
3511
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
3512
-       r_.u16[i] = HEDLEY_STATIC_CAST(
3513
-           uint16_t,
3514
-           HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
3515
-                   HEDLEY_STATIC_CAST(uint32_t,
3516
-                              b_.u16[i]) >>
3517
-               16);
3518
-   }
3519
-
3520
-   return simde__m128i_from_private(r_);
3521
-#endif
3522
-}
3523
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3524
-#define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
3525
-#endif
3526
-
3527
-SIMDE_FUNCTION_ATTRIBUTES
3528
-simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
3529
-{
3530
-#if defined(SIMDE_X86_SSE2_NATIVE)
3531
-   return _mm_mullo_epi16(a, b);
3532
-#else
3533
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3534
-                b_ = simde__m128i_to_private(b);
3535
-
3536
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3537
-   r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
3538
-#else
3539
-   SIMDE_VECTORIZE
3540
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3541
-       r_.u16[i] = HEDLEY_STATIC_CAST(
3542
-           uint16_t,
3543
-           HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
3544
-               HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
3545
-   }
3546
-#endif
3547
-
3548
-   return simde__m128i_from_private(r_);
3549
-#endif
3550
-}
3551
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3552
-#define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
3553
-#endif
3554
-
3555
-SIMDE_FUNCTION_ATTRIBUTES
3556
-simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
3557
-{
3558
-#if defined(SIMDE_X86_SSE2_NATIVE)
3559
-   return _mm_or_pd(a, b);
3560
-#else
3561
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3562
-                b_ = simde__m128d_to_private(b);
3563
-
3564
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3565
-   r_.i32f = a_.i32f | b_.i32f;
3566
-#else
3567
-   SIMDE_VECTORIZE
3568
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
3569
-       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3570
-   }
3571
-#endif
3572
-
3573
-   return simde__m128d_from_private(r_);
3574
-#endif
3575
-}
3576
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3577
-#define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
3578
-#endif
3579
-
3580
-SIMDE_FUNCTION_ATTRIBUTES
3581
-simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
3582
-{
3583
-#if defined(SIMDE_X86_SSE2_NATIVE)
3584
-   return _mm_or_si128(a, b);
3585
-#else
3586
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3587
-                b_ = simde__m128i_to_private(b);
3588
-
3589
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3590
-   r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
3591
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3592
-   r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
3593
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3594
-   r_.i32f = a_.i32f | b_.i32f;
3595
-#else
3596
-   SIMDE_VECTORIZE
3597
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
3598
-       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
3599
-   }
3600
-#endif
3601
-
3602
-   return simde__m128i_from_private(r_);
3603
-#endif
3604
-}
3605
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3606
-#define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
3607
-#endif
3608
-
3609
-SIMDE_FUNCTION_ATTRIBUTES
3610
-simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
3611
-{
3612
-#if defined(SIMDE_X86_SSE2_NATIVE)
3613
-   return _mm_packs_epi16(a, b);
3614
-#else
3615
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3616
-                b_ = simde__m128i_to_private(b);
3617
-
3618
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3619
-   r_.neon_i8 =
3620
-       vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
3621
-#else
3622
-   SIMDE_VECTORIZE
3623
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3624
-       r_.i8[i] = (a_.i16[i] > INT8_MAX)
3625
-                  ? INT8_MAX
3626
-                  : ((a_.i16[i] < INT8_MIN)
3627
-                         ? INT8_MIN
3628
-                         : HEDLEY_STATIC_CAST(int8_t,
3629
-                                  a_.i16[i]));
3630
-       r_.i8[i + 8] = (b_.i16[i] > INT8_MAX)
3631
-                      ? INT8_MAX
3632
-                      : ((b_.i16[i] < INT8_MIN)
3633
-                         ? INT8_MIN
3634
-                         : HEDLEY_STATIC_CAST(
3635
-                               int8_t, b_.i16[i]));
3636
-   }
3637
-#endif
3638
-
3639
-   return simde__m128i_from_private(r_);
3640
-#endif
3641
-}
3642
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3643
-#define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
3644
-#endif
3645
-
3646
-SIMDE_FUNCTION_ATTRIBUTES
3647
-simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
3648
-{
3649
-#if defined(SIMDE_X86_SSE2_NATIVE)
3650
-   return _mm_packs_epi32(a, b);
3651
-#else
3652
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3653
-                b_ = simde__m128i_to_private(b);
3654
-
3655
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3656
-   r_.neon_i16 =
3657
-       vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
3658
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3659
-   r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
3660
-#else
3661
-   SIMDE_VECTORIZE
3662
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
3663
-       r_.i16[i] = (a_.i32[i] > INT16_MAX)
3664
-                   ? INT16_MAX
3665
-                   : ((a_.i32[i] < INT16_MIN)
3666
-                          ? INT16_MIN
3667
-                          : HEDLEY_STATIC_CAST(int16_t,
3668
-                                   a_.i32[i]));
3669
-       r_.i16[i + 4] =
3670
-           (b_.i32[i] > INT16_MAX)
3671
-               ? INT16_MAX
3672
-               : ((b_.i32[i] < INT16_MIN)
3673
-                      ? INT16_MIN
3674
-                      : HEDLEY_STATIC_CAST(int16_t,
3675
-                               b_.i32[i]));
3676
-   }
3677
-#endif
3678
-
3679
-   return simde__m128i_from_private(r_);
3680
-#endif
3681
-}
3682
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3683
-#define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
3684
-#endif
3685
-
3686
-SIMDE_FUNCTION_ATTRIBUTES
3687
-simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
3688
-{
3689
-#if defined(SIMDE_X86_SSE2_NATIVE)
3690
-   return _mm_packus_epi16(a, b);
3691
-#else
3692
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3693
-                b_ = simde__m128i_to_private(b);
3694
-
3695
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3696
-   r_.neon_u8 =
3697
-       vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
3698
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
3699
-   r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
3700
-#else
3701
-   SIMDE_VECTORIZE
3702
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3703
-       r_.u8[i] = (a_.i16[i] > UINT8_MAX)
3704
-                  ? UINT8_MAX
3705
-                  : ((a_.i16[i] < 0)
3706
-                         ? UINT8_C(0)
3707
-                         : HEDLEY_STATIC_CAST(uint8_t,
3708
-                                  a_.i16[i]));
3709
-       r_.u8[i + 8] =
3710
-           (b_.i16[i] > UINT8_MAX)
3711
-               ? UINT8_MAX
3712
-               : ((b_.i16[i] < 0)
3713
-                      ? UINT8_C(0)
3714
-                      : HEDLEY_STATIC_CAST(uint8_t,
3715
-                               b_.i16[i]));
3716
-   }
3717
-#endif
3718
-
3719
-   return simde__m128i_from_private(r_);
3720
-#endif
3721
-}
3722
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3723
-#define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
3724
-#endif
3725
-
3726
-SIMDE_FUNCTION_ATTRIBUTES
3727
-void simde_mm_pause(void)
3728
-{
3729
-#if defined(SIMDE_X86_SSE2_NATIVE)
3730
-   _mm_pause();
3731
-#endif
3732
-}
3733
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3734
-#define _mm_pause() (simde_mm_pause())
3735
-#endif
3736
-
3737
-SIMDE_FUNCTION_ATTRIBUTES
3738
-simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
3739
-{
3740
-#if defined(SIMDE_X86_SSE2_NATIVE)
3741
-   return _mm_sad_epu8(a, b);
3742
-#else
3743
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3744
-                b_ = simde__m128i_to_private(b);
3745
-
3746
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
3747
-       uint16_t tmp = 0;
3748
-       SIMDE_VECTORIZE_REDUCTION(+ : tmp)
3749
-       for (size_t j = 0; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2);
3750
-            j++) {
3751
-           const size_t e = j + (i * 8);
3752
-           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e])
3753
-                            : (b_.u8[e] - a_.u8[e]);
3754
-       }
3755
-       r_.i64[i] = tmp;
3756
-   }
3757
-
3758
-   return simde__m128i_from_private(r_);
3759
-#endif
3760
-}
3761
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3762
-#define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
3763
-#endif
3764
-
3765
-SIMDE_FUNCTION_ATTRIBUTES
3766
-simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
3767
-                  int8_t e11, int8_t e10, int8_t e9, int8_t e8,
3768
-                  int8_t e7, int8_t e6, int8_t e5, int8_t e4,
3769
-                  int8_t e3, int8_t e2, int8_t e1, int8_t e0)
3770
-{
3771
-
3772
-#if defined(SIMDE_X86_SSE2_NATIVE)
3773
-   return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
3774
-               e4, e3, e2, e1, e0);
3775
-#else
3776
-   simde__m128i_private r_;
3777
-
3778
-#if defined(SIMDE_WASM_SIMD128_NATIVE)
3779
-   r_.wasm_v128 = wasm_i8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9,
3780
-                      e10, e11, e12, e13, e14, e15);
3781
-#else
3782
-   r_.i8[0] = e0;
3783
-   r_.i8[1] = e1;
3784
-   r_.i8[2] = e2;
3785
-   r_.i8[3] = e3;
3786
-   r_.i8[4] = e4;
3787
-   r_.i8[5] = e5;
3788
-   r_.i8[6] = e6;
3789
-   r_.i8[7] = e7;
3790
-   r_.i8[8] = e8;
3791
-   r_.i8[9] = e9;
3792
-   r_.i8[10] = e10;
3793
-   r_.i8[11] = e11;
3794
-   r_.i8[12] = e12;
3795
-   r_.i8[13] = e13;
3796
-   r_.i8[14] = e14;
3797
-   r_.i8[15] = e15;
3798
-#endif
3799
-
3800
-   return simde__m128i_from_private(r_);
3801
-#endif
3802
-}
3803
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3804
-#define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, \
3805
-            e2, e1, e0)                                               \
3806
-   simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,    \
3807
-             e4, e3, e2, e1, e0)
3808
-#endif
3809
-
3810
-SIMDE_FUNCTION_ATTRIBUTES
3811
-simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
3812
-               int16_t e3, int16_t e2, int16_t e1, int16_t e0)
3813
-{
3814
-#if defined(SIMDE_X86_SSE2_NATIVE)
3815
-   return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
3816
-#else
3817
-   simde__m128i_private r_;
3818
-
3819
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3820
-   SIMDE_ALIGN_AS(16, int16x8_t)
3821
-   int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
3822
-   r_.neon_i16 = vld1q_s16(data);
3823
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3824
-   r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
3825
-#else
3826
-   r_.i16[0] = e0;
3827
-   r_.i16[1] = e1;
3828
-   r_.i16[2] = e2;
3829
-   r_.i16[3] = e3;
3830
-   r_.i16[4] = e4;
3831
-   r_.i16[5] = e5;
3832
-   r_.i16[6] = e6;
3833
-   r_.i16[7] = e7;
3834
-#endif
3835
-
3836
-   return simde__m128i_from_private(r_);
3837
-#endif
3838
-}
3839
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3840
-#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
3841
-   simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
3842
-#endif
3843
-
3844
-SIMDE_FUNCTION_ATTRIBUTES
3845
-simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
3846
-{
3847
-#if defined(SIMDE_X86_SSE2_NATIVE)
3848
-   return _mm_set_epi32(e3, e2, e1, e0);
3849
-#else
3850
-   simde__m128i_private r_;
3851
-
3852
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3853
-   SIMDE_ALIGN_AS(16, int32x4_t) int32_t data[4] = {e0, e1, e2, e3};
3854
-   r_.neon_i32 = vld1q_s32(data);
3855
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3856
-   r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
3857
-#else
3858
-   r_.i32[0] = e0;
3859
-   r_.i32[1] = e1;
3860
-   r_.i32[2] = e2;
3861
-   r_.i32[3] = e3;
3862
-#endif
3863
-
3864
-   return simde__m128i_from_private(r_);
3865
-#endif
3866
-}
3867
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3868
-#define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0)
3869
-#endif
3870
-
3871
-SIMDE_FUNCTION_ATTRIBUTES
3872
-simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
3873
-{
3874
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3875
-   return _mm_set_epi64(e1, e0);
3876
-#else
3877
-   simde__m128i_private r_;
3878
-
3879
-   r_.m64_private[0] = simde__m64_to_private(e0);
3880
-   r_.m64_private[1] = simde__m64_to_private(e1);
3881
-
3882
-   return simde__m128i_from_private(r_);
3883
-#endif
3884
-}
3885
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3886
-#define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
3887
-#endif
3888
-
3889
-SIMDE_FUNCTION_ATTRIBUTES
3890
-simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
3891
-{
3892
-#if defined(SIMDE_X86_SSE2_NATIVE) && \
3893
-   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
3894
-   return _mm_set_epi64x(e1, e0);
3895
-#else
3896
-   simde__m128i_private r_;
3897
-
3898
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3899
-   r_.neon_i64 = vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1));
3900
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3901
-   r_.wasm_v128 = wasm_i64x2_make(e0, e1);
3902
-#else
3903
-   r_.i64[0] = e0;
3904
-   r_.i64[1] = e1;
3905
-#endif
3906
-
3907
-   return simde__m128i_from_private(r_);
3908
-#endif
3909
-}
3910
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3911
-#define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
3912
-#endif
3913
-
3914
-SIMDE_FUNCTION_ATTRIBUTES
3915
-simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
3916
-                uint8_t e12, uint8_t e11, uint8_t e10,
3917
-                uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
3918
-                uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
3919
-                uint8_t e1, uint8_t e0)
3920
-{
3921
-#if defined(SIMDE_X86_SSE2_NATIVE)
3922
-   return _mm_set_epi8(
3923
-       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14),
3924
-       HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
3925
-       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10),
3926
-       HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8),
3927
-       HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6),
3928
-       HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4),
3929
-       HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2),
3930
-       HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0));
3931
-#else
3932
-   simde__m128i_private r_;
3933
-
3934
-   r_.u8[0] = e0;
3935
-   r_.u8[1] = e1;
3936
-   r_.u8[2] = e2;
3937
-   r_.u8[3] = e3;
3938
-   r_.u8[4] = e4;
3939
-   r_.u8[5] = e5;
3940
-   r_.u8[6] = e6;
3941
-   r_.u8[7] = e7;
3942
-   r_.u8[8] = e8;
3943
-   r_.u8[9] = e9;
3944
-   r_.u8[10] = e10;
3945
-   r_.u8[11] = e11;
3946
-   r_.u8[12] = e12;
3947
-   r_.u8[13] = e13;
3948
-   r_.u8[14] = e14;
3949
-   r_.u8[15] = e15;
3950
-
3951
-   return simde__m128i_from_private(r_);
3952
-#endif
3953
-}
3954
-
3955
-SIMDE_FUNCTION_ATTRIBUTES
3956
-simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
3957
-                 uint16_t e4, uint16_t e3, uint16_t e2,
3958
-                 uint16_t e1, uint16_t e0)
3959
-{
3960
-#if defined(SIMDE_X86_SSE2_NATIVE)
3961
-   return _mm_set_epi16(
3962
-       HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6),
3963
-       HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4),
3964
-       HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2),
3965
-       HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0));
3966
-#else
3967
-   simde__m128i_private r_;
3968
-
3969
-   r_.u16[0] = e0;
3970
-   r_.u16[1] = e1;
3971
-   r_.u16[2] = e2;
3972
-   r_.u16[3] = e3;
3973
-   r_.u16[4] = e4;
3974
-   r_.u16[5] = e5;
3975
-   r_.u16[6] = e6;
3976
-   r_.u16[7] = e7;
3977
-
3978
-   return simde__m128i_from_private(r_);
3979
-#endif
3980
-}
3981
-
3982
-SIMDE_FUNCTION_ATTRIBUTES
3983
-simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
3984
-                 uint32_t e0)
3985
-{
3986
-#if defined(SIMDE_X86_SSE2_NATIVE)
3987
-   return _mm_set_epi32(HEDLEY_STATIC_CAST(int, e3),
3988
-                HEDLEY_STATIC_CAST(int, e2),
3989
-                HEDLEY_STATIC_CAST(int, e1),
3990
-                HEDLEY_STATIC_CAST(int, e0));
3991
-#else
3992
-   simde__m128i_private r_;
3993
-
3994
-   r_.u32[0] = e0;
3995
-   r_.u32[1] = e1;
3996
-   r_.u32[2] = e2;
3997
-   r_.u32[3] = e3;
3998
-
3999
-   return simde__m128i_from_private(r_);
4000
-#endif
4001
-}
4002
-
4003
-SIMDE_FUNCTION_ATTRIBUTES
4004
-simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
4005
-{
4006
-#if defined(SIMDE_X86_SSE2_NATIVE) && \
4007
-   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
4008
-   return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1),
4009
-                 HEDLEY_STATIC_CAST(int64_t, e0));
4010
-#else
4011
-   simde__m128i_private r_;
4012
-
4013
-   r_.u64[0] = e0;
4014
-   r_.u64[1] = e1;
4015
-
4016
-   return simde__m128i_from_private(r_);
4017
-#endif
4018
-}
4019
-
4020
-SIMDE_FUNCTION_ATTRIBUTES
4021
-simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
4022
-{
4023
-#if defined(SIMDE_X86_SSE2_NATIVE)
4024
-   return _mm_set_pd(e1, e0);
4025
-#else
4026
-   simde__m128d_private r_;
4027
-
4028
-#if defined(SIMDE_WASM_SIMD128_NATIVE)
4029
-   r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4030
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4031
-   r_.wasm_v128 = wasm_f64x2_make(e0, e1);
4032
-#else
4033
-   r_.f64[0] = e0;
4034
-   r_.f64[1] = e1;
4035
-#endif
4036
-
4037
-   return simde__m128d_from_private(r_);
4038
-#endif
4039
-}
4040
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4041
-#define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
4042
-#endif
4043
-
4044
-SIMDE_FUNCTION_ATTRIBUTES
4045
-simde__m128d simde_mm_set_pd1(simde_float64 a)
4046
-{
4047
-#if defined(SIMDE_X86_SSE2_NATIVE)
4048
-   return _mm_set1_pd(a);
4049
-#else
4050
-   simde__m128d_private r_;
4051
-
4052
-   r_.f64[0] = a;
4053
-   r_.f64[1] = a;
4054
-
4055
-   return simde__m128d_from_private(r_);
4056
-#endif
4057
-}
4058
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4059
-#define _mm_set_pd1(a) simde_mm_set1_pd(a)
4060
-#endif
4061
-
4062
-SIMDE_FUNCTION_ATTRIBUTES
4063
-simde__m128d simde_mm_set_sd(simde_float64 a)
4064
-{
4065
-#if defined(SIMDE_X86_SSE2_NATIVE)
4066
-   return _mm_set_sd(a);
4067
-#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4068
-   return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT32_C(0.0)), 0);
4069
-#else
4070
-   return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4071
-
4072
-#endif
4073
-}
4074
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4075
-#define _mm_set_sd(a) simde_mm_set_sd(a)
4076
-#endif
4077
-
4078
-SIMDE_FUNCTION_ATTRIBUTES
4079
-simde__m128i simde_mm_set1_epi8(int8_t a)
4080
-{
4081
-#if defined(SIMDE_X86_SSE2_NATIVE)
4082
-   return _mm_set1_epi8(a);
4083
-#else
4084
-   simde__m128i_private r_;
4085
-
4086
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4087
-   r_.neon_i8 = vdupq_n_s8(a);
4088
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4089
-   r_.wasm_v128 = wasm_i8x16_splat(a);
4090
-#else
4091
-   SIMDE_VECTORIZE
4092
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
4093
-       r_.i8[i] = a;
4094
-   }
4095
-#endif
4096
-
4097
-   return simde__m128i_from_private(r_);
4098
-#endif
4099
-}
4100
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4101
-#define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
4102
-#endif
4103
-
4104
-SIMDE_FUNCTION_ATTRIBUTES
4105
-simde__m128i simde_mm_set1_epi16(int16_t a)
4106
-{
4107
-#if defined(SIMDE_X86_SSE2_NATIVE)
4108
-   return _mm_set1_epi16(a);
4109
-#else
4110
-   simde__m128i_private r_;
4111
-
4112
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4113
-   r_.neon_i16 = vdupq_n_s16(a);
4114
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4115
-   r_.wasm_v128 = wasm_i16x8_splat(a);
4116
-#else
4117
-   SIMDE_VECTORIZE
4118
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4119
-       r_.i16[i] = a;
4120
-   }
4121
-#endif
4122
-
4123
-   return simde__m128i_from_private(r_);
4124
-#endif
4125
-}
4126
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4127
-#define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
4128
-#endif
4129
-
4130
-SIMDE_FUNCTION_ATTRIBUTES
4131
-simde__m128i simde_mm_set1_epi32(int32_t a)
4132
-{
4133
-#if defined(SIMDE_X86_SSE2_NATIVE)
4134
-   return _mm_set1_epi32(a);
4135
-#else
4136
-   simde__m128i_private r_;
4137
-
4138
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4139
-   r_.neon_i32 = vdupq_n_s32(a);
4140
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4141
-   r_.wasm_v128 = wasm_i32x4_splat(a);
4142
-#else
4143
-   SIMDE_VECTORIZE
4144
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4145
-       r_.i32[i] = a;
4146
-   }
4147
-#endif
4148
-
4149
-   return simde__m128i_from_private(r_);
4150
-#endif
4151
-}
4152
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4153
-#define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
4154
-#endif
4155
-
4156
-SIMDE_FUNCTION_ATTRIBUTES
4157
-simde__m128i simde_mm_set1_epi64x(int64_t a)
4158
-{
4159
-#if defined(SIMDE_X86_SSE2_NATIVE) && \
4160
-   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
4161
-   return _mm_set1_epi64x(a);
4162
-#else
4163
-   simde__m128i_private r_;
4164
-
4165
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4166
-   r_.neon_i64 = vmovq_n_s64(a);
4167
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4168
-   r_.wasm_v128 = wasm_i64x2_splat(a);
4169
-#else
4170
-   SIMDE_VECTORIZE
4171
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4172
-       r_.i64[i] = a;
4173
-   }
4174
-#endif
4175
-
4176
-   return simde__m128i_from_private(r_);
4177
-#endif
4178
-}
4179
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4180
-#define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
4181
-#endif
4182
-
4183
-SIMDE_FUNCTION_ATTRIBUTES
4184
-simde__m128i simde_mm_set1_epi64(simde__m64 a)
4185
-{
4186
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4187
-   return _mm_set1_epi64(a);
4188
-#else
4189
-   simde__m64_private a_ = simde__m64_to_private(a);
4190
-   return simde_mm_set1_epi64x(a_.i64[0]);
4191
-#endif
4192
-}
4193
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4194
-#define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
4195
-#endif
4196
-
4197
-SIMDE_FUNCTION_ATTRIBUTES
4198
-simde__m128i simde_x_mm_set1_epu8(uint8_t value)
4199
-{
4200
-   return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
4201
-}
4202
-
4203
-SIMDE_FUNCTION_ATTRIBUTES
4204
-simde__m128i simde_x_mm_set1_epu16(uint16_t value)
4205
-{
4206
-   return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
4207
-}
4208
-
4209
-SIMDE_FUNCTION_ATTRIBUTES
4210
-simde__m128i simde_x_mm_set1_epu32(uint32_t value)
4211
-{
4212
-   return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
4213
-}
4214
-
4215
-SIMDE_FUNCTION_ATTRIBUTES
4216
-simde__m128i simde_x_mm_set1_epu64(uint64_t value)
4217
-{
4218
-   return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
4219
-}
4220
-
4221
-SIMDE_FUNCTION_ATTRIBUTES
4222
-simde__m128d simde_mm_set1_pd(simde_float64 a)
4223
-{
4224
-#if defined(SIMDE_X86_SSE2_NATIVE)
4225
-   return _mm_set1_pd(a);
4226
-#else
4227
-   simde__m128d_private r_;
4228
-
4229
-#if defined(SIMDE_WASM_SIMD128_NATIVE)
4230
-   r_.wasm_v128 = wasm_f64x2_splat(a);
4231
-#else
4232
-   SIMDE_VECTORIZE
4233
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4234
-       r_.f64[i] = a;
4235
-   }
4236
-#endif
4237
-
4238
-   return simde__m128d_from_private(r_);
4239
-#endif
4240
-}
4241
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4242
-#define _mm_set1_pd(a) simde_mm_set1_pd(a)
4243
-#endif
4244
-
4245
-SIMDE_FUNCTION_ATTRIBUTES
4246
-simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4247
-               int8_t e11, int8_t e10, int8_t e9, int8_t e8,
4248
-               int8_t e7, int8_t e6, int8_t e5, int8_t e4,
4249
-               int8_t e3, int8_t e2, int8_t e1, int8_t e0)
4250
-{
4251
-#if defined(SIMDE_X86_SSE2_NATIVE)
4252
-   return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
4253
-                e4, e3, e2, e1, e0);
4254
-#else
4255
-   return simde_mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10,
4256
-                e11, e12, e13, e14, e15);
4257
-#endif
4258
-}
4259
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4260
-#define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,  \
4261
-             e3, e2, e1, e0)                                        \
4262
-   simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, \
4263
-              e4, e3, e2, e1, e0)
4264
-#endif
4265
-
4266
-SIMDE_FUNCTION_ATTRIBUTES
4267
-simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4268
-                int16_t e3, int16_t e2, int16_t e1, int16_t e0)
4269
-{
4270
-#if defined(SIMDE_X86_SSE2_NATIVE)
4271
-   return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4272
-#else
4273
-   return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
4274
-#endif
4275
-}
4276
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4277
-#define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
4278
-   simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
4279
-#endif
4280
-
4281
-SIMDE_FUNCTION_ATTRIBUTES
4282
-simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
4283
-{
4284
-#if defined(SIMDE_X86_SSE2_NATIVE)
4285
-   return _mm_setr_epi32(e3, e2, e1, e0);
4286
-#else
4287
-   return simde_mm_set_epi32(e0, e1, e2, e3);
4288
-#endif
4289
-}
4290
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4291
-#define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
4292
-#endif
4293
-
4294
-SIMDE_FUNCTION_ATTRIBUTES
4295
-simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
4296
-{
4297
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4298
-   return _mm_setr_epi64(e1, e0);
4299
-#else
4300
-   return simde_mm_set_epi64(e0, e1);
4301
-#endif
4302
-}
4303
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4304
-#define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
4305
-#endif
4306
-
4307
-SIMDE_FUNCTION_ATTRIBUTES
4308
-simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
4309
-{
4310
-#if defined(SIMDE_X86_SSE2_NATIVE)
4311
-   return _mm_setr_pd(e1, e0);
4312
-#else
4313
-   return simde_mm_set_pd(e0, e1);
4314
-#endif
4315
-}
4316
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4317
-#define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
4318
-#endif
4319
-
4320
-SIMDE_FUNCTION_ATTRIBUTES
4321
-simde__m128d simde_mm_setzero_pd(void)
4322
-{
4323
-#if defined(SIMDE_X86_SSE2_NATIVE)
4324
-   return _mm_setzero_pd();
4325
-#else
4326
-   simde__m128d_private r_;
4327
-
4328
-   SIMDE_VECTORIZE
4329
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
4330
-       r_.i32f[i] = 0;
4331
-   }
4332
-
4333
-   return simde__m128d_from_private(r_);
4334
-#endif
4335
-}
4336
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4337
-#define _mm_setzero_pd() simde_mm_setzero_pd()
4338
-#endif
4339
-
4340
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4341
-HEDLEY_DIAGNOSTIC_PUSH
4342
-SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4343
-#endif
4344
-
4345
-SIMDE_FUNCTION_ATTRIBUTES
4346
-simde__m128d simde_mm_undefined_pd(void)
4347
-{
4348
-   simde__m128d_private r_;
4349
-
4350
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4351
-   r_.n = _mm_undefined_pd();
4352
-#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4353
-   r_ = simde__m128d_to_private(simde_mm_setzero_pd());
4354
-#endif
4355
-
4356
-   return simde__m128d_from_private(r_);
4357
-}
4358
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4359
-#define _mm_undefined_pd() simde_mm_undefined_pd()
4360
-#endif
4361
-
4362
-SIMDE_FUNCTION_ATTRIBUTES
4363
-simde__m128i simde_mm_undefined_si128(void)
4364
-{
4365
-   simde__m128i_private r_;
4366
-
4367
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
4368
-   r_.n = _mm_undefined_si128();
4369
-#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4370
-   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
4371
-#endif
4372
-
4373
-   return simde__m128i_from_private(r_);
4374
-}
4375
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4376
-#define _mm_undefined_si128() (simde_mm_undefined_si128())
4377
-#endif
4378
-
4379
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4380
-HEDLEY_DIAGNOSTIC_POP
4381
-#endif
4382
-
4383
-SIMDE_FUNCTION_ATTRIBUTES
4384
-simde__m128d simde_x_mm_setone_pd(void)
4385
-{
4386
-   return simde_mm_castps_pd(simde_x_mm_setone_ps());
4387
-}
4388
-
4389
-SIMDE_FUNCTION_ATTRIBUTES
4390
-simde__m128i simde_x_mm_setone_si128(void)
4391
-{
4392
-   return simde_mm_castps_si128(simde_x_mm_setone_ps());
4393
-}
4394
-
4395
-SIMDE_FUNCTION_ATTRIBUTES
4396
-simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
4397
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4398
-{
4399
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4400
-
4401
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4402
-       r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
4403
-   }
4404
-
4405
-   return simde__m128i_from_private(r_);
4406
-}
4407
-#if defined(SIMDE_X86_SSE2_NATIVE)
4408
-#define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
4409
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
4410
-#define simde_mm_shuffle_epi32(a, imm8)                               \
4411
-   (__extension__({                                              \
4412
-       const simde__m128i_private simde__tmp_a_ =            \
4413
-           simde__m128i_to_private(a);                   \
4414
-       simde__m128i_from_private((simde__m128i_private){     \
4415
-           .i32 = SIMDE_SHUFFLE_VECTOR_(                 \
4416
-               32, 16, (simde__tmp_a_).i32,          \
4417
-               (simde__tmp_a_).i32, ((imm8)) & 3,    \
4418
-               ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
4419
-               ((imm8) >> 6) & 3)});                 \
4420
-   }))
4421
-#endif
4422
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4423
-#define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
4424
-#endif
4425
-
4426
-SIMDE_FUNCTION_ATTRIBUTES
4427
-simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
4428
-   SIMDE_REQUIRE_RANGE(imm8, 0, 3)
4429
-{
4430
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4431
-                b_ = simde__m128d_to_private(b);
4432
-
4433
-   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
4434
-   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
4435
-
4436
-   return simde__m128d_from_private(r_);
4437
-}
4438
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
4439
-#define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
4440
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
4441
-#define simde_mm_shuffle_pd(a, b, imm8)                                     \
4442
-   (__extension__({                                                    \
4443
-       simde__m128d_from_private((simde__m128d_private){           \
4444
-           .f64 = SIMDE_SHUFFLE_VECTOR_(                       \
4445
-               64, 16, simde__m128d_to_private(a).f64,     \
4446
-               simde__m128d_to_private(b).f64,             \
4447
-               (((imm8)) & 1), (((imm8) >> 1) & 1) + 2)}); \
4448
-   }))
4449
-#endif
4450
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4451
-#define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
4452
-#endif
4453
-
4454
-SIMDE_FUNCTION_ATTRIBUTES
4455
-simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
4456
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4457
-{
4458
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4459
-
4460
-   SIMDE_VECTORIZE
4461
-   for (size_t i = 0; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
4462
-        i++) {
4463
-       r_.i16[i] = a_.i16[i];
4464
-   }
4465
-   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
4466
-        i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4467
-       r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
4468
-   }
4469
-
4470
-   return simde__m128i_from_private(r_);
4471
-}
4472
-#if defined(SIMDE_X86_SSE2_NATIVE)
4473
-#define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
4474
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
4475
-#define simde_mm_shufflehi_epi16(a, imm8)                                    \
4476
-   (__extension__({                                                     \
4477
-       const simde__m128i_private simde__tmp_a_ =                   \
4478
-           simde__m128i_to_private(a);                          \
4479
-       simde__m128i_from_private((simde__m128i_private){            \
4480
-           .i16 = SIMDE_SHUFFLE_VECTOR_(                        \
4481
-               16, 16, (simde__tmp_a_).i16,                 \
4482
-               (simde__tmp_a_).i16, 0, 1, 2, 3,             \
4483
-               (((imm8)) & 3) + 4, (((imm8) >> 2) & 3) + 4, \
4484
-               (((imm8) >> 4) & 3) + 4,                     \
4485
-               (((imm8) >> 6) & 3) + 4)});                  \
4486
-   }))
4487
-#endif
4488
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4489
-#define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
4490
-#endif
4491
-
4492
-SIMDE_FUNCTION_ATTRIBUTES
4493
-simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
4494
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4495
-{
4496
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4497
-
4498
-   for (size_t i = 0; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2);
4499
-        i++) {
4500
-       r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
4501
-   }
4502
-   SIMDE_VECTORIZE
4503
-   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
4504
-        i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4505
-       r_.i16[i] = a_.i16[i];
4506
-   }
4507
-
4508
-   return simde__m128i_from_private(r_);
4509
-}
4510
-#if defined(SIMDE_X86_SSE2_NATIVE)
4511
-#define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
4512
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
4513
-#define simde_mm_shufflelo_epi16(a, imm8)                                 \
4514
-   (__extension__({                                                  \
4515
-       const simde__m128i_private simde__tmp_a_ =                \
4516
-           simde__m128i_to_private(a);                       \
4517
-       simde__m128i_from_private((simde__m128i_private){         \
4518
-           .i16 = SIMDE_SHUFFLE_VECTOR_(                     \
4519
-               16, 16, (simde__tmp_a_).i16,              \
4520
-               (simde__tmp_a_).i16, (((imm8)) & 3),      \
4521
-               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
4522
-               (((imm8) >> 6) & 3), 4, 5, 6, 7)});       \
4523
-   }))
4524
-#endif
4525
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4526
-#define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
4527
-#endif
4528
-
4529
-SIMDE_FUNCTION_ATTRIBUTES
4530
-simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
4531
-{
4532
-#if defined(SIMDE_X86_SSE2_NATIVE)
4533
-   return _mm_sll_epi16(a, count);
4534
-#else
4535
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4536
-                count_ = simde__m128i_to_private(count);
4537
-
4538
-   if (count_.u64[0] > 15)
4539
-       return simde_mm_setzero_si128();
4540
-
4541
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4542
-   r_.u16 = (a_.u16 << count_.u64[0]);
4543
-#else
4544
-   SIMDE_VECTORIZE
4545
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
4546
-       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
4547
-                          (a_.u16[i] << count_.u64[0]));
4548
-   }
4549
-#endif
4550
-
4551
-   return simde__m128i_from_private(r_);
4552
-#endif
4553
-}
4554
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4555
-#define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
4556
-#endif
4557
-
4558
-SIMDE_FUNCTION_ATTRIBUTES
4559
-simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
4560
-{
4561
-#if defined(SIMDE_X86_SSE2_NATIVE)
4562
-   return _mm_sll_epi32(a, count);
4563
-#else
4564
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4565
-                count_ = simde__m128i_to_private(count);
4566
-
4567
-   if (count_.u64[0] > 31)
4568
-       return simde_mm_setzero_si128();
4569
-
4570
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4571
-   r_.u32 = (a_.u32 << count_.u64[0]);
4572
-#else
4573
-   SIMDE_VECTORIZE
4574
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
4575
-       r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t,
4576
-                          (a_.u32[i] << count_.u64[0]));
4577
-   }
4578
-#endif
4579
-
4580
-   return simde__m128i_from_private(r_);
4581
-#endif
4582
-}
4583
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4584
-#define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
4585
-#endif
4586
-
4587
-SIMDE_FUNCTION_ATTRIBUTES
4588
-simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
4589
-{
4590
-#if defined(SIMDE_X86_SSE2_NATIVE)
4591
-   return _mm_sll_epi64(a, count);
4592
-#else
4593
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4594
-                count_ = simde__m128i_to_private(count);
4595
-
4596
-   if (count_.u64[0] > 63)
4597
-       return simde_mm_setzero_si128();
4598
-
4599
-   const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
4600
-#if !defined(SIMDE_BUG_GCC_94488)
4601
-   SIMDE_VECTORIZE
4602
-#endif
4603
-   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
4604
-       r_.u64[i] = a_.u64[i] << s;
4605
-   }
4606
-
4607
-   return simde__m128i_from_private(r_);
4608
-#endif
4609
-}
4610
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4611
-#define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
4612
-#endif
4613
-
4614
-SIMDE_FUNCTION_ATTRIBUTES
4615
-simde__m128d simde_mm_sqrt_pd(simde__m128d a)
4616
-{
4617
-#if defined(SIMDE_X86_SSE2_NATIVE)
4618
-   return _mm_sqrt_pd(a);
4619
-#else
4620
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
4621
-
4622
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4623
-   r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
4624
-#elif defined(simde_math_sqrt)
4625
-   SIMDE_VECTORIZE
4626
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
4627
-       r_.f64[i] = simde_math_sqrt(a_.f64[i]);
4628
-   }
4629
-#else
4630
-   HEDLEY_UNREACHABLE();
4631
-#endif
4632
-
4633
-   return simde__m128d_from_private(r_);
4634
-#endif
4635
-}
4636
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4637
-#define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
4638
-#endif
4639
-
4640
-SIMDE_FUNCTION_ATTRIBUTES
4641
-simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
4642
-{
4643
-#if defined(SIMDE_X86_SSE2_NATIVE)
4644
-   return _mm_sqrt_sd(a, b);
4645
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
4646
-   return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
4647
-#else
4648
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4649
-                b_ = simde__m128d_to_private(b);
4650
-
4651
-#if defined(simde_math_sqrt)
4652
-   r_.f64[0] = simde_math_sqrt(b_.f64[0]);
4653
-   r_.f64[1] = a_.f64[1];
4654
-#else
4655
-   HEDLEY_UNREACHABLE();
4656
-#endif
4657
-
4658
-   return simde__m128d_from_private(r_);
4659
-#endif
4660
-}
4661
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4662
-#define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
4663
-#endif
4664
-
4665
-SIMDE_FUNCTION_ATTRIBUTES
4666
-simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
4667
-{
4668
-#if defined(SIMDE_X86_SSE2_NATIVE)
4669
-   return _mm_srl_epi16(a, count);
4670
-#else
4671
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4672
-                count_ = simde__m128i_to_private(count);
4673
-
4674
-   const int cnt = HEDLEY_STATIC_CAST(
4675
-       int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
4676
-
4677
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4678
-   r_.neon_u16 = vshlq_u16(a_.neon_u16,
4679
-               vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
4680
-#else
4681
-   SIMDE_VECTORIZE
4682
-   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
4683
-       r_.u16[i] = a_.u16[i] >> cnt;
4684
-   }
4685
-#endif
4686
-
4687
-   return simde__m128i_from_private(r_);
4688
-#endif
4689
-}
4690
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4691
-#define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
4692
-#endif
4693
-
4694
-SIMDE_FUNCTION_ATTRIBUTES
4695
-simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
4696
-{
4697
-#if defined(SIMDE_X86_SSE2_NATIVE)
4698
-   return _mm_srl_epi32(a, count);
4699
-#else
4700
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4701
-                count_ = simde__m128i_to_private(count);
4702
-
4703
-   const int cnt = HEDLEY_STATIC_CAST(
4704
-       int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
4705
-
4706
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4707
-   r_.neon_u32 = vshlq_u32(a_.neon_u32,
4708
-               vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
4709
-#else
4710
-   SIMDE_VECTORIZE
4711
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
4712
-       r_.u32[i] = a_.u32[i] >> cnt;
4713
-   }
4714
-#endif
4715
-
4716
-   return simde__m128i_from_private(r_);
4717
-#endif
4718
-}
4719
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4720
-#define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
4721
-#endif
4722
-
4723
-SIMDE_FUNCTION_ATTRIBUTES
4724
-simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
4725
-{
4726
-#if defined(SIMDE_X86_SSE2_NATIVE)
4727
-   return _mm_srl_epi64(a, count);
4728
-#else
4729
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4730
-                count_ = simde__m128i_to_private(count);
4731
-
4732
-   const int cnt = HEDLEY_STATIC_CAST(
4733
-       int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
4734
-
4735
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4736
-   r_.neon_u64 = vshlq_u64(a_.neon_u64,
4737
-               vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
4738
-#else
4739
-#if !defined(SIMDE_BUG_GCC_94488)
4740
-   SIMDE_VECTORIZE
4741
-#endif
4742
-   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
4743
-       r_.u64[i] = a_.u64[i] >> cnt;
4744
-   }
4745
-#endif
4746
-
4747
-   return simde__m128i_from_private(r_);
4748
-#endif
4749
-}
4750
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4751
-#define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
4752
-#endif
4753
-
4754
-SIMDE_FUNCTION_ATTRIBUTES
4755
-simde__m128i simde_mm_srai_epi16(simde__m128i a, const int imm8)
4756
-   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
4757
-{
4758
-   /* MSVC requires a range of (0, 255). */
4759
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4760
-
4761
-   const int cnt = (imm8 & ~15) ? 15 : imm8;
4762
-
4763
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4764
-   r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(-cnt));
4765
-#else
4766
-   SIMDE_VECTORIZE
4767
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
4768
-       r_.i16[i] = a_.i16[i] >> cnt;
4769
-   }
4770
-#endif
4771
-
4772
-   return simde__m128i_from_private(r_);
4773
-}
4774
-#if defined(SIMDE_X86_SSE2_NATIVE)
4775
-#define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
4776
-#endif
4777
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4778
-#define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
4779
-#endif
4780
-
4781
-SIMDE_FUNCTION_ATTRIBUTES
4782
-simde__m128i simde_mm_srai_epi32(simde__m128i a, const int imm8)
4783
-   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
4784
-{
4785
-   /* MSVC requires a range of (0, 255). */
4786
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4787
-
4788
-   const int cnt = (imm8 & ~31) ? 31 : imm8;
4789
-
4790
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4791
-   r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
4792
-#else
4793
-   SIMDE_VECTORIZE
4794
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) {
4795
-       r_.i32[i] = a_.i32[i] >> cnt;
4796
-   }
4797
-#endif
4798
-
4799
-   return simde__m128i_from_private(r_);
4800
-}
4801
-#if defined(SIMDE_X86_SSE2_NATIVE)
4802
-#define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
4803
-#endif
4804
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4805
-#define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
4806
-#endif
4807
-
4808
-SIMDE_FUNCTION_ATTRIBUTES
4809
-simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
4810
-{
4811
-#if defined(SIMDE_X86_SSE2_NATIVE)
4812
-   return _mm_sra_epi16(a, count);
4813
-#else
4814
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4815
-                count_ = simde__m128i_to_private(count);
4816
-
4817
-   const int cnt = HEDLEY_STATIC_CAST(
4818
-       int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
4819
-
4820
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4821
-   r_.neon_i16 = vshlq_s16(a_.neon_i16,
4822
-               vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
4823
-#else
4824
-   SIMDE_VECTORIZE
4825
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4826
-       r_.i16[i] = a_.i16[i] >> cnt;
4827
-   }
4828
-#endif
4829
-
4830
-   return simde__m128i_from_private(r_);
4831
-#endif
4832
-}
4833
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4834
-#define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
4835
-#endif
4836
-
4837
-SIMDE_FUNCTION_ATTRIBUTES
4838
-simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
4839
-{
4840
-#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
4841
-   return _mm_sra_epi32(a, count);
4842
-#else
4843
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4844
-                count_ = simde__m128i_to_private(count);
4845
-
4846
-   const int cnt = count_.u64[0] > 31
4847
-               ? 31
4848
-               : HEDLEY_STATIC_CAST(int, count_.u64[0]);
4849
-
4850
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4851
-   r_.neon_i32 = vshlq_s32(a_.neon_i32,
4852
-               vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
4853
-#else
4854
-   SIMDE_VECTORIZE
4855
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4856
-       r_.i32[i] = a_.i32[i] >> cnt;
4857
-   }
4858
-#endif
4859
-
4860
-   return simde__m128i_from_private(r_);
4861
-#endif
4862
-}
4863
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4864
-#define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
4865
-#endif
4866
-
4867
-SIMDE_FUNCTION_ATTRIBUTES
4868
-simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
4869
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4870
-{
4871
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4872
-
4873
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4874
-   r_.i16 = a_.i16 << (imm8 & 0xff);
4875
-#else
4876
-   const int s =
4877
-       (imm8 >
4878
-        HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1)
4879
-           ? 0
4880
-           : imm8;
4881
-   SIMDE_VECTORIZE
4882
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4883
-       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
4884
-   }
4885
-#endif
4886
-
4887
-   return simde__m128i_from_private(r_);
4888
-}
4889
-#if defined(SIMDE_X86_SSE2_NATIVE)
4890
-#define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
4891
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4892
-#define simde_mm_slli_epi16(a, imm8) \
4893
-   simde__m128i_from_neon_u16(  \
4894
-       vshlq_n_u16(simde__m128i_to_neon_u16(a), (imm8)))
4895
-#endif
4896
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4897
-#define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
4898
-#endif
4899
-
4900
-SIMDE_FUNCTION_ATTRIBUTES
4901
-simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
4902
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4903
-{
4904
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4905
-
4906
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4907
-   r_.i32 = a_.i32 << imm8;
4908
-#else
4909
-   SIMDE_VECTORIZE
4910
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4911
-       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
4912
-   }
4913
-#endif
4914
-
4915
-   return simde__m128i_from_private(r_);
4916
-}
4917
-#if defined(SIMDE_X86_SSE2_NATIVE)
4918
-#define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
4919
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4920
-#define simde_mm_slli_epi32(a, imm8) \
4921
-   simde__m128i_from_neon_u32(  \
4922
-       vshlq_n_u32(simde__m128i_to_neon_u32(a), (imm8)))
4923
-#endif
4924
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4925
-#define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
4926
-#endif
4927
-
4928
-SIMDE_FUNCTION_ATTRIBUTES
4929
-simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
4930
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4931
-{
4932
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4933
-
4934
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4935
-   r_.i64 = a_.i64 << imm8;
4936
-#else
4937
-   SIMDE_VECTORIZE
4938
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4939
-       r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
4940
-   }
4941
-#endif
4942
-
4943
-   return simde__m128i_from_private(r_);
4944
-}
4945
-#if defined(SIMDE_X86_SSE2_NATIVE)
4946
-#define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
4947
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4948
-#define simde_mm_slli_epi64(a, imm8) \
4949
-   simde__m128i_from_neon_u64(  \
4950
-       vshlq_n_u64(simde__m128i_to_neon_u64(a), (imm8)))
4951
-#endif
4952
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4953
-#define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
4954
-#endif
4955
-
4956
-SIMDE_FUNCTION_ATTRIBUTES
4957
-simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
4958
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4959
-{
4960
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4961
-
4962
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4963
-   r_.u16 = a_.u16 >> imm8;
4964
-#else
4965
-   SIMDE_VECTORIZE
4966
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4967
-       r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
4968
-   }
4969
-#endif
4970
-
4971
-   return simde__m128i_from_private(r_);
4972
-}
4973
-#if defined(SIMDE_X86_SSE2_NATIVE)
4974
-#define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
4975
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4976
-#define simde_mm_srli_epi16(a, imm8) \
4977
-   simde__m128i_from_neon_u16(  \
4978
-       vshrq_n_u16(simde__m128i_to_neon_u16(a), imm8))
4979
-#endif
4980
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4981
-#define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
4982
-#endif
4983
-
4984
-SIMDE_FUNCTION_ATTRIBUTES
4985
-simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
4986
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
4987
-{
4988
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4989
-
4990
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
4991
-   r_.u32 = a_.u32 >> (imm8 & 0xff);
4992
-#else
4993
-   SIMDE_VECTORIZE
4994
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4995
-       r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
4996
-   }
4997
-#endif
4998
-
4999
-   return simde__m128i_from_private(r_);
5000
-}
5001
-#if defined(SIMDE_X86_SSE2_NATIVE)
5002
-#define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
5003
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5004
-#define simde_mm_srli_epi32(a, imm8) \
5005
-   simde__m128i_from_neon_u32(  \
5006
-       vshrq_n_u32(simde__m128i_to_neon_u32(a), imm8))
5007
-#endif
5008
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5009
-#define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
5010
-#endif
5011
-
5012
-SIMDE_FUNCTION_ATTRIBUTES
5013
-simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
5014
-   SIMDE_REQUIRE_RANGE(imm8, 0, 255)
5015
-{
5016
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5017
-
5018
-   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
5019
-       return simde_mm_setzero_si128();
5020
-
5021
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5022
-   r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
5023
-#else
5024
-#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
5025
-   r_.u64 = a_.u64 >> imm8;
5026
-#else
5027
-   SIMDE_VECTORIZE
5028
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
5029
-       r_.u64[i] = a_.u64[i] >> imm8;
5030
-   }
5031
-#endif
5032
-#endif
5033
-
5034
-   return simde__m128i_from_private(r_);
5035
-}
5036
-#if defined(SIMDE_X86_SSE2_NATIVE)
5037
-#define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
5038
-#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
5039
-#define simde_mm_srli_epi64(a, imm8)                            \
5040
-   ((imm8 == 0) ? (a)                                      \
5041
-            : (simde__m128i_from_neon_u64(vshrq_n_u64( \
5042
-                  simde__m128i_to_neon_u64(a), imm8))))
5043
-#endif
5044
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5045
-#define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
5046
-#endif
5047
-
5048
-SIMDE_FUNCTION_ATTRIBUTES
5049
-void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
5050
-              simde__m128d a)
5051
-{
5052
-   simde_assert_aligned(16, mem_addr);
5053
-
5054
-#if defined(SIMDE_X86_SSE2_NATIVE)
5055
-   _mm_store_pd(mem_addr, a);
5056
-#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5057
-   vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
5058
-#else
5059
-   simde_memcpy(mem_addr, &a, sizeof(a));
5060
-#endif
5061
-}
5062
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5063
-#define _mm_store_pd(mem_addr, a) \
5064
-   simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5065
-#endif
5066
-
5067
-SIMDE_FUNCTION_ATTRIBUTES
5068
-void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
5069
-           simde__m128d a)
5070
-{
5071
-   simde_assert_aligned(16, mem_addr);
5072
-
5073
-#if defined(SIMDE_X86_SSE2_NATIVE)
5074
-   _mm_store1_pd(mem_addr, a);
5075
-#else
5076
-   simde__m128d_private a_ = simde__m128d_to_private(a);
5077
-
5078
-   mem_addr[0] = a_.f64[0];
5079
-   mem_addr[1] = a_.f64[0];
5080
-#endif
5081
-}
5082
-#define simde_mm_store_pd1(mem_addr, a) \
5083
-   simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5084
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5085
-#define _mm_store1_pd(mem_addr, a) \
5086
-   simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5087
-#define _mm_store_pd1(mem_addr, a) \
5088
-   simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5089
-#endif
5090
-
5091
-SIMDE_FUNCTION_ATTRIBUTES
5092
-void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
5093
-{
5094
-#if defined(SIMDE_X86_SSE2_NATIVE)
5095
-   _mm_store_sd(mem_addr, a);
5096
-#else
5097
-   simde__m128d_private a_ = simde__m128d_to_private(a);
5098
-
5099
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5100
-   simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
5101
-   simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5102
-#else
5103
-   simde_float64 v = a_.f64[0];
5104
-   simde_memcpy(mem_addr, &v, sizeof(simde_float64));
5105
-#endif
5106
-#endif
5107
-}
5108
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5109
-#define _mm_store_sd(mem_addr, a) \
5110
-   simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5111
-#endif
5112
-
5113
-SIMDE_FUNCTION_ATTRIBUTES
5114
-void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
5115
-{
5116
-#if defined(SIMDE_X86_SSE2_NATIVE)
5117
-   _mm_store_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
5118
-#else
5119
-   simde__m128i_private a_ = simde__m128i_to_private(a);
5120
-
5121
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5122
-   vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), a_.neon_i32);
5123
-#else
5124
-   simde_memcpy(SIMDE_ASSUME_ALIGNED(16, mem_addr), &a_, sizeof(a_));
5125
-#endif
5126
-#endif
5127
-}
5128
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5129
-#define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
5130
-#endif
5131
-
5132
-SIMDE_FUNCTION_ATTRIBUTES
5133
-void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
5134
-{
5135
-#if defined(SIMDE_X86_SSE2_NATIVE)
5136
-   _mm_storeh_pd(mem_addr, a);
5137
-#else
5138
-   simde__m128d_private a_ = simde__m128d_to_private(a);
5139
-
5140
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5141
-   *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
5142
-#else
5143
-   *mem_addr = a_.f64[1];
5144
-#endif
5145
-#endif
5146
-}
5147
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5148
-#define _mm_storeh_pd(mem_addr, a) \
5149
-   simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5150
-#endif
5151
-
5152
-SIMDE_FUNCTION_ATTRIBUTES
5153
-void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
5154
-{
5155
-#if defined(SIMDE_X86_SSE2_NATIVE)
5156
-   _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
5157
-#else
5158
-   simde__m128i_private a_ = simde__m128i_to_private(a);
5159
-   int64_t tmp;
5160
-
5161
-   /* memcpy to prevent aliasing, tmp because we can't take the
5162
-     * address of a vector element. */
5163
-
5164
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5165
-   tmp = vgetq_lane_s64(a_.neon_i64, 0);
5166
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
5167
-#if defined(SIMDE_BUG_GCC_95227)
5168
-   (void)a_;
5169
-#endif
5170
-   tmp = vec_extract(a_.altivec_i64, 0);
5171
-#else
5172
-   tmp = a_.i64[0];
5173
-#endif
5174
-
5175
-   simde_memcpy(mem_addr, &tmp, sizeof(tmp));
5176
-#endif
5177
-}
5178
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5179
-#define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
5180
-#endif
5181
-
5182
-SIMDE_FUNCTION_ATTRIBUTES
5183
-void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
5184
-{
5185
-#if defined(SIMDE_X86_SSE2_NATIVE)
5186
-   _mm_storel_pd(mem_addr, a);
5187
-#else
5188
-   simde__m128d_private a_ = simde__m128d_to_private(a);
5189
-
5190
-   *mem_addr = a_.f64[0];
5191
-#endif
5192
-}
5193
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5194
-#define _mm_storel_pd(mem_addr, a) \
5195
-   simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5196
-#endif
5197
-
5198
-SIMDE_FUNCTION_ATTRIBUTES
5199
-void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
5200
-{
5201
-   simde_assert_aligned(16, mem_addr);
5202
-
5203
-#if defined(SIMDE_X86_SSE2_NATIVE)
5204
-   _mm_storer_pd(mem_addr, a);
5205
-#else
5206
-   simde__m128d_private a_ = simde__m128d_to_private(a);
5207
-
5208
-   mem_addr[0] = a_.f64[1];
5209
-   mem_addr[1] = a_.f64[0];
5210
-#endif
5211
-}
5212
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5213
-#define _mm_storer_pd(mem_addr, a) \
5214
-   simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5215
-#endif
5216
-
5217
-SIMDE_FUNCTION_ATTRIBUTES
5218
-void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
5219
-{
5220
-#if defined(SIMDE_X86_SSE2_NATIVE)
5221
-   _mm_storeu_pd(mem_addr, a);
5222
-#else
5223
-   simde_memcpy(mem_addr, &a, sizeof(a));
5224
-#endif
5225
-}
5226
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5227
-#define _mm_storeu_pd(mem_addr, a) \
5228
-   simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5229
-#endif
5230
-
5231
-SIMDE_FUNCTION_ATTRIBUTES
5232
-void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
5233
-{
5234
-#if defined(SIMDE_X86_SSE2_NATIVE)
5235
-   _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
5236
-#else
5237
-   simde__m128i_private a_ = simde__m128i_to_private(a);
5238
-
5239
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5240
-   vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), a_.neon_i32);
5241
-#else
5242
-   simde_memcpy(mem_addr, &a_, sizeof(a_));
5243
-#endif
5244
-#endif
5245
-}
5246
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5247
-#define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
5248
-#endif
5249
-
5250
-SIMDE_FUNCTION_ATTRIBUTES
5251
-void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
5252
-           simde__m128d a)
5253
-{
5254
-   simde_assert_aligned(16, mem_addr);
5255
-
5256
-#if defined(SIMDE_X86_SSE2_NATIVE)
5257
-   _mm_stream_pd(mem_addr, a);
5258
-#else
5259
-   simde_memcpy(mem_addr, &a, sizeof(a));
5260
-#endif
5261
-}
5262
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5263
-#define _mm_stream_pd(mem_addr, a) \
5264
-   simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
5265
-#endif
5266
-
5267
-SIMDE_FUNCTION_ATTRIBUTES
5268
-void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
5269
-{
5270
-   simde_assert_aligned(16, mem_addr);
5271
-
5272
-#if defined(SIMDE_X86_SSE2_NATIVE)
5273
-   _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
5274
-#else
5275
-   simde_memcpy(mem_addr, &a, sizeof(a));
5276
-#endif
5277
-}
5278
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5279
-#define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
5280
-#endif
5281
-
5282
-SIMDE_FUNCTION_ATTRIBUTES
5283
-void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
5284
-{
5285
-#if defined(SIMDE_X86_SSE2_NATIVE)
5286
-   _mm_stream_si32(mem_addr, a);
5287
-#else
5288
-   *mem_addr = a;
5289
-#endif
5290
-}
5291
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5292
-#define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
5293
-#endif
5294
-
5295
-SIMDE_FUNCTION_ATTRIBUTES
5296
-void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
5297
-{
5298
-   *mem_addr = a;
5299
-}
5300
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5301
-#define _mm_stream_si64(mem_addr, a)                                  \
5302
-   simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(          \
5303
-                    int64_t *, __int64 *, mem_addr), \
5304
-                a)
5305
-#endif
5306
-
5307
-SIMDE_FUNCTION_ATTRIBUTES
5308
-simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
5309
-{
5310
-#if defined(SIMDE_X86_SSE2_NATIVE)
5311
-   return _mm_sub_epi8(a, b);
5312
-#else
5313
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5314
-                b_ = simde__m128i_to_private(b);
5315
-
5316
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5317
-   r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
5318
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5319
-   r_.i8 = a_.i8 - b_.i8;
5320
-#else
5321
-   SIMDE_VECTORIZE
5322
-   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
5323
-       r_.i8[i] = a_.i8[i] - b_.i8[i];
5324
-   }
5325
-#endif
5326
-
5327
-   return simde__m128i_from_private(r_);
5328
-#endif
5329
-}
5330
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5331
-#define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
5332
-#endif
5333
-
5334
-SIMDE_FUNCTION_ATTRIBUTES
5335
-simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
5336
-{
5337
-#if defined(SIMDE_X86_SSE2_NATIVE)
5338
-   return _mm_sub_epi16(a, b);
5339
-#else
5340
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5341
-                b_ = simde__m128i_to_private(b);
5342
-
5343
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5344
-   r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
5345
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5346
-   r_.i16 = a_.i16 - b_.i16;
5347
-#else
5348
-   SIMDE_VECTORIZE
5349
-   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5350
-       r_.i16[i] = a_.i16[i] - b_.i16[i];
5351
-   }
5352
-#endif
5353
-
5354
-   return simde__m128i_from_private(r_);
5355
-#endif
5356
-}
5357
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5358
-#define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
5359
-#endif
5360
-
5361
-SIMDE_FUNCTION_ATTRIBUTES
5362
-simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
5363
-{
5364
-#if defined(SIMDE_X86_SSE2_NATIVE)
5365
-   return _mm_sub_epi32(a, b);
5366
-#else
5367
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5368
-                b_ = simde__m128i_to_private(b);
5369
-
5370
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5371
-   r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
5372
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5373
-   r_.i32 = a_.i32 - b_.i32;
5374
-#else
5375
-   SIMDE_VECTORIZE
5376
-   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
5377
-       r_.i32[i] = a_.i32[i] - b_.i32[i];
5378
-   }
5379
-#endif
5380
-
5381
-   return simde__m128i_from_private(r_);
5382
-#endif
5383
-}
5384
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5385
-#define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
5386
-#endif
5387
-
5388
-SIMDE_FUNCTION_ATTRIBUTES
5389
-simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
5390
-{
5391
-#if defined(SIMDE_X86_SSE2_NATIVE)
5392
-   return _mm_sub_epi64(a, b);
5393
-#else
5394
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5395
-                b_ = simde__m128i_to_private(b);
5396
-
5397
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5398
-   r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
5399
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5400
-   r_.i64 = a_.i64 - b_.i64;
5401
-#else
5402
-   SIMDE_VECTORIZE
5403
-   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
5404
-       r_.i64[i] = a_.i64[i] - b_.i64[i];
5405
-   }
5406
-#endif
5407
-
5408
-   return simde__m128i_from_private(r_);
5409
-#endif
5410
-}
5411
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5412
-#define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
5413
-#endif
5414
-
5415
-SIMDE_FUNCTION_ATTRIBUTES
5416
-simde__m128i simde_x_mm_sub_epu32(simde__m128i a, simde__m128i b)
5417
-{
5418
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5419
-                b_ = simde__m128i_to_private(b);
5420
-
5421
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5422
-   r_.u32 = a_.u32 - b_.u32;
5423
-#else
5424
-   SIMDE_VECTORIZE
5425
-   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
5426
-       r_.u32[i] = a_.u32[i] - b_.u32[i];
5427
-   }
5428
-#endif
5429
-
5430
-   return simde__m128i_from_private(r_);
5431
-}
5432
-
5433
-SIMDE_FUNCTION_ATTRIBUTES
5434
-simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
5435
-{
5436
-#if defined(SIMDE_X86_SSE2_NATIVE)
5437
-   return _mm_sub_pd(a, b);
5438
-#else
5439
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
5440
-                b_ = simde__m128d_to_private(b);
5441
-
5442
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5443
-   r_.f64 = a_.f64 - b_.f64;
5444
-#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5445
-   r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
5446
-#else
5447
-   SIMDE_VECTORIZE
5448
-   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
5449
-       r_.f64[i] = a_.f64[i] - b_.f64[i];
5450
-   }
5451
-#endif
5452
-
5453
-   return simde__m128d_from_private(r_);
5454
-#endif
5455
-}
5456
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5457
-#define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
5458
-#endif
5459
-
5460
-SIMDE_FUNCTION_ATTRIBUTES
5461
-simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
5462
-{
5463
-#if defined(SIMDE_X86_SSE2_NATIVE)
5464
-   return _mm_sub_sd(a, b);
5465
-#elif defined(SIMDE_ASSUME_VECTORIZATION)
5466
-   return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
5467
-#else
5468
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
5469
-                b_ = simde__m128d_to_private(b);
5470
-
5471
-   r_.f64[0] = a_.f64[0] - b_.f64[0];
5472
-   r_.f64[1] = a_.f64[1];
5473
-
5474
-   return simde__m128d_from_private(r_);
5475
-#endif
5476
-}
5477
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5478
-#define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
5479
-#endif
5480
-
5481
-SIMDE_FUNCTION_ATTRIBUTES
5482
-simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
5483
-{
5484
-#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5485
-   return _mm_sub_si64(a, b);
5486
-#else
5487
-   simde__m64_private r_, a_ = simde__m64_to_private(a),
5488
-                  b_ = simde__m64_to_private(b);
5489
-
5490
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
5491
-   r_.i64 = a_.i64 - b_.i64;
5492
-#else
5493
-   r_.i64[0] = a_.i64[0] - b_.i64[0];
5494
-#endif
5495
-
5496
-   return simde__m64_from_private(r_);
5497
-#endif
5498
-}
5499
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5500
-#define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
5501
-#endif
5502
-
5503
-SIMDE_FUNCTION_ATTRIBUTES
5504
-simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
5505
-{
5506
-#if defined(SIMDE_X86_SSE2_NATIVE)
5507
-   return _mm_subs_epi8(a, b);
5508
-#else
5509
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5510
-                b_ = simde__m128i_to_private(b);
5511
-
5512
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5513
-   r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
5514
-#else
5515
-   SIMDE_VECTORIZE
5516
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) {
5517
-       if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
5518
-           r_.i8[i] = INT8_MIN;
5519
-       } else if ((b_.i8[i]) < 0 &&
5520
-              (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
5521
-           r_.i8[i] = INT8_MAX;
5522
-       } else {
5523
-           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
5524
-       }
5525
-   }
5526
-#endif
5527
-
5528
-   return simde__m128i_from_private(r_);
5529
-#endif
5530
-}
5531
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5532
-#define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
5533
-#endif
5534
-
5535
-SIMDE_FUNCTION_ATTRIBUTES
5536
-simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
5537
-{
5538
-#if defined(SIMDE_X86_SSE2_NATIVE)
5539
-   return _mm_subs_epi16(a, b);
5540
-#else
5541
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5542
-                b_ = simde__m128i_to_private(b);
5543
-
5544
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5545
-   r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
5546
-#else
5547
-   SIMDE_VECTORIZE
5548
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
5549
-       if (((b_.i16[i]) > 0 &&
5550
-            (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
5551
-           r_.i16[i] = INT16_MIN;
5552
-       } else if ((b_.i16[i]) < 0 &&
5553
-              (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
5554
-           r_.i16[i] = INT16_MAX;
5555
-       } else {
5556
-           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
5557
-       }
5558
-   }
5559
-#endif
5560
-
5561
-   return simde__m128i_from_private(r_);
5562
-#endif
5563
-}
5564
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5565
-#define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
5566
-#endif
5567
-
5568
-SIMDE_FUNCTION_ATTRIBUTES
5569
-simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
5570
-{
5571
-#if defined(SIMDE_X86_SSE2_NATIVE)
5572
-   return _mm_subs_epu8(a, b);
5573
-#else
5574
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5575
-                b_ = simde__m128i_to_private(b);
5576
-
5577
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5578
-   r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
5579
-#else
5580
-   SIMDE_VECTORIZE
5581
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) {
5582
-       const int32_t x = a_.u8[i] - b_.u8[i];
5583
-       if (x < 0) {
5584
-           r_.u8[i] = 0;
5585
-       } else if (x > UINT8_MAX) {
5586
-           r_.u8[i] = UINT8_MAX;
5587
-       } else {
5588
-           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
5589
-       }
5590
-   }
5591
-#endif
5592
-
5593
-   return simde__m128i_from_private(r_);
5594
-#endif
5595
-}
5596
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5597
-#define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
5598
-#endif
5599
-
5600
-SIMDE_FUNCTION_ATTRIBUTES
5601
-simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
5602
-{
5603
-#if defined(SIMDE_X86_SSE2_NATIVE)
5604
-   return _mm_subs_epu16(a, b);
5605
-#else
5606
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5607
-                b_ = simde__m128i_to_private(b);
5608
-
5609
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5610
-   r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
5611
-#else
5612
-   SIMDE_VECTORIZE
5613
-   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
5614
-       const int32_t x = a_.u16[i] - b_.u16[i];
5615
-       if (x < 0) {
5616
-           r_.u16[i] = 0;
5617
-       } else if (x > UINT16_MAX) {
5618
-           r_.u16[i] = UINT16_MAX;
5619
-       } else {
5620
-           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
5621
-       }
5622
-   }
5623
-#endif
5624
-
5625
-   return simde__m128i_from_private(r_);
5626
-#endif
5627
-}
5628
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5629
-#define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
5630
-#endif
5631
-
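The subs_* fallbacks above clamp rather than wrap. A scalar mirror of that behaviour, for illustration only (the helper names are made up):

    #include <stdint.h>

    static uint8_t subs_epu8_lane(uint8_t a, uint8_t b)
    {
        /* unsigned saturating subtract: clamp at 0, e.g. 10 - 20 -> 0 */
        return (a > b) ? (uint8_t)(a - b) : 0;
    }

    static int8_t subs_epi8_lane(int8_t a, int8_t b)
    {
        /* signed saturating subtract: clamp to [INT8_MIN, INT8_MAX],
         * e.g. -100 - 100 -> -128 and 100 - (-100) -> 127 */
        int16_t x = (int16_t)a - (int16_t)b;
        if (x < INT8_MIN)
            return INT8_MIN;
        if (x > INT8_MAX)
            return INT8_MAX;
        return (int8_t)x;
    }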
5632
-SIMDE_FUNCTION_ATTRIBUTES
5633
-int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
5634
-{
5635
-#if defined(SIMDE_X86_SSE2_NATIVE)
5636
-   return _mm_ucomieq_sd(a, b);
5637
-#else
5638
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5639
-                b_ = simde__m128d_to_private(b);
5640
-   int r;
5641
-
5642
-#if defined(SIMDE_HAVE_FENV_H)
5643
-   fenv_t envp;
5644
-   int x = feholdexcept(&envp);
5645
-   r = a_.f64[0] == b_.f64[0];
5646
-   if (HEDLEY_LIKELY(x == 0))
5647
-       fesetenv(&envp);
5648
-#else
5649
-   r = a_.f64[0] == b_.f64[0];
5650
-#endif
5651
-
5652
-   return r;
5653
-#endif
5654
-}
5655
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5656
-#define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
5657
-#endif
5658
-
5659
-SIMDE_FUNCTION_ATTRIBUTES
5660
-int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
5661
-{
5662
-#if defined(SIMDE_X86_SSE2_NATIVE)
5663
-   return _mm_ucomige_sd(a, b);
5664
-#else
5665
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5666
-                b_ = simde__m128d_to_private(b);
5667
-   int r;
5668
-
5669
-#if defined(SIMDE_HAVE_FENV_H)
5670
-   fenv_t envp;
5671
-   int x = feholdexcept(&envp);
5672
-   r = a_.f64[0] >= b_.f64[0];
5673
-   if (HEDLEY_LIKELY(x == 0))
5674
-       fesetenv(&envp);
5675
-#else
5676
-   r = a_.f64[0] >= b_.f64[0];
5677
-#endif
5678
-
5679
-   return r;
5680
-#endif
5681
-}
5682
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5683
-#define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
5684
-#endif
5685
-
5686
-SIMDE_FUNCTION_ATTRIBUTES
5687
-int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
5688
-{
5689
-#if defined(SIMDE_X86_SSE2_NATIVE)
5690
-   return _mm_ucomigt_sd(a, b);
5691
-#else
5692
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5693
-                b_ = simde__m128d_to_private(b);
5694
-   int r;
5695
-
5696
-#if defined(SIMDE_HAVE_FENV_H)
5697
-   fenv_t envp;
5698
-   int x = feholdexcept(&envp);
5699
-   r = a_.f64[0] > b_.f64[0];
5700
-   if (HEDLEY_LIKELY(x == 0))
5701
-       fesetenv(&envp);
5702
-#else
5703
-   r = a_.f64[0] > b_.f64[0];
5704
-#endif
5705
-
5706
-   return r;
5707
-#endif
5708
-}
5709
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5710
-#define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
5711
-#endif
5712
-
5713
-SIMDE_FUNCTION_ATTRIBUTES
5714
-int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
5715
-{
5716
-#if defined(SIMDE_X86_SSE2_NATIVE)
5717
-   return _mm_ucomile_sd(a, b);
5718
-#else
5719
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5720
-                b_ = simde__m128d_to_private(b);
5721
-   int r;
5722
-
5723
-#if defined(SIMDE_HAVE_FENV_H)
5724
-   fenv_t envp;
5725
-   int x = feholdexcept(&envp);
5726
-   r = a_.f64[0] <= b_.f64[0];
5727
-   if (HEDLEY_LIKELY(x == 0))
5728
-       fesetenv(&envp);
5729
-#else
5730
-   r = a_.f64[0] <= b_.f64[0];
5731
-#endif
5732
-
5733
-   return r;
5734
-#endif
5735
-}
5736
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5737
-#define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
5738
-#endif
5739
-
5740
-SIMDE_FUNCTION_ATTRIBUTES
5741
-int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
5742
-{
5743
-#if defined(SIMDE_X86_SSE2_NATIVE)
5744
-   return _mm_ucomilt_sd(a, b);
5745
-#else
5746
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5747
-                b_ = simde__m128d_to_private(b);
5748
-   int r;
5749
-
5750
-#if defined(SIMDE_HAVE_FENV_H)
5751
-   fenv_t envp;
5752
-   int x = feholdexcept(&envp);
5753
-   r = a_.f64[0] < b_.f64[0];
5754
-   if (HEDLEY_LIKELY(x == 0))
5755
-       fesetenv(&envp);
5756
-#else
5757
-   r = a_.f64[0] < b_.f64[0];
5758
-#endif
5759
-
5760
-   return r;
5761
-#endif
5762
-}
5763
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5764
-#define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
5765
-#endif
5766
-
5767
-SIMDE_FUNCTION_ATTRIBUTES
5768
-int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
5769
-{
5770
-#if defined(SIMDE_X86_SSE2_NATIVE)
5771
-   return _mm_ucomineq_sd(a, b);
5772
-#else
5773
-   simde__m128d_private a_ = simde__m128d_to_private(a),
5774
-                b_ = simde__m128d_to_private(b);
5775
-   int r;
5776
-
5777
-#if defined(SIMDE_HAVE_FENV_H)
5778
-   fenv_t envp;
5779
-   int x = feholdexcept(&envp);
5780
-   r = a_.f64[0] != b_.f64[0];
5781
-   if (HEDLEY_LIKELY(x == 0))
5782
-       fesetenv(&envp);
5783
-#else
5784
-   r = a_.f64[0] != b_.f64[0];
5785
-#endif
5786
-
5787
-   return r;
5788
-#endif
5789
-}
5790
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5791
-#define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
5792
-#endif
5793
-
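The feholdexcept()/fesetenv() pairs in the ucomi* fallbacks above save the floating-point environment, do a plain C comparison, and then restore the saved environment, so any exception flags the compare may set are discarded; this is intended to mirror the quiet behaviour of the hardware ucomi instructions. A standalone sketch of the same pattern (the function name is made up):

    #include <fenv.h>

    static int quiet_equal(double x, double y)
    {
        fenv_t env;
        int held = feholdexcept(&env); /* returns 0 on success */
        int r = (x == y);
        if (held == 0)
            fesetenv(&env); /* restore, discarding flags raised by the compare */
        return r;
    }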
5794
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5795
-HEDLEY_DIAGNOSTIC_PUSH
5796
-SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5797
-#endif
5798
-
5799
-#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5800
-HEDLEY_DIAGNOSTIC_POP
5801
-#endif
5802
-
5803
-SIMDE_FUNCTION_ATTRIBUTES
5804
-void simde_mm_lfence(void)
5805
-{
5806
-#if defined(SIMDE_X86_SSE2_NATIVE)
5807
-   _mm_lfence();
5808
-#else
5809
-   simde_mm_sfence();
5810
-#endif
5811
-}
5812
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5813
-#define _mm_lfence() simde_mm_lfence()
5814
-#endif
5815
-
5816
-SIMDE_FUNCTION_ATTRIBUTES
5817
-void simde_mm_mfence(void)
5818
-{
5819
-#if defined(SIMDE_X86_SSE2_NATIVE)
5820
-   _mm_mfence();
5821
-#else
5822
-   simde_mm_sfence();
5823
-#endif
5824
-}
5825
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5826
-#define _mm_mfence() simde_mm_mfence()
5827
-#endif
5828
-
5829
-SIMDE_FUNCTION_ATTRIBUTES
5830
-simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
5831
-{
5832
-#if defined(SIMDE_X86_SSE2_NATIVE)
5833
-   return _mm_unpackhi_epi8(a, b);
5834
-#else
5835
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5836
-                b_ = simde__m128i_to_private(b);
5837
-
5838
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5839
-   int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
5840
-   int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
5841
-   int8x8x2_t result = vzip_s8(a1, b1);
5842
-   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
5843
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
5844
-   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26,
5845
-                     11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
5846
-#else
5847
-   SIMDE_VECTORIZE
5848
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
5849
-       r_.i8[(i * 2)] =
5850
-           a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
5851
-       r_.i8[(i * 2) + 1] =
5852
-           b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
5853
-   }
5854
-#endif
5855
-
5856
-   return simde__m128i_from_private(r_);
5857
-#endif
5858
-}
5859
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5860
-#define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
5861
-#endif
5862
-
5863
-SIMDE_FUNCTION_ATTRIBUTES
5864
-simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
5865
-{
5866
-#if defined(SIMDE_X86_SSE2_NATIVE)
5867
-   return _mm_unpackhi_epi16(a, b);
5868
-#else
5869
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5870
-                b_ = simde__m128i_to_private(b);
5871
-
5872
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5873
-   int16x4_t a1 = vget_high_s16(a_.neon_i16);
5874
-   int16x4_t b1 = vget_high_s16(b_.neon_i16);
5875
-   int16x4x2_t result = vzip_s16(a1, b1);
5876
-   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
5877
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
5878
-   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6,
5879
-                      14, 7, 15);
5880
-#else
5881
-   SIMDE_VECTORIZE
5882
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
5883
-       r_.i16[(i * 2)] =
5884
-           a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
5885
-       r_.i16[(i * 2) + 1] =
5886
-           b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
5887
-   }
5888
-#endif
5889
-
5890
-   return simde__m128i_from_private(r_);
5891
-#endif
5892
-}
5893
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5894
-#define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
5895
-#endif
5896
-
5897
-SIMDE_FUNCTION_ATTRIBUTES
5898
-simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
5899
-{
5900
-#if defined(SIMDE_X86_SSE2_NATIVE)
5901
-   return _mm_unpackhi_epi32(a, b);
5902
-#else
5903
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5904
-                b_ = simde__m128i_to_private(b);
5905
-
5906
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5907
-   int32x2_t a1 = vget_high_s32(a_.neon_i32);
5908
-   int32x2_t b1 = vget_high_s32(b_.neon_i32);
5909
-   int32x2x2_t result = vzip_s32(a1, b1);
5910
-   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
5911
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
5912
-   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
5913
-#else
5914
-   SIMDE_VECTORIZE
5915
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
5916
-       r_.i32[(i * 2)] =
5917
-           a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
5918
-       r_.i32[(i * 2) + 1] =
5919
-           b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
5920
-   }
5921
-#endif
5922
-
5923
-   return simde__m128i_from_private(r_);
5924
-#endif
5925
-}
5926
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5927
-#define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
5928
-#endif
5929
-
5930
-SIMDE_FUNCTION_ATTRIBUTES
5931
-simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
5932
-{
5933
-#if defined(SIMDE_X86_SSE2_NATIVE)
5934
-   return _mm_unpackhi_epi64(a, b);
5935
-#else
5936
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5937
-                b_ = simde__m128i_to_private(b);
5938
-
5939
-#if defined(SIMDE_SHUFFLE_VECTOR_)
5940
-   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
5941
-#else
5942
-   SIMDE_VECTORIZE
5943
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
5944
-       r_.i64[(i * 2)] =
5945
-           a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
5946
-       r_.i64[(i * 2) + 1] =
5947
-           b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
5948
-   }
5949
-#endif
5950
-
5951
-   return simde__m128i_from_private(r_);
5952
-#endif
5953
-}
5954
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5955
-#define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
5956
-#endif
5957
-
5958
-SIMDE_FUNCTION_ATTRIBUTES
5959
-simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
5960
-{
5961
-#if defined(SIMDE_X86_SSE2_NATIVE)
5962
-   return _mm_unpackhi_pd(a, b);
5963
-#else
5964
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
5965
-                b_ = simde__m128d_to_private(b);
5966
-
5967
-#if defined(SIMDE_SHUFFLE_VECTOR_)
5968
-   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
5969
-#else
5970
-   SIMDE_VECTORIZE
5971
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
5972
-       r_.f64[(i * 2)] =
5973
-           a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
5974
-       r_.f64[(i * 2) + 1] =
5975
-           b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
5976
-   }
5977
-#endif
5978
-
5979
-   return simde__m128d_from_private(r_);
5980
-#endif
5981
-}
5982
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5983
-#define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
5984
-#endif
5985
-
5986
-SIMDE_FUNCTION_ATTRIBUTES
5987
-simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
5988
-{
5989
-#if defined(SIMDE_X86_SSE2_NATIVE)
5990
-   return _mm_unpacklo_epi8(a, b);
5991
-#else
5992
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5993
-                b_ = simde__m128i_to_private(b);
5994
-
5995
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5996
-   int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
5997
-   int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
5998
-   int8x8x2_t result = vzip_s8(a1, b1);
5999
-   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
6000
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
6001
-   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18,
6002
-                     3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
6003
-#else
6004
-   SIMDE_VECTORIZE
6005
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
6006
-       r_.i8[(i * 2)] = a_.i8[i];
6007
-       r_.i8[(i * 2) + 1] = b_.i8[i];
6008
-   }
6009
-#endif
6010
-
6011
-   return simde__m128i_from_private(r_);
6012
-#endif
6013
-}
6014
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6015
-#define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
6016
-#endif
6017
-
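A scalar mirror of the portable fallback in simde_mm_unpacklo_epi8 above, showing the low-half interleave it produces (illustrative only; the function name is made up):

    #include <stdint.h>

    static void unpacklo_epi8_scalar(const uint8_t a[16], const uint8_t b[16],
                                     uint8_t r[16])
    {
        /* r = a0,b0,a1,b1,...,a7,b7 -- only lanes 0..7 of each input are used;
         * the unpackhi variants earlier do the same with lanes 8..15. */
        for (int i = 0; i < 8; i++) {
            r[2 * i] = a[i];
            r[2 * i + 1] = b[i];
        }
    }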
6018
-SIMDE_FUNCTION_ATTRIBUTES
6019
-simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
6020
-{
6021
-#if defined(SIMDE_X86_SSE2_NATIVE)
6022
-   return _mm_unpacklo_epi16(a, b);
6023
-#else
6024
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6025
-                b_ = simde__m128i_to_private(b);
6026
-
6027
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6028
-   int16x4_t a1 = vget_low_s16(a_.neon_i16);
6029
-   int16x4_t b1 = vget_low_s16(b_.neon_i16);
6030
-   int16x4x2_t result = vzip_s16(a1, b1);
6031
-   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
6032
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
6033
-   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2,
6034
-                      10, 3, 11);
6035
-#else
6036
-   SIMDE_VECTORIZE
6037
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
6038
-       r_.i16[(i * 2)] = a_.i16[i];
6039
-       r_.i16[(i * 2) + 1] = b_.i16[i];
6040
-   }
6041
-#endif
6042
-
6043
-   return simde__m128i_from_private(r_);
6044
-#endif
6045
-}
6046
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6047
-#define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
6048
-#endif
6049
-
6050
-SIMDE_FUNCTION_ATTRIBUTES
6051
-simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
6052
-{
6053
-#if defined(SIMDE_X86_SSE2_NATIVE)
6054
-   return _mm_unpacklo_epi32(a, b);
6055
-#else
6056
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6057
-                b_ = simde__m128i_to_private(b);
6058
-
6059
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6060
-   int32x2_t a1 = vget_low_s32(a_.neon_i32);
6061
-   int32x2_t b1 = vget_low_s32(b_.neon_i32);
6062
-   int32x2x2_t result = vzip_s32(a1, b1);
6063
-   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
6064
-#elif defined(SIMDE_SHUFFLE_VECTOR_)
6065
-   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
6066
-#else
6067
-   SIMDE_VECTORIZE
6068
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
6069
-       r_.i32[(i * 2)] = a_.i32[i];
6070
-       r_.i32[(i * 2) + 1] = b_.i32[i];
6071
-   }
6072
-#endif
6073
-
6074
-   return simde__m128i_from_private(r_);
6075
-#endif
6076
-}
6077
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6078
-#define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
6079
-#endif
6080
-
6081
-SIMDE_FUNCTION_ATTRIBUTES
6082
-simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
6083
-{
6084
-#if defined(SIMDE_X86_SSE2_NATIVE)
6085
-   return _mm_unpacklo_epi64(a, b);
6086
-#else
6087
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6088
-                b_ = simde__m128i_to_private(b);
6089
-
6090
-#if defined(SIMDE_SHUFFLE_VECTOR_)
6091
-   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
6092
-#else
6093
-   SIMDE_VECTORIZE
6094
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
6095
-       r_.i64[(i * 2)] = a_.i64[i];
6096
-       r_.i64[(i * 2) + 1] = b_.i64[i];
6097
-   }
6098
-#endif
6099
-
6100
-   return simde__m128i_from_private(r_);
6101
-#endif
6102
-}
6103
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6104
-#define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
6105
-#endif
6106
-
6107
-SIMDE_FUNCTION_ATTRIBUTES
6108
-simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
6109
-{
6110
-#if defined(SIMDE_X86_SSE2_NATIVE)
6111
-   return _mm_unpacklo_pd(a, b);
6112
-#else
6113
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
6114
-                b_ = simde__m128d_to_private(b);
6115
-
6116
-#if defined(SIMDE_SHUFFLE_VECTOR_)
6117
-   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
6118
-#else
6119
-   SIMDE_VECTORIZE
6120
-   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
6121
-       r_.f64[(i * 2)] = a_.f64[i];
6122
-       r_.f64[(i * 2) + 1] = b_.f64[i];
6123
-   }
6124
-#endif
6125
-
6126
-   return simde__m128d_from_private(r_);
6127
-#endif
6128
-}
6129
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6130
-#define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
6131
-#endif
6132
-
6133
-SIMDE_FUNCTION_ATTRIBUTES
6134
-simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
6135
-{
6136
-#if defined(SIMDE_X86_SSE2_NATIVE)
6137
-   return _mm_xor_pd(a, b);
6138
-#else
6139
-   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
6140
-                b_ = simde__m128d_to_private(b);
6141
-
6142
-#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6143
-   r_.i32f = a_.i32f ^ b_.i32f;
6144
-#else
6145
-   SIMDE_VECTORIZE
6146
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
6147
-       r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6148
-   }
6149
-#endif
6150
-
6151
-   return simde__m128d_from_private(r_);
6152
-#endif
6153
-}
6154
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6155
-#define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
6156
-#endif
6157
-
6158
-SIMDE_FUNCTION_ATTRIBUTES
6159
-simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
6160
-{
6161
-#if defined(SIMDE_X86_SSE2_NATIVE)
6162
-   return _mm_xor_si128(a, b);
6163
-#else
6164
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6165
-                b_ = simde__m128i_to_private(b);
6166
-
6167
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6168
-   r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
6169
-#elif defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
6170
-   r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
6171
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6172
-   r_.i32f = a_.i32f ^ b_.i32f;
6173
-#else
6174
-   SIMDE_VECTORIZE
6175
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
6176
-       r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
6177
-   }
6178
-#endif
6179
-
6180
-   return simde__m128i_from_private(r_);
6181
-#endif
6182
-}
6183
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6184
-#define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
6185
-#endif
6186
-
6187
-SIMDE_FUNCTION_ATTRIBUTES
6188
-simde__m128i simde_x_mm_not_si128(simde__m128i a)
6189
-{
6190
-   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
6191
-
6192
-#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6193
-   r_.neon_i32 = vmvnq_s32(a_.neon_i32);
6194
-#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6195
-   r_.i32f = ~(a_.i32f);
6196
-#else
6197
-   SIMDE_VECTORIZE
6198
-   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
6199
-       r_.i32f[i] = ~(a_.i32f[i]);
6200
-   }
6201
-#endif
6202
-
6203
-   return simde__m128i_from_private(r_);
6204
-}
6205
-
6206
-#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
6207
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6208
-#define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
6209
-#endif
6210
-
6211
-SIMDE_END_DECLS_
6212
-
6213
-HEDLEY_DIAGNOSTIC_POP
6214
-
6215
-#endif /* !defined(SIMDE_X86_SSE2_H) */
6216
obs-studio-26.1.0.tar.xz/libobs/util/sse2neon.h Deleted
4209
 
1
@@ -1,4207 +0,0 @@
2
-#ifndef SSE2NEON_H
3
-#define SSE2NEON_H
4
-
5
-// This header file provides a simple API translation layer
6
-// between SSE intrinsics and their corresponding Arm/Aarch64 NEON versions
7
-//
8
-// This header file does not yet translate all of the SSE intrinsics.
9
-//
10
-// Contributors to this work are:
11
-//   John W. Ratcliff <jratcliffscarab@gmail.com>
12
-//   Brandon Rowlett <browlett@nvidia.com>
13
-//   Ken Fast <kfast@gdeb.com>
14
-//   Eric van Beurden <evanbeurden@nvidia.com>
15
-//   Alexander Potylitsin <apotylitsin@nvidia.com>
16
-//   Hasindu Gamaarachchi <hasindu2008@gmail.com>
17
-//   Jim Huang <jserv@biilabs.io>
18
-//   Mark Cheng <marktwtn@biilabs.io>
19
-//   Malcolm James MacLeod <malcolm@gulden.com>
20
-//   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
21
-//   Sebastian Pop <spop@amazon.com>
22
-//   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
23
-//   Danila Kutenin <danilak@google.com>
24
-
25
-/*
26
- * sse2neon is freely redistributable under the MIT License.
27
- *
28
- * Permission is hereby granted, free of charge, to any person obtaining a copy
29
- * of this software and associated documentation files (the "Software"), to deal
30
- * in the Software without restriction, including without limitation the rights
31
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
32
- * copies of the Software, and to permit persons to whom the Software is
33
- * furnished to do so, subject to the following conditions:
34
- *
35
- * The above copyright notice and this permission notice shall be included in
36
- * all copies or substantial portions of the Software.
37
- *
38
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
44
- * SOFTWARE.
45
- */
46
-
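A minimal sketch of how this translation header is meant to be consumed: code written against SSE2 intrinsics keeps compiling on ARM by switching the include. The helper name subtract_rows() is made up, and _mm_loadu_si128, _mm_sub_epi16 and _mm_storeu_si128 are assumed to be among the intrinsics the header implements.

    #include <stddef.h>
    #include <stdint.h>

    #if defined(__arm__) || defined(__aarch64__)
    #include "sse2neon.h"   /* _mm_* calls are mapped onto NEON */
    #else
    #include <emmintrin.h>  /* native SSE2 */
    #endif

    /* n is assumed to be a multiple of 8 (eight 16-bit lanes per step) */
    static void subtract_rows(const int16_t *a, const int16_t *b,
                              int16_t *out, size_t n)
    {
        for (size_t i = 0; i < n; i += 8) {
            __m128i va = _mm_loadu_si128((const __m128i *)(a + i));
            __m128i vb = _mm_loadu_si128((const __m128i *)(b + i));
            _mm_storeu_si128((__m128i *)(out + i), _mm_sub_epi16(va, vb));
        }
    }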
47
-#if defined(__GNUC__) || defined(__clang__)
48
-#pragma push_macro("FORCE_INLINE")
49
-#pragma push_macro("ALIGN_STRUCT")
50
-#define FORCE_INLINE static inline __attribute__((always_inline))
51
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
52
-#else
53
-#error "Macro name collisions may happen with unsupported compiler."
54
-#ifdef FORCE_INLINE
55
-#undef FORCE_INLINE
56
-#endif
57
-#define FORCE_INLINE static inline
58
-#ifndef ALIGN_STRUCT
59
-#define ALIGN_STRUCT(x) __declspec(align(x))
60
-#endif
61
-#endif
62
-
63
-#include <stdint.h>
64
-#include <stdlib.h>
65
-
66
-#include <arm_neon.h>
67
-
68
-/* "__has_builtin" can be used to query support for built-in functions
69
- * provided by gcc/clang and other compilers that support it.
70
- */
71
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
72
-/* Compatibility with gcc <= 9 */
73
-#if __GNUC__ <= 9
74
-#define __has_builtin(x) HAS##x
75
-#define HAS__builtin_popcount 1
76
-#define HAS__builtin_popcountll 1
77
-#else
78
-#define __has_builtin(x) 0
79
-#endif
80
-#endif
81
-
82
-/**
83
- * MACRO for shuffle parameter for _mm_shuffle_ps().
84
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
85
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
86
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
87
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
88
- * fp0 is the same for fp0 of result.
89
- */
90
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
91
-   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
92
-
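A quick sanity check of the immediate encoding described above; the macro is copied under a made-up name so the snippet stands alone:

    #include <assert.h>

    #define DEMO_MM_SHUFFLE(fp3, fp2, fp1, fp0) \
        (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))

    int main(void)
    {
        assert(DEMO_MM_SHUFFLE(3, 2, 1, 0) == 0xE4); /* identity order */
        assert(DEMO_MM_SHUFFLE(0, 1, 2, 3) == 0x1B); /* reversed order */
        return 0;
    }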
93
-/* indicate immediate constant argument in a given range */
94
-#define __constrange(a, b) const
95
-
96
-/* A few intrinsics accept traditional data types like ints or floats, but
97
- * most operate on data types that are specific to SSE.
98
- * If a vector type ends in d, it contains doubles, and if it does not have
99
- * a suffix, it contains floats. An integer vector type can contain any type
100
- * of integer, from chars to shorts to unsigned long longs.
101
- */
102
-typedef float32x2_t __m64;
103
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
104
-// On 32-bit ARM, float64x2_t is not supported, so the __m128d data type
105
-// has to be represented differently (as float32x4_t) for the related
106
-// intrinsic conversions.
107
-#if defined(__aarch64__)
108
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
109
-#else
110
-typedef float32x4_t __m128d;
111
-#endif
112
-typedef int64x1_t __m64i;
113
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
114
-
115
-/* type-safe casting between types */
116
-
117
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
118
-#define vreinterpretq_m128_f32(x) (x)
119
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
120
-
121
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
122
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
123
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
124
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
125
-
126
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
127
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
128
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
129
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
130
-
131
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
132
-#define vreinterpretq_f32_m128(x) (x)
133
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
134
-
135
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
136
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
137
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
138
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
139
-
140
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
141
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
142
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
143
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
144
-
145
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
146
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
147
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
148
-#define vreinterpretq_m128i_s64(x) (x)
149
-
150
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
151
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
152
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
153
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
154
-
155
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
156
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
157
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
158
-#define vreinterpretq_s64_m128i(x) (x)
159
-
160
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
161
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
162
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
163
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
164
-
165
-#define vreinterpret_m64i_s8(x) vreinterpret_s64_s8(x)
166
-#define vreinterpret_m64i_s16(x) vreinterpret_s64_s16(x)
167
-#define vreinterpret_m64i_s32(x) vreinterpret_s64_s32(x)
168
-#define vreinterpret_m64i_s64(x) (x)
169
-
170
-#define vreinterpret_m64i_u8(x) vreinterpret_s64_u8(x)
171
-#define vreinterpret_m64i_u16(x) vreinterpret_s64_u16(x)
172
-#define vreinterpret_m64i_u32(x) vreinterpret_s64_u32(x)
173
-#define vreinterpret_m64i_u64(x) vreinterpret_s64_u64(x)
174
-
175
-#define vreinterpret_u8_m64i(x) vreinterpret_u8_s64(x)
176
-#define vreinterpret_u16_m64i(x) vreinterpret_u16_s64(x)
177
-#define vreinterpret_u32_m64i(x) vreinterpret_u32_s64(x)
178
-#define vreinterpret_u64_m64i(x) vreinterpret_u64_s64(x)
179
-
180
-#define vreinterpret_s8_m64i(x) vreinterpret_s8_s64(x)
181
-#define vreinterpret_s16_m64i(x) vreinterpret_s16_s64(x)
182
-#define vreinterpret_s32_m64i(x) vreinterpret_s32_s64(x)
183
-#define vreinterpret_s64_m64i(x) (x)
184
-
185
-// A struct is defined in this header file called 'SIMDVec' which can be used
186
-// by applications which attempt to access the contents of an __m128 struct
187
-// directly.  It is important to note that accessing the __m128 struct directly
188
-// is considered bad coding practice by Microsoft: @see:
189
-// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
190
-//
191
-// However, some legacy source code may try to access the contents of an __m128
192
-// struct directly so the developer can use the SIMDVec as an alias for it.  Any
193
-// casting must be done manually by the developer, as you cannot cast or
194
-// otherwise alias the base NEON data type for intrinsic operations.
195
-//
196
-// union intended to allow direct access to an __m128 variable using the names
197
-// that the MSVC compiler provides.  This union should really only be used when
198
-// trying to access the members of the vector as integer values.  GCC/clang
199
-// allow native access to the float members through a simple array access
200
-// operator (in C since 4.6, in C++ since 4.8).
201
-//
202
-// Ideally, direct accesses to SIMD vectors should be avoided since they can cause
203
-// a performance hit.  If it really is needed however, the original __m128
204
-// variable can be aliased with a pointer to this union and used to access
205
-// individual components.  The use of this union should be hidden behind a macro
206
-// that is used throughout the codebase to access the members instead of always
207
-// declaring this type of variable.
208
-typedef union ALIGN_STRUCT(16) SIMDVec {
209
-   float m128_f32[4];    // as floats - DON'T USE. Added for convenience.
210
-   int8_t m128_i8[16];   // as signed 8-bit integers.
211
-   int16_t m128_i16[8];  // as signed 16-bit integers.
212
-   int32_t m128_i32[4];  // as signed 32-bit integers.
213
-   int64_t m128_i64[2];  // as signed 64-bit integers.
214
-   uint8_t m128_u8[16];  // as unsigned 8-bit integers.
215
-   uint16_t m128_u16[8]; // as unsigned 16-bit integers.
216
-   uint32_t m128_u32[4]; // as unsigned 32-bit integers.
217
-   uint64_t m128_u64[2]; // as unsigned 64-bit integers.
218
-} SIMDVec;
219
-
220
-// casting using SIMDVec
221
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *)&x)->m128_u64[n])
222
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *)&x)->m128_u32[n])
223
-
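A small illustration of the two casting macros above: pulling one 32-bit lane out of an __m128i through the SIMDVec union. It assumes this header is included and uses _mm_set_epi32, which is defined further down in the file.

    #include <stdint.h>
    #include <stdio.h>

    static void print_low_lane(void)
    {
        __m128i v = _mm_set_epi32(4, 3, 2, 1); /* lanes low..high: 1, 2, 3, 4 */
        uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);
        printf("lane 0 = %u\n", (unsigned)lane0); /* prints "lane 0 = 1" */
    }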
224
-/* Backwards compatibility for compilers with lack of specific type support */
225
-
226
-// Older gcc does not define vld1q_u8_x4 type
227
-#if defined(__GNUC__) && !defined(__clang__)
228
-#if __GNUC__ <= 9
229
-FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
230
-{
231
-   uint8x16x4_t ret;
232
-   ret.val[0] = vld1q_u8(p + 0);
233
-   ret.val[1] = vld1q_u8(p + 16);
234
-   ret.val[2] = vld1q_u8(p + 32);
235
-   ret.val[3] = vld1q_u8(p + 48);
236
-   return ret;
237
-}
238
-#endif
239
-#endif
240
-
241
-/* Function Naming Conventions
242
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
243
- * intrinsic function is given as follows:
244
- *   _mm_<name>_<data_type>
245
- *
246
- * The parts of this format are given as follows:
247
- * 1. <name> describes the operation performed by the intrinsic
248
- * 2. <data_type> identifies the data type of the function's primary arguments
249
- *
250
- * This last part, <data_type>, is a little complicated. It identifies the
251
- * content of the input values, and can be set to any of the following values:
252
- * + ps - vectors contain floats (ps stands for packed single-precision)
253
- * + pd - vectors contain doubles (pd stands for packed double-precision)
254
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
255
- *                            signed integers
256
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
257
- *                            unsigned integers
258
- * + si128 - unspecified 128-bit vector or 256-bit vector
259
- * + m128/m128i/m128d - identifies input vector types when they are different
260
- *                      than the type of the returned vector
261
- *
262
- * For example, _mm_setzero_ps. The _mm implies that the function returns
263
- * a 128-bit vector. The _ps at the end implies that the argument vectors
264
- * contain floats.
265
- *
266
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
267
- *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
268
- *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
269
- *   // Set packed 8-bit integers
270
- *   // 128 bits, 16 chars, per 8 bits
271
- *   __m128i v_perm = _mm_setr_epi8(1, 0,  2,  3, 8, 9, 10, 11,
272
- *                                  4, 5, 12, 13, 6, 7, 14, 15);
273
- *   // Shuffle packed 8-bit integers
274
- *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
275
- *
276
- * Data (Number, Binary, Byte Index):
277
-    +------+------+-------------+------+------+-------------+
278
-    |      1      |      2      |      3      |      4      | Number
279
-    +------+------+------+------+------+------+------+------+
280
-    | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
281
-    +------+------+------+------+------+------+------+------+
282
-    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 | Index
283
-    +------+------+------+------+------+------+------+------+
284
-
285
-    +------+------+------+------+------+------+------+------+
286
-    |      5      |      6      |      7      |      8      | Number
287
-    +------+------+------+------+------+------+------+------+
288
-    | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
289
-    +------+------+------+------+------+------+------+------+
290
-    |    8 |    9 |   10 |   11 |   12 |   13 |   14 |   15 | Index
291
-    +------+------+------+------+------+------+------+------+
292
- * Index (Byte Index):
293
-    +------+------+------+------+------+------+------+------+
294
-    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 |
295
-    +------+------+------+------+------+------+------+------+
296
-
297
-    +------+------+------+------+------+------+------+------+
298
-    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 |
299
-    +------+------+------+------+------+------+------+------+
300
- * Result:
301
-    +------+------+------+------+------+------+------+------+
302
-    |    1 |    0 |    2 |    3 |    8 |    9 |   10 |   11 | Index
303
-    +------+------+------+------+------+------+------+------+
304
-    | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
305
-    +------+------+------+------+------+------+------+------+
306
-    |     256     |      2      |      5      |      6      | Number
307
-    +------+------+------+------+------+------+------+------+
308
-
309
-    +------+------+------+------+------+------+------+------+
310
-    |    4 |    5 |   12 |   13 |    6 |    7 |   14 |   15 | Index
311
-    +------+------+------+------+------+------+------+------+
312
-    | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
313
-    +------+------+------+------+------+------+------+------+
314
-    |      3      |      7      |      4      |      8      | Number
315
-    +------+------+------+------+------+------+-------------+
316
- */
317
-
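A buildable version of the byte-shuffle walkthrough above: _mm_setr_epi16, _mm_setr_epi8 and _mm_storeu_si128 appear later in this listing, _mm_shuffle_epi8 is assumed to be provided elsewhere in the header, and the expected output follows the result table above.

    #include <stdint.h>
    #include <stdio.h>

    static void shuffle_demo(void)
    {
        __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
                                       4, 5, 12, 13, 6, 7, 14, 15);
        __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); /* pshufb */

        int16_t out[8];
        _mm_storeu_si128((__m128i *)out, v_out);
        for (int i = 0; i < 8; i++)
            printf("%d ", out[i]); /* 256 2 5 6 3 7 4 8, per the table above */
        printf("\n");
    }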
318
-/* Set/get methods */
319
-
320
-/* Constants for use with _mm_prefetch.  */
321
-enum _mm_hint {
322
-   _MM_HINT_NTA = 0,  /* load data to L1 and L2 cache, mark it as NTA */
323
-   _MM_HINT_T0 = 1,   /* load data to L1 and L2 cache */
324
-   _MM_HINT_T1 = 2,   /* load data to L2 cache only */
325
-   _MM_HINT_T2 = 3,   /* load data to L2 cache only, mark it as NTA */
326
-   _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
327
-   _MM_HINT_ET0 = 5,  /* exclusive version of _MM_HINT_T0 */
328
-   _MM_HINT_ET1 = 6,  /* exclusive version of _MM_HINT_T1 */
329
-   _MM_HINT_ET2 = 7   /* exclusive version of _MM_HINT_T2 */
330
-};
331
-
332
-// Loads one cache line of data from address p to a location closer to the
333
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
334
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
335
-{
336
-   (void)i;
337
-   __builtin_prefetch(p);
338
-}
339
-
340
-// Extracts the lower-order floating-point value from the parameter:
341
-// https://msdn.microsoft.com/en-us/library/bb514059%28v=vs.120%29.aspx?f=255&MSPPError=-2147217396
342
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
343
-{
344
-   return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
345
-}
346
-
347
-// Sets the 128-bit value to zero
348
-// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
349
-FORCE_INLINE __m128i _mm_setzero_si128(void)
350
-{
351
-   return vreinterpretq_m128i_s32(vdupq_n_s32(0));
352
-}
353
-
354
-// Clears the four single-precision, floating-point values.
355
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
356
-FORCE_INLINE __m128 _mm_setzero_ps(void)
357
-{
358
-   return vreinterpretq_m128_f32(vdupq_n_f32(0));
359
-}
360
-
361
-// Sets the four single-precision, floating-point values to w.
362
-//
363
-//   r0 := r1 := r2 := r3 := w
364
-//
365
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
366
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
367
-{
368
-   return vreinterpretq_m128_f32(vdupq_n_f32(_w));
369
-}
370
-
371
-// Sets the four single-precision, floating-point values to w.
372
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
373
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
374
-{
375
-   return vreinterpretq_m128_f32(vdupq_n_f32(_w));
376
-}
377
-
378
-// Sets the four single-precision, floating-point values to the four inputs.
379
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
380
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
381
-{
382
-   float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
383
-   return vreinterpretq_m128_f32(vld1q_f32(data));
384
-}
385
-
386
-// Copy single-precision (32-bit) floating-point element a to the lower element
387
-// of dst, and zero the upper 3 elements.
388
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss&expand=4901,4895,4901
389
-FORCE_INLINE __m128 _mm_set_ss(float a)
390
-{
391
-   float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
392
-   return vreinterpretq_m128_f32(vld1q_f32(data));
393
-}
394
-
395
-// Sets the four single-precision, floating-point values to the four inputs in
396
-// reverse order.
397
-// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
398
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
399
-{
400
-   float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
401
-   return vreinterpretq_m128_f32(vld1q_f32(data));
402
-}
403
-
404
-// Sets the 8 signed 16-bit integer values in reverse order.
405
-//
406
-// Return Value
407
-//   r0 := w0
408
-//   r1 := w1
409
-//   ...
410
-//   r7 := w7
411
-FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3,
412
-                   short w4, short w5, short w6, short w7)
413
-{
414
-   int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
415
-   return vreinterpretq_m128i_s16(vld1q_s16((int16_t *)data));
416
-}
417
-
418
-// Sets the 4 signed 32-bit integer values in reverse order
419
-// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
420
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
421
-{
422
-   int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
423
-   return vreinterpretq_m128i_s32(vld1q_s32(data));
424
-}
425
-
426
-// Sets the 16 signed 8-bit integer values to b.
427
-//
428
-//   r0 := b
429
-//   r1 := b
430
-//   ...
431
-//   r15 := b
432
-//
433
-// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
434
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
435
-{
436
-   return vreinterpretq_m128i_s8(vdupq_n_s8(w));
437
-}
438
-
439
-// Sets the 8 signed 16-bit integer values to w.
440
-//
441
-//   r0 := w
442
-//   r1 := w
443
-//   ...
444
-//   r7 := w
445
-//
446
-// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
447
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
448
-{
449
-   return vreinterpretq_m128i_s16(vdupq_n_s16(w));
450
-}
451
-
452
-// Sets the 16 signed 8-bit integer values.
453
-// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
454
-FORCE_INLINE __m128i
455
-_mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12,
456
-        signed char b11, signed char b10, signed char b9, signed char b8,
457
-        signed char b7, signed char b6, signed char b5, signed char b4,
458
-        signed char b3, signed char b2, signed char b1, signed char b0)
459
-{
460
-   int8_t ALIGN_STRUCT(16)
461
-       data[16] = {(int8_t)b0,  (int8_t)b1,  (int8_t)b2,  (int8_t)b3,
462
-               (int8_t)b4,  (int8_t)b5,  (int8_t)b6,  (int8_t)b7,
463
-               (int8_t)b8,  (int8_t)b9,  (int8_t)b10, (int8_t)b11,
464
-               (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
465
-   return (__m128i)vld1q_s8(data);
466
-}
467
-
468
-// Sets the 8 signed 16-bit integer values.
469
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
470
-FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4,
471
-                  short i3, short i2, short i1, short i0)
472
-{
473
-   int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
474
-   return vreinterpretq_m128i_s16(vld1q_s16(data));
475
-}
476
-
477
-// Sets the 16 signed 8-bit integer values in reverse order.
478
-// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
479
-FORCE_INLINE __m128i _mm_setr_epi8(
480
-   signed char b0, signed char b1, signed char b2, signed char b3,
481
-   signed char b4, signed char b5, signed char b6, signed char b7,
482
-   signed char b8, signed char b9, signed char b10, signed char b11,
483
-   signed char b12, signed char b13, signed char b14, signed char b15)
484
-{
485
-   int8_t ALIGN_STRUCT(16)
486
-       data[16] = {(int8_t)b0,  (int8_t)b1,  (int8_t)b2,  (int8_t)b3,
487
-               (int8_t)b4,  (int8_t)b5,  (int8_t)b6,  (int8_t)b7,
488
-               (int8_t)b8,  (int8_t)b9,  (int8_t)b10, (int8_t)b11,
489
-               (int8_t)b12, (int8_t)b13, (int8_t)b14, (int8_t)b15};
490
-   return (__m128i)vld1q_s8(data);
491
-}
492
-
493
-// Sets the 4 signed 32-bit integer values to i.
494
-//
495
-//   r0 := i
496
-//   r1 := i
497
-//   r2 := i
498
-//   r3 := i
499
-//
500
-// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
501
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
502
-{
503
-   return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
504
-}
505
-
506
-// Sets the 2 signed 64-bit integer values to i.
507
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
508
-FORCE_INLINE __m128i _mm_set1_epi64(int64_t _i)
509
-{
510
-   return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
511
-}
512
-
513
-// Sets the 2 signed 64-bit integer values to i.
514
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x&expand=4961
515
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
516
-{
517
-   return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
518
-}
519
-
520
-// Sets the 4 signed 32-bit integer values.
521
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
522
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
523
-{
524
-   int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
525
-   return vreinterpretq_m128i_s32(vld1q_s32(data));
526
-}
527
-
528
-// Returns the __m128i structure with its two 64-bit integer values
529
-// initialized to the values of the two 64-bit integers passed in.
530
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
531
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
532
-{
533
-   int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
534
-   return vreinterpretq_m128i_s64(vld1q_s64(data));
535
-}
536
-
537
-// Stores four single-precision, floating-point values.
538
-// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
539
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
540
-{
541
-   vst1q_f32(p, vreinterpretq_f32_m128(a));
542
-}
543
-
544
-// Stores four single-precision, floating-point values.
545
-// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
546
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
547
-{
548
-   vst1q_f32(p, vreinterpretq_f32_m128(a));
549
-}
550
-
551
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
552
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
553
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
554
-{
555
-   vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
556
-}
557
-
558
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
559
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
560
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
561
-{
562
-   vst1q_s32((int32_t *)p, vreinterpretq_s32_m128i(a));
563
-}
564
-
565
-// Stores the lower single-precision, floating-point value.
566
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
567
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
568
-{
569
-   vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
570
-}
571
-
572
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
573
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
574
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
575
-{
576
-   uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
577
-   uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
578
-   *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
579
-}
580
-
581
-// Stores the lower two single-precision floating point values of a to the
582
-// address p.
583
-//
584
-//   *p0 := a0
585
-//   *p1 := a1
586
-//
587
-// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
588
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
589
-{
590
-   *p = vget_low_f32(a);
591
-}
592
-
593
-// Stores the upper two single-precision, floating-point values of a to the
594
-// address p.
595
-//
596
-//   *p0 := a2
597
-//   *p1 := a3
598
-//
599
-// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
600
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
601
-{
602
-   *p = vget_high_f32(a);
603
-}
604
-
605
-// Loads a single single-precision, floating-point value, copying it into all
606
-// four words
607
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
608
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
609
-{
610
-   return vreinterpretq_m128_f32(vld1q_dup_f32(p));
611
-}
612
-#define _mm_load_ps1 _mm_load1_ps
613
-
614
-// Sets the lower two single-precision, floating-point values with 64
615
-// bits of data loaded from the address p; the upper two values are passed
616
-// through from a.
617
-//
618
-// Return Value
619
-//   r0 := *p0
620
-//   r1 := *p1
621
-//   r2 := a2
622
-//   r3 := a3
623
-//
624
-// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
625
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
626
-{
627
-   return vreinterpretq_m128_f32(
628
-       vcombine_f32(vld1_f32((const float32_t *)p), vget_high_f32(a)));
629
-}
630
-
631
-// Sets the upper two single-precision, floating-point values with 64
632
-// bits of data loaded from the address p; the lower two values are passed
633
-// through from a.
634
-//
635
-//   r0 := a0
636
-//   r1 := a1
637
-//   r2 := *p0
638
-//   r3 := *p1
639
-//
640
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
641
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
642
-{
643
-   return vreinterpretq_m128_f32(
644
-       vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *)p)));
645
-}
646
-
647
-// Loads four single-precision, floating-point values.
648
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
649
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
650
-{
651
-   return vreinterpretq_m128_f32(vld1q_f32(p));
652
-}
653
-
654
-// Loads four single-precision, floating-point values.
655
-// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
656
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
657
-{
658
-   // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
659
-   // equivalent for neon
660
-   return vreinterpretq_m128_f32(vld1q_f32(p));
661
-}
662
-
663
-// Loads a double-precision, floating-point value.
664
-// The upper double-precision, floating-point value is set to zero. The address p does
665
-// not need to be 16-byte aligned.
666
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/574w9fdd(v%3dvs.100)
667
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
668
-{
669
-#if defined(__aarch64__)
670
-   return vsetq_lane_f64(*p, vdupq_n_f64(0), 0);
671
-#else
672
-   const float *fp = (const float *)p;
673
-   float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
674
-   return vld1q_f32(data);
675
-#endif
676
-}
677
-
678
-// Loads a single-precision, floating-point value into the low word and
679
-// clears the upper three words.
680
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
681
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
682
-{
683
-   return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
684
-}
685
-
686
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
687
-{
688
-   /* Load the lower 64 bits of the value pointed to by p into the
689
-     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
690
-     */
691
-   return vreinterpretq_m128i_s32(
692
-       vcombine_s32(vld1_s32((int32_t const *)p), vcreate_s32(0)));
693
-}
694
-
695
-/* Logic/Binary operations */
696
-
697
-// Compares for inequality.
698
-// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
699
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
700
-{
701
-   return vreinterpretq_m128_u32(vmvnq_u32(vceqq_f32(
702
-       vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
703
-}
704
-
705
-// Computes the bitwise AND-NOT of the four single-precision, floating-point
706
-// values of a and b.
707
-//
708
-//   r0 := ~a0 & b0
709
-//   r1 := ~a1 & b1
710
-//   r2 := ~a2 & b2
711
-//   r3 := ~a3 & b3
712
-//
713
-// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
714
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
715
-{
716
-   return vreinterpretq_m128_s32(
717
-       vbicq_s32(vreinterpretq_s32_m128(b),
718
-             vreinterpretq_s32_m128(a))); // *NOTE* argument swap
719
-}
720
-
721
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
722
-// 128-bit value in a.
723
-//
724
-//   r := (~a) & b
725
-//
726
-// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
727
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
728
-{
729
-   return vreinterpretq_m128i_s32(
730
-       vbicq_s32(vreinterpretq_s32_m128i(b),
731
-             vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
732
-}
733
-
734
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
735
-// b.
736
-//
737
-//   r := a & b
738
-//
739
-// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
740
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
741
-{
742
-   return vreinterpretq_m128i_s32(vandq_s32(vreinterpretq_s32_m128i(a),
743
-                        vreinterpretq_s32_m128i(b)));
744
-}
745
-
746
-// Computes the bitwise AND of the four single-precision, floating-point values
747
-// of a and b.
748
-//
749
-//   r0 := a0 & b0
750
-//   r1 := a1 & b1
751
-//   r2 := a2 & b2
752
-//   r3 := a3 & b3
753
-//
754
-// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
755
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
756
-{
757
-   return vreinterpretq_m128_s32(vandq_s32(vreinterpretq_s32_m128(a),
758
-                       vreinterpretq_s32_m128(b)));
759
-}
760
-
761
-// Computes the bitwise OR of the four single-precision, floating-point values
762
-// of a and b.
763
-// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
764
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
765
-{
766
-   return vreinterpretq_m128_s32(vorrq_s32(vreinterpretq_s32_m128(a),
767
-                       vreinterpretq_s32_m128(b)));
768
-}
769
-
770
-// Computes bitwise EXOR (exclusive-or) of the four single-precision,
771
-// floating-point values of a and b.
772
-// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
773
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
774
-{
775
-   return vreinterpretq_m128_s32(veorq_s32(vreinterpretq_s32_m128(a),
776
-                       vreinterpretq_s32_m128(b)));
777
-}
778
-
779
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
780
-//
781
-//   r := a | b
782
-//
783
-// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
784
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
785
-{
786
-   return vreinterpretq_m128i_s32(vorrq_s32(vreinterpretq_s32_m128i(a),
787
-                        vreinterpretq_s32_m128i(b)));
788
-}
789
-
790
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
791
-// b.  https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
792
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
793
-{
794
-   return vreinterpretq_m128i_s32(veorq_s32(vreinterpretq_s32_m128i(a),
795
-                        vreinterpretq_s32_m128i(b)));
796
-}
797
-
798
-// Moves the upper two values of B into the lower two values of A.
799
-//
800
-//   r3 := a3
801
-//   r2 := a2
802
-//   r1 := b3
803
-//   r0 := b2
804
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
805
-{
806
-   float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
807
-   float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
808
-   return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
809
-}
810
-
811
-// Moves the lower two values of B into the upper two values of A.
812
-//
813
-//   r3 := b1
814
-//   r2 := b0
815
-//   r1 := a1
816
-//   r0 := a0
817
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
818
-{
819
-   float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
820
-   float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
821
-   return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
822
-}
823
-
824
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
825
-{
826
-   return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
827
-}
828
-
829
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
830
-{
831
-   return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
832
-}
833
-
834
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
835
-{
836
-   return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
837
-}
838
-
839
-// Takes the upper 64 bits of a and places it in the low end of the result
840
-// Takes the lower 64 bits of b and places it into the high end of the result.
841
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
842
-{
843
-   float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
844
-   float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
845
-   return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
846
-}
847
-
848
-// takes the lower two 32-bit values from a, swaps them, and places them in the
849
-// high end of the result; takes the higher two 32-bit values from b, swaps them,
850
-// and places them in the low end of the result.
851
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
852
-{
853
-   float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
854
-   float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
855
-   return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
856
-}
857
-
858
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
859
-{
860
-   float32x2_t a21 = vget_high_f32(vextq_f32(
861
-       vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
862
-   float32x2_t b03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(b),
863
-                        vreinterpretq_f32_m128(b), 3));
864
-   return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
865
-}
866
-
867
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
868
-{
869
-   float32x2_t a03 = vget_low_f32(vextq_f32(vreinterpretq_f32_m128(a),
870
-                        vreinterpretq_f32_m128(a), 3));
871
-   float32x2_t b21 = vget_high_f32(vextq_f32(
872
-       vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
873
-   return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
874
-}
875
-
876
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
877
-{
878
-   float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
879
-   float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
880
-   return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
881
-}
882
-
883
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
884
-{
885
-   float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
886
-   float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
887
-   return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
888
-}
889
-
890
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
891
-{
892
-   float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
893
-   float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
894
-   return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
895
-}
896
-
897
-// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the
898
-// high
899
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
900
-{
901
-   float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
902
-   float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
903
-   return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
904
-}
905
-
906
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
907
-{
908
-   float32x2_t a11 =
909
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
910
-   float32x2_t b00 =
911
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
912
-   return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
913
-}
914
-
915
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
916
-{
917
-   float32x2_t a22 =
918
-       vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
919
-   float32x2_t b00 =
920
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
921
-   return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
922
-}
923
-
924
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
925
-{
926
-   float32x2_t a00 =
927
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
928
-   float32x2_t b22 =
929
-       vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
930
-   return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
931
-}
932
-
933
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
934
-{
935
-   float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
936
-   float32x2_t a22 =
937
-       vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
938
-   float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
939
-   float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
940
-   return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
941
-}
942
-
943
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
944
-{
945
-   float32x2_t a33 =
946
-       vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
947
-   float32x2_t b11 =
948
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
949
-   return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
950
-}
951
-
952
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
953
-{
954
-   float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
955
-   float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
956
-   float32x2_t b00 =
957
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
958
-   float32x2_t b20 = vset_lane_f32(b2, b00, 1);
959
-   return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
960
-}
961
-
962
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
963
-{
964
-   float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
965
-   float32_t b2 = vgetq_lane_f32(b, 2);
966
-   float32x2_t b00 =
967
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
968
-   float32x2_t b20 = vset_lane_f32(b2, b00, 1);
969
-   return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
970
-}
971
-
972
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
973
-{
974
-   float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
975
-   float32_t b2 = vgetq_lane_f32(b, 2);
976
-   float32x2_t b00 =
977
-       vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
978
-   float32x2_t b20 = vset_lane_f32(b2, b00, 1);
979
-   return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
980
-}
981
-
982
-// NEON does not support a general purpose permute intrinsic
983
-// Selects four specific single-precision, floating-point values from a and b,
984
-// based on the mask i.
985
-// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
986
-#if 0 /* C version */
987
-FORCE_INLINE __m128 _mm_shuffle_ps_default(__m128 a,
988
-                                           __m128 b,
989
-                                           __constrange(0, 255) int imm)
990
-{
991
-    __m128 ret;
992
-    ret[0] = a[imm & 0x3];
993
-    ret[1] = a[(imm >> 2) & 0x3];
994
-    ret[2] = b[(imm >> 4) & 0x03];
995
-    ret[3] = b[(imm >> 6) & 0x03];
996
-    return ret;
997
-}
998
-#endif
999
-#define _mm_shuffle_ps_default(a, b, imm)                                      \
1000
-   __extension__({                                                        \
1001
-       float32x4_t ret;                                               \
1002
-       ret = vmovq_n_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a),    \
1003
-                        (imm) & (0x3)));              \
1004
-       ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(a), \
1005
-                           ((imm) >> 2) & 0x3),       \
1006
-                    ret, 1);                                  \
1007
-       ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), \
1008
-                           ((imm) >> 4) & 0x3),       \
1009
-                    ret, 2);                                  \
1010
-       ret = vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), \
1011
-                           ((imm) >> 6) & 0x3),       \
1012
-                    ret, 3);                                  \
1013
-       vreinterpretq_m128_f32(ret);                                   \
1014
-   })
1015
-
1016
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
1017
-// int imm)
1018
-#if __has_builtin(__builtin_shufflevector)
1019
-#define _mm_shuffle_ps(a, b, imm)                                            \
1020
-   __extension__({                                                      \
1021
-       float32x4_t _input1 = vreinterpretq_f32_m128(a);             \
1022
-       float32x4_t _input2 = vreinterpretq_f32_m128(b);             \
1023
-       float32x4_t _shuf = __builtin_shufflevector(                 \
1024
-           _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1025
-           (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
1026
-       vreinterpretq_m128_f32(_shuf);                               \
1027
-   })
1028
-#else // generic
1029
-#define _mm_shuffle_ps(a, b, imm)                                      \
1030
-   __extension__({                                                \
1031
-       __m128 ret;                                            \
1032
-       switch (imm) {                                         \
1033
-       case _MM_SHUFFLE(1, 0, 3, 2):                          \
1034
-           ret = _mm_shuffle_ps_1032((a), (b));           \
1035
-           break;                                         \
1036
-       case _MM_SHUFFLE(2, 3, 0, 1):                          \
1037
-           ret = _mm_shuffle_ps_2301((a), (b));           \
1038
-           break;                                         \
1039
-       case _MM_SHUFFLE(0, 3, 2, 1):                          \
1040
-           ret = _mm_shuffle_ps_0321((a), (b));           \
1041
-           break;                                         \
1042
-       case _MM_SHUFFLE(2, 1, 0, 3):                          \
1043
-           ret = _mm_shuffle_ps_2103((a), (b));           \
1044
-           break;                                         \
1045
-       case _MM_SHUFFLE(1, 0, 1, 0):                          \
1046
-           ret = _mm_movelh_ps((a), (b));                 \
1047
-           break;                                         \
1048
-       case _MM_SHUFFLE(1, 0, 0, 1):                          \
1049
-           ret = _mm_shuffle_ps_1001((a), (b));           \
1050
-           break;                                         \
1051
-       case _MM_SHUFFLE(0, 1, 0, 1):                          \
1052
-           ret = _mm_shuffle_ps_0101((a), (b));           \
1053
-           break;                                         \
1054
-       case _MM_SHUFFLE(3, 2, 1, 0):                          \
1055
-           ret = _mm_shuffle_ps_3210((a), (b));           \
1056
-           break;                                         \
1057
-       case _MM_SHUFFLE(0, 0, 1, 1):                          \
1058
-           ret = _mm_shuffle_ps_0011((a), (b));           \
1059
-           break;                                         \
1060
-       case _MM_SHUFFLE(0, 0, 2, 2):                          \
1061
-           ret = _mm_shuffle_ps_0022((a), (b));           \
1062
-           break;                                         \
1063
-       case _MM_SHUFFLE(2, 2, 0, 0):                          \
1064
-           ret = _mm_shuffle_ps_2200((a), (b));           \
1065
-           break;                                         \
1066
-       case _MM_SHUFFLE(3, 2, 0, 2):                          \
1067
-           ret = _mm_shuffle_ps_3202((a), (b));           \
1068
-           break;                                         \
1069
-       case _MM_SHUFFLE(3, 2, 3, 2):                          \
1070
-           ret = _mm_movehl_ps((b), (a));                 \
1071
-           break;                                         \
1072
-       case _MM_SHUFFLE(1, 1, 3, 3):                          \
1073
-           ret = _mm_shuffle_ps_1133((a), (b));           \
1074
-           break;                                         \
1075
-       case _MM_SHUFFLE(2, 0, 1, 0):                          \
1076
-           ret = _mm_shuffle_ps_2010((a), (b));           \
1077
-           break;                                         \
1078
-       case _MM_SHUFFLE(2, 0, 0, 1):                          \
1079
-           ret = _mm_shuffle_ps_2001((a), (b));           \
1080
-           break;                                         \
1081
-       case _MM_SHUFFLE(2, 0, 3, 2):                          \
1082
-           ret = _mm_shuffle_ps_2032((a), (b));           \
1083
-           break;                                         \
1084
-       default:                                               \
1085
-           ret = _mm_shuffle_ps_default((a), (b), (imm)); \
1086
-           break;                                         \
1087
-       }                                                      \
1088
-       ret;                                                   \
1089
-   })
1090
-#endif
1091
-
1092
-// Takes the upper 64 bits of a and places it in the low end of the result
1093
-// Takes the lower 64 bits of a and places it into the high end of the result.
1094
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
1095
-{
1096
-   int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1097
-   int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1098
-   return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
1099
-}
1100
-
1101
-// takes the lower two 32-bit values from a, swaps them, and places them in the
1102
-// low end of the result; takes the higher two 32-bit values from a, swaps them,
1103
-// and places them in the high end of the result.
1104
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
1105
-{
1106
-   int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1107
-   int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1108
-   return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1109
-}
1110
-
1111
-// rotates the least significant 32 bits into the most significant 32 bits, and
1112
-// shifts the rest down
1113
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
1114
-{
1115
-   return vreinterpretq_m128i_s32(vextq_s32(
1116
-       vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
1117
-}
1118
-
1119
-// rotates the most significant 32 bits into the least significant 32 bits, and
1120
-// shifts the rest up
1121
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
1122
-{
1123
-   return vreinterpretq_m128i_s32(vextq_s32(
1124
-       vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
1125
-}
1126
-
1127
-// gets the lower 64 bits of a, and places it in the upper 64 bits
1128
-// gets the lower 64 bits of a and places it in the lower 64 bits
1129
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
1130
-{
1131
-   int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1132
-   return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1133
-}
1134
-
1135
-// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
1136
-// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits
1137
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
1138
-{
1139
-   int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1140
-   int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1141
-   return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1142
-}
1143
-
1144
-// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the
1145
-// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and
1146
-// places it in the lower 64 bits
1147
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
1148
-{
1149
-   int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1150
-   return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1151
-}
1152
-
1153
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
1154
-{
1155
-   int32x2_t a11 =
1156
-       vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1157
-   int32x2_t a22 =
1158
-       vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1159
-   return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1160
-}
1161
-
1162
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
1163
-{
1164
-   int32x2_t a22 =
1165
-       vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1166
-   int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1167
-   return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1168
-}
1169
-
1170
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
1171
-{
1172
-   int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1173
-   int32x2_t a33 =
1174
-       vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1175
-   return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1176
-}
1177
-
1178
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
1179
-// corresponding 8-bit element of b, and store the results in dst.
1180
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8&expand=5146
1181
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
1182
-{
1183
-   int8x16_t tbl = vreinterpretq_s8_m128i(a);  // input a
1184
-   uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
1185
-   uint8x16_t idx_masked =
1186
-       vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
1187
-#if defined(__aarch64__)
1188
-   return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
1189
-#elif defined(__GNUC__)
1190
-   int8x16_t ret;
1191
-   // %e and %f represent the even and odd D registers
1192
-   // respectively.
1193
-   __asm__ __volatile__("vtbl.8  %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
1194
-                "vtbl.8  %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
1195
-                : [ret] "=&w"(ret)
1196
-                : [tbl] "w"(tbl), [idx] "w"(idx_masked));
1197
-   return vreinterpretq_m128i_s8(ret);
1198
-#else
1199
-   // use this line if testing on aarch64
1200
-   int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
1201
-   return vreinterpretq_m128i_s8(
1202
-       vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
1203
-               vtbl2_s8(a_split, vget_high_u8(idx_masked))));
1204
-#endif
1205
-}
1206
-
1207
-#if 0 /* C version */
1208
-FORCE_INLINE __m128i _mm_shuffle_epi32_default(__m128i a,
1209
-                                               __constrange(0, 255) int imm)
1210
-{
1211
-    __m128i ret;
1212
-    ret[0] = a[imm & 0x3];
1213
-    ret[1] = a[(imm >> 2) & 0x3];
1214
-    ret[2] = a[(imm >> 4) & 0x03];
1215
-    ret[3] = a[(imm >> 6) & 0x03];
1216
-    return ret;
1217
-}
1218
-#endif
1219
-#define _mm_shuffle_epi32_default(a, imm)                                    \
1220
-   __extension__({                                                      \
1221
-       int32x4_t ret;                                               \
1222
-       ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
1223
-                        (imm) & (0x3)));            \
1224
-       ret = vsetq_lane_s32(                                        \
1225
-           vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
1226
-                      ((imm) >> 2) & 0x3),                  \
1227
-           ret, 1);                                             \
1228
-       ret = vsetq_lane_s32(                                        \
1229
-           vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
1230
-                      ((imm) >> 4) & 0x3),                  \
1231
-           ret, 2);                                             \
1232
-       ret = vsetq_lane_s32(                                        \
1233
-           vgetq_lane_s32(vreinterpretq_s32_m128i(a),           \
1234
-                      ((imm) >> 6) & 0x3),                  \
1235
-           ret, 3);                                             \
1236
-       vreinterpretq_m128i_s32(ret);                                \
1237
-   })
1238
-
1239
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
1240
-// int imm)
1241
-#if defined(__aarch64__)
1242
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
1243
-   __extension__({                                                      \
1244
-       vreinterpretq_m128i_s32(                                     \
1245
-           vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
1246
-   })
1247
-#else
1248
-#define _mm_shuffle_epi32_splat(a, imm)                                      \
1249
-   __extension__({                                                      \
1250
-       vreinterpretq_m128i_s32(vdupq_n_s32(                         \
1251
-           vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
1252
-   })
1253
-#endif
1254
-
1255
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
1256
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
1257
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
1258
-//                                        __constrange(0,255) int imm)
1259
-#if __has_builtin(__builtin_shufflevector)
1260
-#define _mm_shuffle_epi32(a, imm)                                          \
1261
-   __extension__({                                                    \
1262
-       int32x4_t _input = vreinterpretq_s32_m128i(a);             \
1263
-       int32x4_t _shuf = __builtin_shufflevector(                 \
1264
-           _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
1265
-           ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3);           \
1266
-       vreinterpretq_m128i_s32(_shuf);                            \
1267
-   })
1268
-#else // generic
1269
-#define _mm_shuffle_epi32(a, imm)                                    \
1270
-   __extension__({                                              \
1271
-       __m128i ret;                                         \
1272
-       switch (imm) {                                       \
1273
-       case _MM_SHUFFLE(1, 0, 3, 2):                        \
1274
-           ret = _mm_shuffle_epi_1032((a));             \
1275
-           break;                                       \
1276
-       case _MM_SHUFFLE(2, 3, 0, 1):                        \
1277
-           ret = _mm_shuffle_epi_2301((a));             \
1278
-           break;                                       \
1279
-       case _MM_SHUFFLE(0, 3, 2, 1):                        \
1280
-           ret = _mm_shuffle_epi_0321((a));             \
1281
-           break;                                       \
1282
-       case _MM_SHUFFLE(2, 1, 0, 3):                        \
1283
-           ret = _mm_shuffle_epi_2103((a));             \
1284
-           break;                                       \
1285
-       case _MM_SHUFFLE(1, 0, 1, 0):                        \
1286
-           ret = _mm_shuffle_epi_1010((a));             \
1287
-           break;                                       \
1288
-       case _MM_SHUFFLE(1, 0, 0, 1):                        \
1289
-           ret = _mm_shuffle_epi_1001((a));             \
1290
-           break;                                       \
1291
-       case _MM_SHUFFLE(0, 1, 0, 1):                        \
1292
-           ret = _mm_shuffle_epi_0101((a));             \
1293
-           break;                                       \
1294
-       case _MM_SHUFFLE(2, 2, 1, 1):                        \
1295
-           ret = _mm_shuffle_epi_2211((a));             \
1296
-           break;                                       \
1297
-       case _MM_SHUFFLE(0, 1, 2, 2):                        \
1298
-           ret = _mm_shuffle_epi_0122((a));             \
1299
-           break;                                       \
1300
-       case _MM_SHUFFLE(3, 3, 3, 2):                        \
1301
-           ret = _mm_shuffle_epi_3332((a));             \
1302
-           break;                                       \
1303
-       case _MM_SHUFFLE(0, 0, 0, 0):                        \
1304
-           ret = _mm_shuffle_epi32_splat((a), 0);       \
1305
-           break;                                       \
1306
-       case _MM_SHUFFLE(1, 1, 1, 1):                        \
1307
-           ret = _mm_shuffle_epi32_splat((a), 1);       \
1308
-           break;                                       \
1309
-       case _MM_SHUFFLE(2, 2, 2, 2):                        \
1310
-           ret = _mm_shuffle_epi32_splat((a), 2);       \
1311
-           break;                                       \
1312
-       case _MM_SHUFFLE(3, 3, 3, 3):                        \
1313
-           ret = _mm_shuffle_epi32_splat((a), 3);       \
1314
-           break;                                       \
1315
-       default:                                             \
1316
-           ret = _mm_shuffle_epi32_default((a), (imm)); \
1317
-           break;                                       \
1318
-       }                                                    \
1319
-       ret;                                                 \
1320
-   })
1321
-#endif
1322
-
1323
-// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
1324
-// by imm.
1325
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
1326
-// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
1327
-//                                                   __constrange(0,255) int
1328
-//                                                   imm)
1329
-#define _mm_shufflelo_epi16_function(a, imm)                                 \
1330
-   __extension__({                                                      \
1331
-       int16x8_t ret = vreinterpretq_s16_m128i(a);                  \
1332
-       int16x4_t lowBits = vget_low_s16(ret);                       \
1333
-       ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)),  \
1334
-                    ret, 0);                                \
1335
-       ret = vsetq_lane_s16(                                        \
1336
-           vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, 1); \
1337
-       ret = vsetq_lane_s16(                                        \
1338
-           vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, 2); \
1339
-       ret = vsetq_lane_s16(                                        \
1340
-           vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, 3); \
1341
-       vreinterpretq_m128i_s16(ret);                                \
1342
-   })
1343
-
1344
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
1345
-//                                          __constrange(0,255) int imm)
1346
-#if __has_builtin(__builtin_shufflevector)
1347
-#define _mm_shufflelo_epi16(a, imm)                                            \
1348
-   __extension__({                                                        \
1349
-       int16x8_t _input = vreinterpretq_s16_m128i(a);                 \
1350
-       int16x8_t _shuf = __builtin_shufflevector(                     \
1351
-           _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
1352
-           (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6,   \
1353
-           7);                                                    \
1354
-       vreinterpretq_m128i_s16(_shuf);                                \
1355
-   })
1356
-#else // generic
1357
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
1358
-#endif
1359
-
1360
-// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
1361
-// by imm.
1362
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
1363
-// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
1364
-//                                                   __constrange(0,255) int
1365
-//                                                   imm)
1366
-#define _mm_shufflehi_epi16_function(a, imm)                                  \
1367
-   __extension__({                                                       \
1368
-       int16x8_t ret = vreinterpretq_s16_m128i(a);                   \
1369
-       int16x4_t highBits = vget_high_s16(ret);                      \
1370
-       ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)),  \
1371
-                    ret, 4);                                 \
1372
-       ret = vsetq_lane_s16(                                         \
1373
-           vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, 5); \
1374
-       ret = vsetq_lane_s16(                                         \
1375
-           vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, 6); \
1376
-       ret = vsetq_lane_s16(                                         \
1377
-           vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, 7); \
1378
-       vreinterpretq_m128i_s16(ret);                                 \
1379
-   })
1380
-
1381
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
1382
-//                                          __constrange(0,255) int imm)
1383
-#if __has_builtin(__builtin_shufflevector)
1384
-#define _mm_shufflehi_epi16(a, imm)                                         \
1385
-   __extension__({                                                     \
1386
-       int16x8_t _input = vreinterpretq_s16_m128i(a);              \
1387
-       int16x8_t _shuf = __builtin_shufflevector(                  \
1388
-           _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4,    \
1389
-           (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
1390
-           (((imm) >> 6) & 0x3) + 4);                          \
1391
-       vreinterpretq_m128i_s16(_shuf);                             \
1392
-   })
1393
-#else // generic
1394
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
1395
-#endif
1396
-
1397
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
1398
-// the results in dst.
1399
-//
1400
-//   FOR j := 0 to 7
1401
-//       i := j*16
1402
-//       IF imm8[j]
1403
-//           dst[i+15:i] := b[i+15:i]
1404
-//       ELSE
1405
-//           dst[i+15:i] := a[i+15:i]
1406
-//       FI
1407
-//   ENDFOR
1408
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
1409
-//                                      __constrange(0,255) int imm)
1410
-#define _mm_blend_epi16(a, b, imm)                                     \
1411
-   __extension__({                                                \
1412
-       const uint16_t _mask[8] = {                            \
1413
-           ((imm) & (1 << 0)) ? 0xFFFF : 0x0000,          \
1414
-           ((imm) & (1 << 1)) ? 0xFFFF : 0x0000,          \
1415
-           ((imm) & (1 << 2)) ? 0xFFFF : 0x0000,          \
1416
-           ((imm) & (1 << 3)) ? 0xFFFF : 0x0000,          \
1417
-           ((imm) & (1 << 4)) ? 0xFFFF : 0x0000,          \
1418
-           ((imm) & (1 << 5)) ? 0xFFFF : 0x0000,          \
1419
-           ((imm) & (1 << 6)) ? 0xFFFF : 0x0000,          \
1420
-           ((imm) & (1 << 7)) ? 0xFFFF : 0x0000};         \
1421
-       uint16x8_t _mask_vec = vld1q_u16(_mask);               \
1422
-       uint16x8_t _a = vreinterpretq_u16_m128i(a);            \
1423
-       uint16x8_t _b = vreinterpretq_u16_m128i(b);            \
1424
-       vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
1425
-   })
1426
-
1427
-// Blend packed 8-bit integers from a and b using mask, and store the results in
1428
-// dst.
1429
-//
1430
-//   FOR j := 0 to 15
1431
-//       i := j*8
1432
-//       IF mask[i+7]
1433
-//           dst[i+7:i] := b[i+7:i]
1434
-//       ELSE
1435
-//           dst[i+7:i] := a[i+7:i]
1436
-//       FI
1437
-//   ENDFOR
1438
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
1439
-{
1440
-   // Use a signed shift right to create a mask with the sign bit
1441
-   uint8x16_t mask = vreinterpretq_u8_s8(
1442
-       vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
1443
-   uint8x16_t a = vreinterpretq_u8_m128i(_a);
1444
-   uint8x16_t b = vreinterpretq_u8_m128i(_b);
1445
-   return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
1446
-}
1447
-
1448
-/* Shifts */
1449
-
1450
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
1451
-// in the sign bit.
1452
-//
1453
-//   r0 := a0 >> count
1454
-//   r1 := a1 >> count
1455
-//   r2 := a2 >> count
1456
-//   r3 := a3 >> count
1457
-FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, int count)
1458
-{
1459
-   return (__m128i)vshlq_s32((int32x4_t)a, vdupq_n_s32(-count));
1460
-}
1461
-
1462
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
1463
-// in the sign bit.
1464
-//
1465
-//   r0 := a0 >> count
1466
-//   r1 := a1 >> count
1467
-//   ...
1468
-//   r7 := a7 >> count
1469
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int count)
1470
-{
1471
-   return (__m128i)vshlq_s16((int16x8_t)a, vdupq_n_s16(-count));
1472
-}
1473
-
1474
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1475
-// shifting in zeros.
1476
-//
1477
-//   r0 := a0 << count
1478
-//   r1 := a1 << count
1479
-//   ...
1480
-//   r7 := a7 << count
1481
-//
1482
-// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
1483
-#define _mm_slli_epi16(a, imm)                                       \
1484
-   __extension__({                                              \
1485
-       __m128i ret;                                         \
1486
-       if ((imm) <= 0) {                                    \
1487
-           ret = a;                                     \
1488
-       } else if ((imm) > 31) {                             \
1489
-           ret = _mm_setzero_si128();                   \
1490
-       } else {                                             \
1491
-           ret = vreinterpretq_m128i_s16(vshlq_n_s16(   \
1492
-               vreinterpretq_s16_m128i(a), (imm))); \
1493
-       }                                                    \
1494
-       ret;                                                 \
1495
-   })
1496
-
1497
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1498
-// shifting in zeros. :
1499
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
1500
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
1501
-#define _mm_slli_epi32(a, imm)                                       \
1502
-   __extension__({                                              \
1503
-       __m128i ret;                                         \
1504
-       if ((imm) <= 0) {                                    \
1505
-           ret = a;                                     \
1506
-       } else if ((imm) > 31) {                             \
1507
-           ret = _mm_setzero_si128();                   \
1508
-       } else {                                             \
1509
-           ret = vreinterpretq_m128i_s32(vshlq_n_s32(   \
1510
-               vreinterpretq_s32_m128i(a), (imm))); \
1511
-       }                                                    \
1512
-       ret;                                                 \
1513
-   })
1514
-
1515
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
1516
-// store the results in dst.
1517
-#define _mm_slli_epi64(a, imm)                                       \
1518
-   __extension__({                                              \
1519
-       __m128i ret;                                         \
1520
-       if ((imm) <= 0) {                                    \
1521
-           ret = a;                                     \
1522
-       } else if ((imm) > 63) {                             \
1523
-           ret = _mm_setzero_si128();                   \
1524
-       } else {                                             \
1525
-           ret = vreinterpretq_m128i_s64(vshlq_n_s64(   \
1526
-               vreinterpretq_s64_m128i(a), (imm))); \
1527
-       }                                                    \
1528
-       ret;                                                 \
1529
-   })
1530
-
1531
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
1532
-// while shifting in zeros.
1533
-//
1534
-//   r0 := srl(a0, count)
1535
-//   r1 := srl(a1, count)
1536
-//   ...
1537
-//   r7 := srl(a7, count)
1538
-//
1539
-// https://msdn.microsoft.com/en-us/library/6tcwd38t(v=vs.90).aspx
1540
-#define _mm_srli_epi16(a, imm)                                       \
1541
-   __extension__({                                              \
1542
-       __m128i ret;                                         \
1543
-       if ((imm) <= 0) {                                    \
1544
-           ret = a;                                     \
1545
-       } else if ((imm) > 31) {                             \
1546
-           ret = _mm_setzero_si128();                   \
1547
-       } else {                                             \
1548
-           ret = vreinterpretq_m128i_u16(vshrq_n_u16(   \
1549
-               vreinterpretq_u16_m128i(a), (imm))); \
1550
-       }                                                    \
1551
-       ret;                                                 \
1552
-   })
1553
-
1554
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
1555
-// while shifting in zeros.
1556
-// https://msdn.microsoft.com/en-us/library/w486zcfa(v=vs.100).aspx FORCE_INLINE
1557
-// __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
1558
-#define _mm_srli_epi32(a, imm)                                       \
1559
-   __extension__({                                              \
1560
-       __m128i ret;                                         \
1561
-       if ((imm) <= 0) {                                    \
1562
-           ret = a;                                     \
1563
-       } else if ((imm) > 31) {                             \
1564
-           ret = _mm_setzero_si128();                   \
1565
-       } else {                                             \
1566
-           ret = vreinterpretq_m128i_u32(vshrq_n_u32(   \
1567
-               vreinterpretq_u32_m128i(a), (imm))); \
1568
-       }                                                    \
1569
-       ret;                                                 \
1570
-   })
1571
-
1572
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
1573
-// store the results in dst.
1574
-#define _mm_srli_epi64(a, imm)                                       \
1575
-   __extension__({                                              \
1576
-       __m128i ret;                                         \
1577
-       if ((imm) <= 0) {                                    \
1578
-           ret = a;                                     \
1579
-       } else if ((imm) > 63) {                             \
1580
-           ret = _mm_setzero_si128();                   \
1581
-       } else {                                             \
1582
-           ret = vreinterpretq_m128i_u64(vshrq_n_u64(   \
1583
-               vreinterpretq_u64_m128i(a), (imm))); \
1584
-       }                                                    \
1585
-       ret;                                                 \
1586
-   })
1587
-
1588
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
1589
-// in the sign bit.
1590
-// https://msdn.microsoft.com/en-us/library/z1939387(v=vs.100).aspx
1591
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
1592
-#define _mm_srai_epi32(a, imm)                                                \
1593
-   __extension__({                                                       \
1594
-       __m128i ret;                                                  \
1595
-       if ((imm) <= 0) {                                             \
1596
-           ret = a;                                              \
1597
-       } else if ((imm) > 31) {                                      \
1598
-           ret = vreinterpretq_m128i_s32(                        \
1599
-               vshrq_n_s32(vreinterpretq_s32_m128i(a), 16)); \
1600
-           ret = vreinterpretq_m128i_s32(vshrq_n_s32(            \
1601
-               vreinterpretq_s32_m128i(ret), 16));           \
1602
-       } else {                                                      \
1603
-           ret = vreinterpretq_m128i_s32(vshrq_n_s32(            \
1604
-               vreinterpretq_s32_m128i(a), (imm)));          \
1605
-       }                                                             \
1606
-       ret;                                                          \
1607
-   })
1608
-
1609
-// Shifts the 128-bit value in a right by imm bytes while shifting in
1610
-// zeros. imm must be an immediate.
1611
-//
1612
-//   r := srl(a, imm*8)
1613
-//
1614
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
1615
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
1616
-#define _mm_srli_si128(a, imm)                                      \
1617
-   __extension__({                                             \
1618
-       __m128i ret;                                        \
1619
-       if ((imm) <= 0) {                                   \
1620
-           ret = a;                                    \
1621
-       } else if ((imm) > 15) {                            \
1622
-           ret = _mm_setzero_si128();                  \
1623
-       } else {                                            \
1624
-           ret = vreinterpretq_m128i_s8(               \
1625
-               vextq_s8(vreinterpretq_s8_m128i(a), \
1626
-                    vdupq_n_s8(0), (imm)));    \
1627
-       }                                                   \
1628
-       ret;                                                \
1629
-   })
1630
-
1631
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
1632
-// must be an immediate.
1633
-//
1634
-//   r := a << (imm * 8)
1635
-//
1636
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
1637
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
1638
-#define _mm_slli_si128(a, imm)                                            \
1639
-   __extension__({                                                   \
1640
-       __m128i ret;                                              \
1641
-       if ((imm) <= 0) {                                         \
1642
-           ret = a;                                          \
1643
-       } else if ((imm) > 15) {                                  \
1644
-           ret = _mm_setzero_si128();                        \
1645
-       } else {                                                  \
1646
-           ret = vreinterpretq_m128i_s8(vextq_s8(            \
1647
-               vdupq_n_s8(0), vreinterpretq_s8_m128i(a), \
1648
-               16 - (imm)));                             \
1649
-       }                                                         \
1650
-       ret;                                                      \
1651
-   })
1652
-
1653
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
1654
-// shifting in zeros.
1655
-//
1656
-//   r0 := a0 << count
1657
-//   r1 := a1 << count
1658
-//   ...
1659
-//   r7 := a7 << count
1660
-//
1661
-// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
1662
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
1663
-{
1664
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1665
-   if (c > 15)
1666
-       return _mm_setzero_si128();
1667
-
1668
-   int16x8_t vc = vdupq_n_s16((int16_t)c);
1669
-   return vreinterpretq_m128i_s16(
1670
-       vshlq_s16(vreinterpretq_s16_m128i(a), vc));
1671
-}
1672
-
1673
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
1674
-// shifting in zeros.
1675
-//
1676
-// r0 := a0 << count
1677
-// r1 := a1 << count
1678
-// r2 := a2 << count
1679
-// r3 := a3 << count
1680
-//
1681
-// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
1682
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
1683
-{
1684
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1685
-   if (c > 31)
1686
-       return _mm_setzero_si128();
1687
-
1688
-   int32x4_t vc = vdupq_n_s32((int32_t)c);
1689
-   return vreinterpretq_m128i_s32(
1690
-       vshlq_s32(vreinterpretq_s32_m128i(a), vc));
1691
-}
1692
-
1693
-// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
1694
-// shifting in zeros.
1695
-//
1696
-// r0 := a0 << count
1697
-// r1 := a1 << count
1698
-//
1699
-// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
1700
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
1701
-{
1702
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1703
-   if (c > 63)
1704
-       return _mm_setzero_si128();
1705
-
1706
-   int64x2_t vc = vdupq_n_s64((int64_t)c);
1707
-   return vreinterpretq_m128i_s64(
1708
-       vshlq_s64(vreinterpretq_s64_m128i(a), vc));
1709
-}
1710
-
1711
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
1712
-// while shifting in zeros.
1713
-//
1714
-// r0 := srl(a0, count)
1715
-// r1 := srl(a1, count)
1716
-// ...
1717
-// r7 := srl(a7, count)
1718
-//
1719
-// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
1720
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
1721
-{
1722
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1723
-   if (c > 15)
1724
-       return _mm_setzero_si128();
1725
-
1726
-   int16x8_t vc = vdupq_n_s16(-(int16_t)c);
1727
-   return vreinterpretq_m128i_u16(
1728
-       vshlq_u16(vreinterpretq_u16_m128i(a), vc));
1729
-}
1730
-
1731
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
1732
-// while shifting in zeros.
1733
-//
1734
-// r0 := srl(a0, count)
1735
-// r1 := srl(a1, count)
1736
-// r2 := srl(a2, count)
1737
-// r3 := srl(a3, count)
1738
-//
1739
-// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
1740
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
1741
-{
1742
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1743
-   if (c > 31)
1744
-       return _mm_setzero_si128();
1745
-
1746
-   int32x4_t vc = vdupq_n_s32(-(int32_t)c);
1747
-   return vreinterpretq_m128i_u32(
1748
-       vshlq_u32(vreinterpretq_u32_m128i(a), vc));
1749
-}
1750
-
1751
-// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
1752
-// while shifting in zeros.
1753
-//
1754
-// r0 := srl(a0, count)
1755
-// r1 := srl(a1, count)
1756
-//
1757
-// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
1758
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
1759
-{
1760
-   uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
1761
-   if (c > 63)
1762
-       return _mm_setzero_si128();
1763
-
1764
-   int64x2_t vc = vdupq_n_s64(-(int64_t)c);
1765
-   return vreinterpretq_m128i_u64(
1766
-       vshlq_u64(vreinterpretq_u64_m128i(a), vc));
1767
-}
1768
-
1769
-// NEON does not provide a version of this function.
1770
-// Creates a 16-bit mask from the most significant bits of the 16 signed or
1771
-// unsigned 8-bit integers in a and zero extends the upper bits.
1772
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
1773
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
1774
-{
1775
-#if defined(__aarch64__)
1776
-   uint8x16_t input = vreinterpretq_u8_m128i(a);
1777
-   const int8_t ALIGN_STRUCT(16) xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0,
1778
-                       -7, -6, -5, -4, -3, -2, -1, 0};
1779
-   const uint8x16_t mask_and = vdupq_n_u8(0x80);
1780
-   const int8x16_t mask_shift = vld1q_s8(xr);
1781
-   const uint8x16_t mask_result =
1782
-       vshlq_u8(vandq_u8(input, mask_and), mask_shift);
1783
-   uint8x8_t lo = vget_low_u8(mask_result);
1784
-   uint8x8_t hi = vget_high_u8(mask_result);
1785
-
1786
-   return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
1787
-#else
1788
-   // Use increasingly wide shifts+adds to collect the sign bits
1789
-   // together.
1790
-   // Since the widening shifts would be rather confusing to follow in little
1791
-   // endian, everything will be illustrated in big endian order instead. This
1792
-   // has a different result - the bits would actually be reversed on a big
1793
-   // endian machine.
1794
-
1795
-   // Starting input (only half the elements are shown):
1796
-   // 89 ff 1d c0 00 10 99 33
1797
-   uint8x16_t input = vreinterpretq_u8_m128i(a);
1798
-
1799
-   // Shift out everything but the sign bits with an unsigned shift right.
1800
-   //
1801
-   // Bytes of the vector:
1802
-   // 89 ff 1d c0 00 10 99 33
1803
-   // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
1804
-   //  |  |  |  |  |  |  |  |
1805
-   // 01 01 00 01 00 00 01 00
1806
-   //
1807
-   // Bits of first important lane(s):
1808
-   // 10001001 (89)
1809
-   // \______
1810
-   //        |
1811
-   // 00000001 (01)
1812
-   uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
1813
-
1814
-   // Merge the even lanes together with a 16-bit unsigned shift right + add.
1815
-   // 'xx' represents garbage data which will be ignored in the final result.
1816
-   // In the important bytes, the add functions like a binary OR.
1817
-   //
1818
-   // 01 01 00 01 00 00 01 00
1819
-   //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
1820
-   //    \|    \|    \|    \|
1821
-   // xx 03 xx 01 xx 00 xx 02
1822
-   //
1823
-   // 00000001 00000001 (01 01)
1824
-   //        \_______ |
1825
-   //                \|
1826
-   // xxxxxxxx xxxxxx11 (xx 03)
1827
-   uint32x4_t paired16 =
1828
-       vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
1829
-
1830
-   // Repeat with a wider 32-bit shift + add.
1831
-   // xx 03 xx 01 xx 00 xx 02
1832
-   //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >>
1833
-   //     14))
1834
-   //          \|          \|
1835
-   // xx xx xx 0d xx xx xx 02
1836
-   //
1837
-   // 00000011 00000001 (03 01)
1838
-   //        \\_____ ||
1839
-   //         '----.\||
1840
-   // xxxxxxxx xxxx1101 (xx 0d)
1841
-   uint64x2_t paired32 =
1842
-       vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
1843
-
1844
-   // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
1845
-   // lanes. xx xx xx 0d xx xx xx 02
1846
-   //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >>
1847
-   //            28))
1848
-   //                      \|
1849
-   // xx xx xx xx xx xx xx d2
1850
-   //
1851
-   // 00001101 00000010 (0d 02)
1852
-   //     \   \___ |  |
1853
-   //      '---.  \|  |
1854
-   // xxxxxxxx 11010010 (xx d2)
1855
-   uint8x16_t paired64 =
1856
-       vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
1857
-
1858
-   // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
1859
-   // xx xx xx xx xx xx xx d2
1860
-   //                      ||  return paired64[0]
1861
-   //                      d2
1862
-   // Note: Little endian would return the correct value 4b (01001011) instead.
1863
-   return vgetq_lane_u8(paired64, 0) |
1864
-          ((int)vgetq_lane_u8(paired64, 8) << 8);
1865
-#endif
1866
-}
1867
-
1868
-// NEON does not provide this method
1869
-// Creates a 4-bit mask from the most significant bits of the four
1870
-// single-precision, floating-point values.
1871
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
1872
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
1873
-{
1874
-   uint32x4_t input = vreinterpretq_u32_m128(a);
1875
-#if defined(__aarch64__)
1876
-   static const int32x4_t shift = {-31, -30, -29, -28};
1877
-   static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
1878
-                      0x80000000};
1879
-   return vaddvq_u32(vshlq_u32(vandq_u32(input, highbit), shift));
1880
-#else
1881
-   // Uses the exact same method as _mm_movemask_epi8, see that for details.
1882
-   // Shift out everything but the sign bits with a 32-bit unsigned shift
1883
-   // right.
1884
-   uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
1885
-   // Merge the two pairs together with a 64-bit unsigned shift right + add.
1886
-   uint8x16_t paired =
1887
-       vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
1888
-   // Extract the result.
1889
-   return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
1890
-#endif
1891
-}
1892
-
1893
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
1894
-// mask, and return 1 if the result is zero, otherwise return 0.
1895
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros&expand=5871
1896
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
1897
-{
1898
-   int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a),
1899
-                    vreinterpretq_s64_m128i(mask));
1900
-   return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1))
1901
-              ? 0
1902
-              : 1;
1903
-}
1904
-
1905
-/* Math operations */
1906
-
1907
-// Subtracts the four single-precision, floating-point values of a and b.
1908
-//
1909
-//   r0 := a0 - b0
1910
-//   r1 := a1 - b1
1911
-//   r2 := a2 - b2
1912
-//   r3 := a3 - b3
1913
-//
1914
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
1915
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
1916
-{
1917
-   return vreinterpretq_m128_f32(vsubq_f32(vreinterpretq_f32_m128(a),
1918
-                       vreinterpretq_f32_m128(b)));
1919
-}
1920
-
1921
-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
1922
-// and store the results in dst.
1923
-//    r0 := a0 - b0
1924
-//    r1 := a1 - b1
1925
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
1926
-{
1927
-   return vreinterpretq_m128i_s64(vsubq_s64(vreinterpretq_s64_m128i(a),
1928
-                        vreinterpretq_s64_m128i(b)));
1929
-}
1930
-
1931
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
1932
-// unsigned 32-bit integers of a.
1933
-//
1934
-//   r0 := a0 - b0
1935
-//   r1 := a1 - b1
1936
-//   r2 := a2 - b2
1937
-//   r3 := a3 - b3
1938
-//
1939
-// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
1940
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
1941
-{
1942
-   return vreinterpretq_m128i_s32(vsubq_s32(vreinterpretq_s32_m128i(a),
1943
-                        vreinterpretq_s32_m128i(b)));
1944
-}
1945
-
1946
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
1947
-{
1948
-   return vreinterpretq_m128i_s16(vsubq_s16(vreinterpretq_s16_m128i(a),
1949
-                        vreinterpretq_s16_m128i(b)));
1950
-}
1951
-
1952
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
1953
-{
1954
-   return vreinterpretq_m128i_s8(
1955
-       vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
1956
-}
1957
-
1958
-// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
1959
-// integers of a and saturates.
1960
-// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
1961
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
1962
-{
1963
-   return vreinterpretq_m128i_u16(vqsubq_u16(vreinterpretq_u16_m128i(a),
1964
-                         vreinterpretq_u16_m128i(b)));
1965
-}
1966
-
1967
-// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
1968
-// integers of a and saturates.
1969
-//
1970
-//   r0 := UnsignedSaturate(a0 - b0)
1971
-//   r1 := UnsignedSaturate(a1 - b1)
1972
-//   ...
1973
-//   r15 := UnsignedSaturate(a15 - b15)
1974
-//
1975
-// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
1976
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
1977
-{
1978
-   return vreinterpretq_m128i_u8(vqsubq_u8(vreinterpretq_u8_m128i(a),
1979
-                       vreinterpretq_u8_m128i(b)));
1980
-}
1981
-
1982
-// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
1983
-// of a and saturates.
1984
-//
1985
-//   r0 := SignedSaturate(a0 - b0)
1986
-//   r1 := SignedSaturate(a1 - b1)
1987
-//   ...
1988
-//   r15 := SignedSaturate(a15 - b15)
1989
-//
1990
-// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
1991
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
1992
-{
1993
-   return vreinterpretq_m128i_s8(vqsubq_s8(vreinterpretq_s8_m128i(a),
1994
-                       vreinterpretq_s8_m128i(b)));
1995
-}
1996
-
1997
-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
1998
-// of a and saturates.
1999
-//
2000
-//   r0 := SignedSaturate(a0 - b0)
2001
-//   r1 := SignedSaturate(a1 - b1)
2002
-//   ...
2003
-//   r7 := SignedSaturate(a7 - b7)
2004
-//
2005
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
2006
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
2007
-{
2008
-   return vreinterpretq_m128i_s16(vqsubq_s16(vreinterpretq_s16_m128i(a),
2009
-                         vreinterpretq_s16_m128i(b)));
2010
-}
2011
-
2012
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
2013
-{
2014
-   return vreinterpretq_m128i_u16(vqaddq_u16(vreinterpretq_u16_m128i(a),
2015
-                         vreinterpretq_u16_m128i(b)));
2016
-}
2017
-
2018
-// Negate packed 8-bit integers in a when the corresponding signed
2019
-// 8-bit integer in b is negative, and store the results in dst.
2020
-// Elements in dst are zeroed out when the corresponding element
2021
-// in b is zero.
2022
-//
2023
-//   for i in 0..15
2024
-//     if b[i] < 0
2025
-//       r[i] := -a[i]
2026
-//     else if b[i] == 0
2027
-//       r[i] := 0
2028
-//     else
2029
-//       r[i] := a[i]
2030
-//     fi
2031
-//   done
2032
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
2033
-{
2034
-   int8x16_t a = vreinterpretq_s8_m128i(_a);
2035
-   int8x16_t b = vreinterpretq_s8_m128i(_b);
2036
-
2037
-   int8x16_t zero = vdupq_n_s8(0);
2038
-   // signed shift right: faster than vclt
2039
-   // (b < 0) ? 0xFF : 0
2040
-   uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
2041
-   // (b == 0) ? 0xFF : 0
2042
-   int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, zero));
2043
-   // -a
2044
-   int8x16_t neg = vnegq_s8(a);
2045
-   // bitwise select either a or neg based on ltMask
2046
-   int8x16_t masked = vbslq_s8(ltMask, a, neg);
2047
-   // res = masked & (~zeroMask)
2048
-   int8x16_t res = vbicq_s8(masked, zeroMask);
2049
-   return vreinterpretq_m128i_s8(res);
2050
-}
2051
-
2052
-// Negate packed 16-bit integers in a when the corresponding signed
2053
-// 16-bit integer in b is negative, and store the results in dst.
2054
-// Elements in dst are zeroed out when the corresponding element
2055
-// in b is zero.
2056
-//
2057
-//   for i in 0..7
2058
-//     if b[i] < 0
2059
-//       r[i] := -a[i]
2060
-//     else if b[i] == 0
2061
-//       r[i] := 0
2062
-//     else
2063
-//       r[i] := a[i]
2064
-//     fi
2065
-//   done
2066
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
2067
-{
2068
-   int16x8_t a = vreinterpretq_s16_m128i(_a);
2069
-   int16x8_t b = vreinterpretq_s16_m128i(_b);
2070
-
2071
-   int16x8_t zero = vdupq_n_s16(0);
2072
-   // signed shift right: faster than vclt
2073
-   // (b < 0) ? 0xFFFF : 0
2074
-   uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
2075
-   // (b == 0) ? 0xFFFF : 0
2076
-   int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, zero));
2077
-   // -a
2078
-   int16x8_t neg = vnegq_s16(a);
2079
-   // bitwise select either a or neg based on ltMask
2080
-   int16x8_t masked = vbslq_s16(ltMask, a, neg);
2081
-   // res = masked & (~zeroMask)
2082
-   int16x8_t res = vbicq_s16(masked, zeroMask);
2083
-   return vreinterpretq_m128i_s16(res);
2084
-}
2085
-
2086
-// Negate packed 32-bit integers in a when the corresponding signed
2087
-// 32-bit integer in b is negative, and store the results in dst.
2088
-// Elements in dst are zeroed out when the corresponding element
2089
-// in b is zero.
2090
-//
2091
-//   for i in 0..3
2092
-//     if b[i] < 0
2093
-//       r[i] := -a[i]
2094
-//     else if b[i] == 0
2095
-//       r[i] := 0
2096
-//     else
2097
-//       r[i] := a[i]
2098
-//     fi
2099
-//   done
2100
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
2101
-{
2102
-   int32x4_t a = vreinterpretq_s32_m128i(_a);
2103
-   int32x4_t b = vreinterpretq_s32_m128i(_b);
2104
-
2105
-   int32x4_t zero = vdupq_n_s32(0);
2106
-   // signed shift right: faster than vclt
2107
-   // (b < 0) ? 0xFFFFFFFF : 0
2108
-   uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
2109
-   // (b == 0) ? 0xFFFFFFFF : 0
2110
-   int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, zero));
2111
-   // neg = -a
2112
-   int32x4_t neg = vnegq_s32(a);
2113
-   // bitwise select either a or neg based on ltMask
2114
-   int32x4_t masked = vbslq_s32(ltMask, a, neg);
2115
-   // res = masked & (~zeroMask)
2116
-   int32x4_t res = vbicq_s32(masked, zeroMask);
2117
-   return vreinterpretq_m128i_s32(res);
2118
-}
2119
-
2120
-// Computes the average of the 16 unsigned 8-bit integers in a and the 16
2121
-// unsigned 8-bit integers in b and rounds.
2122
-//
2123
-//   r0 := (a0 + b0 + 1) >> 1
2124
-//   r1 := (a1 + b1 + 1) >> 1
2125
-//   ...
2126
-//   r15 := (a15 + b15 + 1) >> 1
2127
-//
2128
-// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
2129
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
2130
-{
2131
-   return vreinterpretq_m128i_u8(vrhaddq_u8(vreinterpretq_u8_m128i(a),
2132
-                        vreinterpretq_u8_m128i(b)));
2133
-}
2134
-
2135
-// Computes the average of the 8 unsigned 16-bit integers in a and the 8
2136
-// unsigned 16-bit integers in b and rounds.
2137
-//
2138
-//   r0 := (a0 + b0 + 1) >> 1
2139
-//   r1 := (a1 + b1 + 1) >> 1
2140
-//   ...
2141
-//   r7 := (a7 + b7 + 1) >> 1
2142
-//
2143
-// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
2144
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
2145
-{
2146
-   return (__m128i)vrhaddq_u16(vreinterpretq_u16_m128i(a),
2147
-                   vreinterpretq_u16_m128i(b));
2148
-}
2149
-
2150
-// Adds the four single-precision, floating-point values of a and b.
2151
-//
2152
-//   r0 := a0 + b0
2153
-//   r1 := a1 + b1
2154
-//   r2 := a2 + b2
2155
-//   r3 := a3 + b3
2156
-//
2157
-// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
2158
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
2159
-{
2160
-   return vreinterpretq_m128_f32(vaddq_f32(vreinterpretq_f32_m128(a),
2161
-                       vreinterpretq_f32_m128(b)));
2162
-}
2163
-
2164
-// adds the scalar single-precision floating point values of a and b.
2165
-// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
2166
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
2167
-{
2168
-   float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
2169
-   float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
2170
-   // the upper values in the result must be the remnants of <a>.
2171
-   return vreinterpretq_m128_f32(vaddq_f32(a, value));
2172
-}
2173
-
2174
-// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
2175
-// unsigned 64-bit integers in b.
2176
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2177
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2178
-{
2179
-   return vreinterpretq_m128i_s64(vaddq_s64(vreinterpretq_s64_m128i(a),
2180
-                        vreinterpretq_s64_m128i(b)));
2181
-}
2182
-
2183
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
2184
-// unsigned 32-bit integers in b.
2185
-//
2186
-//   r0 := a0 + b0
2187
-//   r1 := a1 + b1
2188
-//   r2 := a2 + b2
2189
-//   r3 := a3 + b3
2190
-//
2191
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
2192
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2193
-{
2194
-   return vreinterpretq_m128i_s32(vaddq_s32(vreinterpretq_s32_m128i(a),
2195
-                        vreinterpretq_s32_m128i(b)));
2196
-}
2197
-
2198
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
2199
-// unsigned 16-bit integers in b.
2200
-// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
2201
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2202
-{
2203
-   return vreinterpretq_m128i_s16(vaddq_s16(vreinterpretq_s16_m128i(a),
2204
-                        vreinterpretq_s16_m128i(b)));
2205
-}
2206
-
2207
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
2208
-// unsigned 8-bit integers in b.
2209
-// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
2210
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2211
-{
2212
-   return vreinterpretq_m128i_s8(
2213
-       vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2214
-}
2215
-
2216
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
2217
-// and saturates.
2218
-//
2219
-//   r0 := SignedSaturate(a0 + b0)
2220
-//   r1 := SignedSaturate(a1 + b1)
2221
-//   ...
2222
-//   r7 := SignedSaturate(a7 + b7)
2223
-//
2224
-// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
2225
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2226
-{
2227
-   return vreinterpretq_m128i_s16(vqaddq_s16(vreinterpretq_s16_m128i(a),
2228
-                         vreinterpretq_s16_m128i(b)));
2229
-}
2230
-
2231
-// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
2232
-// b and saturates.
2233
-// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
2234
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
2235
-{
2236
-   return vreinterpretq_m128i_u8(vqaddq_u8(vreinterpretq_u8_m128i(a),
2237
-                       vreinterpretq_u8_m128i(b)));
2238
-}
2239
-
2240
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
2241
-// unsigned 16-bit integers from b.
2242
-//
2243
-//   r0 := (a0 * b0)[15:0]
2244
-//   r1 := (a1 * b1)[15:0]
2245
-//   ...
2246
-//   r7 := (a7 * b7)[15:0]
2247
-//
2248
-// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
2249
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
2250
-{
2251
-   return vreinterpretq_m128i_s16(vmulq_s16(vreinterpretq_s16_m128i(a),
2252
-                        vreinterpretq_s16_m128i(b)));
2253
-}
2254
-
2255
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
2256
-// unsigned 32-bit integers from b.
2257
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
2258
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
2259
-{
2260
-   return vreinterpretq_m128i_s32(vmulq_s32(vreinterpretq_s32_m128i(a),
2261
-                        vreinterpretq_s32_m128i(b)));
2262
-}
2263
-
2264
-// Multiplies the four single-precision, floating-point values of a and b.
2265
-//
2266
-//   r0 := a0 * b0
2267
-//   r1 := a1 * b1
2268
-//   r2 := a2 * b2
2269
-//   r3 := a3 * b3
2270
-//
2271
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
2272
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2273
-{
2274
-   return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a),
2275
-                       vreinterpretq_f32_m128(b)));
2276
-}
2277
-
2278
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
2279
-// a and b, and store the unsigned 64-bit results in dst.
2280
-//
2281
-//   r0 :=  (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
2282
-//   r1 :=  (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
2283
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
2284
-{
2285
-   // vmull_u32 upcasts instead of masking, so we downcast.
2286
-   uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
2287
-   uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
2288
-   return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
2289
-}
2290
-
2291
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
2292
-// a and b, and store the signed 64-bit results in dst.
2293
-//
2294
-//   r0 :=  (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
2295
-//   r1 :=  (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
2296
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
2297
-{
2298
-   // vmull_s32 upcasts instead of masking, so we downcast.
2299
-   int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
2300
-   int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
2301
-   return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
2302
-}
2303
-
2304
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
2305
-// integers from b.
2306
-//
2307
-//   r0 := (a0 * b0) + (a1 * b1)
2308
-//   r1 := (a2 * b2) + (a3 * b3)
2309
-//   r2 := (a4 * b4) + (a5 * b5)
2310
-//   r3 := (a6 * b6) + (a7 * b7)
2311
-// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
2312
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
2313
-{
2314
-   int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
2315
-                 vget_low_s16(vreinterpretq_s16_m128i(b)));
2316
-   int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
2317
-                  vget_high_s16(vreinterpretq_s16_m128i(b)));
2318
-
2319
-   int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
2320
-   int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
2321
-
2322
-   return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
2323
-}
2324
-
2325
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
2326
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
2327
-// the packed 16-bit integers in dst.
2328
-//
2329
-//   r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
2330
-//   r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
2331
-//   r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
2332
-//   ...
2333
-//   r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
2334
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
2335
-{
2336
-   // Has issues due to saturation
2337
-   // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
2338
-
2339
-   // Multiply
2340
-   int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
2341
-                    vget_low_s16(vreinterpretq_s16_m128i(b)));
2342
-   int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
2343
-                    vget_high_s16(vreinterpretq_s16_m128i(b)));
2344
-
2345
-   // Rounding narrowing shift right
2346
-   // narrow = (int16_t)((mul + 16384) >> 15);
2347
-   int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
2348
-   int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
2349
-
2350
-   // Join together
2351
-   return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
2352
-}
2353
-
2354
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
2355
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
2356
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
2357
-// and pack the saturated results in dst.
2358
-//
2359
-//   FOR j := 0 to 7
2360
-//      i := j*16
2361
-//      dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
2362
-//      a[i+7:i]*b[i+7:i] )
2363
-//   ENDFOR
2364
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
2365
-{
2366
-   // This would be much simpler if x86 would choose to zero extend OR sign
2367
-   // extend, not both. This could probably be optimized better.
2368
-   uint16x8_t a = vreinterpretq_u16_m128i(_a);
2369
-   int16x8_t b = vreinterpretq_s16_m128i(_b);
2370
-
2371
-   // Zero extend a
2372
-   int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
2373
-   int16x8_t a_even =
2374
-       vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
2375
-
2376
-   // Sign extend by shifting left then shifting right.
2377
-   int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
2378
-   int16x8_t b_odd = vshrq_n_s16(b, 8);
2379
-
2380
-   // multiply
2381
-   int16x8_t prod1 = vmulq_s16(a_even, b_even);
2382
-   int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
2383
-
2384
-   // saturated add
2385
-   return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
2386
-}
2387
-
2388
-// Computes the absolute difference of the 16 unsigned 8-bit integers from a
2389
-// and the 16 unsigned 8-bit integers from b.
2390
-//
2391
-// Return Value
2392
-// Sums the upper 8 differences and lower 8 differences and packs the
2393
-// resulting 2 unsigned 16-bit integers into the upper and lower 64-bit
2394
-// elements.
2395
-//
2396
-//   r0 := abs(a0 - b0) + abs(a1 - b1) +...+ abs(a7 - b7)
2397
-//   r1 := 0x0
2398
-//   r2 := 0x0
2399
-//   r3 := 0x0
2400
-//   r4 := abs(a8 - b8) + abs(a9 - b9) +...+ abs(a15 - b15)
2401
-//   r5 := 0x0
2402
-//   r6 := 0x0
2403
-//   r7 := 0x0
2404
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
2405
-{
2406
-   uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t)a, (uint8x16_t)b));
2407
-   uint16_t r0 = t[0] + t[1] + t[2] + t[3];
2408
-   uint16_t r4 = t[4] + t[5] + t[6] + t[7];
2409
-   uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
2410
-   return (__m128i)vsetq_lane_u16(r4, r, 4);
2411
-}
2412
-
2413
-// Divides the four single-precision, floating-point values of a and b.
2414
-//
2415
-//   r0 := a0 / b0
2416
-//   r1 := a1 / b1
2417
-//   r2 := a2 / b2
2418
-//   r3 := a3 / b3
2419
-//
2420
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
2421
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
2422
-{
2423
-   float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
2424
-   float32x4_t recip1 = vmulq_f32(
2425
-       recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
2426
-   return vreinterpretq_m128_f32(
2427
-       vmulq_f32(vreinterpretq_f32_m128(a), recip1));
2428
-}
2429
-
2430
-// Divides the scalar single-precision floating point value of a by b.
2431
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
2432
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
2433
-{
2434
-   float32_t value =
2435
-       vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
2436
-   return vreinterpretq_m128_f32(
2437
-       vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2438
-}
2439
-
2440
-// Computes the approximations of reciprocals of the four single-precision,
2441
-// floating-point values of a.
2442
-// https://msdn.microsoft.com/en-us/library/vstudio/796k1tty(v=vs.100).aspx
2443
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2444
-{
2445
-   float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2446
-   recip = vmulq_f32(recip,
2447
-             vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2448
-   return vreinterpretq_m128_f32(recip);
2449
-}
2450
-
2451
-// Computes the approximations of square roots of the four single-precision,
2452
-// floating-point values of a. First computes reciprocal square roots and then
2453
-// reciprocals of the four values.
2454
-//
2455
-//   r0 := sqrt(a0)
2456
-//   r1 := sqrt(a1)
2457
-//   r2 := sqrt(a2)
2458
-//   r3 := sqrt(a3)
2459
-//
2460
-// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
2461
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2462
-{
2463
-#if defined(__aarch64__)
2464
-   return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2465
-#else
2466
-   float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2467
-   float32x4_t sq = vrecpeq_f32(recipsq);
2468
-   // ??? use step versions of both sqrt and recip for better accuracy?
2469
-   return vreinterpretq_m128_f32(sq);
2470
-#endif
2471
-}
2472
-
2473
-// Computes the approximation of the square root of the scalar single-precision
2474
-// floating point value of in.
2475
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
2476
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2477
-{
2478
-   float32_t value =
2479
-       vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2480
-   return vreinterpretq_m128_f32(
2481
-       vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2482
-}
2483
-
2484
-// Computes the approximations of the reciprocal square roots of the four
2485
-// single-precision floating point values of in.
2486
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
2487
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2488
-{
2489
-   return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
2490
-}
2491
-
2492
-// Compute the approximate reciprocal square root of the lower single-precision
2493
-// (32-bit) floating-point element in a, store the result in the lower element
2494
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
2495
-// dst.
2496
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
2497
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2498
-{
2499
-   return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2500
-}
2501
-
2502
-// Computes the maximums of the four single-precision, floating-point values of
2503
-// a and b.
2504
-// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
2505
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
2506
-{
2507
-   return vreinterpretq_m128_f32(vmaxq_f32(vreinterpretq_f32_m128(a),
2508
-                       vreinterpretq_f32_m128(b)));
2509
-}
2510
-
2511
-// Computes the minima of the four single-precision, floating-point values of a
2512
-// and b.
2513
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
2514
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2515
-{
2516
-   return vreinterpretq_m128_f32(vminq_f32(vreinterpretq_f32_m128(a),
2517
-                       vreinterpretq_f32_m128(b)));
2518
-}
2519
-
2520
-// Computes the maximum of the two lower scalar single-precision floating point
2521
-// values of a and b.
2522
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
2523
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2524
-{
2525
-   float32_t value = vgetq_lane_f32(vmaxq_f32(vreinterpretq_f32_m128(a),
2526
-                          vreinterpretq_f32_m128(b)),
2527
-                    0);
2528
-   return vreinterpretq_m128_f32(
2529
-       vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2530
-}
2531
-
2532
-// Computes the minimum of the two lower scalar single-precision floating point
2533
-// values of a and b.
2534
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
2535
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2536
-{
2537
-   float32_t value = vgetq_lane_f32(vminq_f32(vreinterpretq_f32_m128(a),
2538
-                          vreinterpretq_f32_m128(b)),
2539
-                    0);
2540
-   return vreinterpretq_m128_f32(
2541
-       vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2542
-}
2543
-
2544
-// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
2545
-// 16 unsigned 8-bit integers from b.
2546
-// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
2547
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
2548
-{
2549
-   return vreinterpretq_m128i_u8(
2550
-       vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2551
-}
2552
-
2553
-// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
2554
-// 16 unsigned 8-bit integers from b.
2555
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx
2556
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
2557
-{
2558
-   return vreinterpretq_m128i_u8(
2559
-       vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
2560
-}
2561
-
2562
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
2563
-// signed 16-bit integers from b.
2564
-// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
2565
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
2566
-{
2567
-   return vreinterpretq_m128i_s16(vminq_s16(vreinterpretq_s16_m128i(a),
2568
-                        vreinterpretq_s16_m128i(b)));
2569
-}
2570
-
2571
-// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
2572
-// signed 16-bit integers from b.
2573
-// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
2574
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
2575
-{
2576
-   return vreinterpretq_m128i_s16(vmaxq_s16(vreinterpretq_s16_m128i(a),
2577
-                        vreinterpretq_s16_m128i(b)));
2578
-}
2579
-
2580
-// epi versions of min/max
2581
-// Computes the pairwise maxima of the four signed 32-bit integer values of a
2582
-// and b.
2583
-//
2584
-// A 128-bit parameter that can be defined with the following equations:
2585
-//   r0 := (a0 > b0) ? a0 : b0
2586
-//   r1 := (a1 > b1) ? a1 : b1
2587
-//   r2 := (a2 > b2) ? a2 : b2
2588
-//   r3 := (a3 > b3) ? a3 : b3
2589
-//
2590
-// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
2591
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
2592
-{
2593
-   return vreinterpretq_m128i_s32(vmaxq_s32(vreinterpretq_s32_m128i(a),
2594
-                        vreinterpretq_s32_m128i(b)));
2595
-}
2596
-
2597
-// Computes the pairwise minima of the four signed 32-bit integer values of a
2598
-// and b.
2599
-//
2600
-// A 128-bit parameter that can be defined with the following equations:
2601
-//   r0 := (a0 < b0) ? a0 : b0
2602
-//   r1 := (a1 < b1) ? a1 : b1
2603
-//   r2 := (a2 < b2) ? a2 : b2
2604
-//   r3 := (a3 < b3) ? a3 : b3
2605
-//
2606
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
2607
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
2608
-{
2609
-   return vreinterpretq_m128i_s32(vminq_s32(vreinterpretq_s32_m128i(a),
2610
-                        vreinterpretq_s32_m128i(b)));
2611
-}
2612
-
2613
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
2614
-// integers from b.
2615
-//
2616
-//   r0 := (a0 * b0)[31:16]
2617
-//   r1 := (a1 * b1)[31:16]
2618
-//   ...
2619
-//   r7 := (a7 * b7)[31:16]
2620
-//
2621
-// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
2622
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
2623
-{
2624
-   /* FIXME: issue with large values because of result saturation */
2625
-   // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
2626
-   // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
2627
-   // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
2628
-   int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
2629
-   int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
2630
-   int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
2631
-   int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
2632
-   int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
2633
-   int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
2634
-   uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
2635
-                  vreinterpretq_u16_s32(ab7654));
2636
-   return vreinterpretq_m128i_u16(r.val[1]);
2637
-}
2638
-
2639
-// Computes pairwise add of each argument as single-precision, floating-point
2640
-// values a and b.
2641
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
2642
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
2643
-{
2644
-#if defined(__aarch64__)
2645
-   return vreinterpretq_m128_f32(vpaddq_f32(vreinterpretq_f32_m128(a),
2646
-                        vreinterpretq_f32_m128(b)));
2647
-#else
2648
-   float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
2649
-   float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
2650
-   float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
2651
-   float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
2652
-   return vreinterpretq_m128_f32(
2653
-       vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
2654
-#endif
2655
-}
2656
-
2657
-// Computes pairwise add of each argument as a 16-bit signed or unsigned integer
2658
-// values a and b.
2659
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
2660
-{
2661
-   int16x8_t a = vreinterpretq_s16_m128i(_a);
2662
-   int16x8_t b = vreinterpretq_s16_m128i(_b);
2663
-#if defined(__aarch64__)
2664
-   return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
2665
-#else
2666
-   return vreinterpretq_m128i_s16(
2667
-       vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
2668
-                vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
2669
-#endif
2670
-}
2671
-
2672
-// Computes pairwise difference of each argument as a 16-bit signed or unsigned
2673
-// integer values a and b.
2674
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
2675
-{
2676
-   int32x4_t a = vreinterpretq_s32_m128i(_a);
2677
-   int32x4_t b = vreinterpretq_s32_m128i(_b);
2678
-   // Interleave using vshrn/vmovn
2679
-   // [a0|a2|a4|a6|b0|b2|b4|b6]
2680
-   // [a1|a3|a5|a7|b1|b3|b5|b7]
2681
-   int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
2682
-   int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
2683
-   // Subtract
2684
-   return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
2685
-}
2686
-
2687
-// Computes saturated pairwise add of each argument as a 16-bit signed
2688
-// integer values a and b.
2689
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
2690
-{
2691
-   int32x4_t a = vreinterpretq_s32_m128i(_a);
2692
-   int32x4_t b = vreinterpretq_s32_m128i(_b);
2693
-   // Interleave using vshrn/vmovn
2694
-   // [a0|a2|a4|a6|b0|b2|b4|b6]
2695
-   // [a1|a3|a5|a7|b1|b3|b5|b7]
2696
-   int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
2697
-   int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
2698
-   // Saturated add
2699
-   return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
2700
-}
2701
-
2702
-// Computes saturated pairwise difference of each argument as a 16-bit signed
2703
-// integer values a and b.
2704
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
2705
-{
2706
-   int32x4_t a = vreinterpretq_s32_m128i(_a);
2707
-   int32x4_t b = vreinterpretq_s32_m128i(_b);
2708
-   // Interleave using vshrn/vmovn
2709
-   // [a0|a2|a4|a6|b0|b2|b4|b6]
2710
-   // [a1|a3|a5|a7|b1|b3|b5|b7]
2711
-   int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
2712
-   int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
2713
-   // Saturated subtract
2714
-   return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
2715
-}
2716
-
2717
-// Computes pairwise add of each argument as a 32-bit signed or unsigned integer
2718
-// values a and b.
2719
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
2720
-{
2721
-   int32x4_t a = vreinterpretq_s32_m128i(_a);
2722
-   int32x4_t b = vreinterpretq_s32_m128i(_b);
2723
-   return vreinterpretq_m128i_s32(
2724
-       vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
2725
-                vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
2726
-}
2727
-
2728
-// Computes pairwise difference of each argument as a 32-bit signed or unsigned
2729
-// integer values a and b.
2730
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
2731
-{
2732
-   int64x2_t a = vreinterpretq_s64_m128i(_a);
2733
-   int64x2_t b = vreinterpretq_s64_m128i(_b);
2734
-   // Interleave using vshrn/vmovn
2735
-   // [a0|a2|b0|b2]
2736
-   // [a1|a3|b1|b3]
2737
-   int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
2738
-   int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
2739
-   // Subtract
2740
-   return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
2741
-}
2742
-
2743
-/* Compare operations */
2744
-
2745
-// Compares for less than
2746
-// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
2747
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
2748
-{
2749
-   return vreinterpretq_m128_u32(vcltq_f32(vreinterpretq_f32_m128(a),
2750
-                       vreinterpretq_f32_m128(b)));
2751
-}
2752
-
2753
-// Compares for greater than.
2754
-//
2755
-//   r0 := (a0 > b0) ? 0xffffffff : 0x0
2756
-//   r1 := (a1 > b1) ? 0xffffffff : 0x0
2757
-//   r2 := (a2 > b2) ? 0xffffffff : 0x0
2758
-//   r3 := (a3 > b3) ? 0xffffffff : 0x0
2759
-//
2760
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
2761
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
2762
-{
2763
-   return vreinterpretq_m128_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
2764
-                       vreinterpretq_f32_m128(b)));
2765
-}
2766
-
2767
-// Compares for greater than or equal.
2768
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
2769
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
2770
-{
2771
-   return vreinterpretq_m128_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
2772
-                       vreinterpretq_f32_m128(b)));
2773
-}
2774
-
2775
-// Compares for less than or equal.
2776
-//
2777
-//   r0 := (a0 <= b0) ? 0xffffffff : 0x0
2778
-//   r1 := (a1 <= b1) ? 0xffffffff : 0x0
2779
-//   r2 := (a2 <= b2) ? 0xffffffff : 0x0
2780
-//   r3 := (a3 <= b3) ? 0xffffffff : 0x0
2781
-//
2782
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
2783
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
2784
-{
2785
-   return vreinterpretq_m128_u32(vcleq_f32(vreinterpretq_f32_m128(a),
2786
-                       vreinterpretq_f32_m128(b)));
2787
-}
2788
-
2789
-// Compares for equality.
2790
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
2791
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
2792
-{
2793
-   return vreinterpretq_m128_u32(vceqq_f32(vreinterpretq_f32_m128(a),
2794
-                       vreinterpretq_f32_m128(b)));
2795
-}
2796
-
2797
-// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
2798
-// unsigned 8-bit integers in b for equality.
2799
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
2800
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
2801
-{
2802
-   return vreinterpretq_m128i_u8(
2803
-       vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2804
-}
2805
-
2806
-// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
2807
-// unsigned 16-bit integers in b for equality.
2808
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
2809
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
2810
-{
2811
-   return vreinterpretq_m128i_u16(vceqq_s16(vreinterpretq_s16_m128i(a),
2812
-                        vreinterpretq_s16_m128i(b)));
2813
-}
2814
-
2815
-// Compare packed 32-bit integers in a and b for equality, and store the results
2816
-// in dst
2817
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
2818
-{
2819
-   return vreinterpretq_m128i_u32(vceqq_s32(vreinterpretq_s32_m128i(a),
2820
-                        vreinterpretq_s32_m128i(b)));
2821
-}
2822
-
2823
-// Compare packed 64-bit integers in a and b for equality, and store the results
2824
-// in dst
2825
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
2826
-{
2827
-#if defined(__aarch64__)
2828
-   return vreinterpretq_m128i_u64(vceqq_u64(vreinterpretq_u64_m128i(a),
2829
-                        vreinterpretq_u64_m128i(b)));
2830
-#else
2831
-   // ARMv7 lacks vceqq_u64
2832
-   // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
2833
-   uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a),
2834
-                  vreinterpretq_u32_m128i(b));
2835
-   uint32x4_t swapped = vrev64q_u32(cmp);
2836
-   return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
2837
-#endif
2838
-}
2839
-
2840
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
2841
-// in b for less than.
2842
-// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
2843
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
2844
-{
2845
-   return vreinterpretq_m128i_u8(
2846
-       vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2847
-}
2848
-
2849
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
2850
-// in b for greater than.
2851
-//
2852
-//   r0 := (a0 > b0) ? 0xff : 0x0
2853
-//   r1 := (a1 > b1) ? 0xff : 0x0
2854
-//   ...
2855
-//   r15 := (a15 > b15) ? 0xff : 0x0
2856
-//
2857
-// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
2858
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
2859
-{
2860
-   return vreinterpretq_m128i_u8(
2861
-       vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2862
-}
2863
-
2864
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
2865
-// in b for less than.
2866
-//
2867
-//   r0 := (a0 < b0) ? 0xffff : 0x0
2868
-//   r1 := (a1 < b1) ? 0xffff : 0x0
2869
-//   ...
2870
-//   r7 := (a7 < b7) ? 0xffff : 0x0
2871
-//
2872
-// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
2873
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
2874
-{
2875
-   return vreinterpretq_m128i_u16(vcltq_s16(vreinterpretq_s16_m128i(a),
2876
-                        vreinterpretq_s16_m128i(b)));
2877
-}
2878
-
2879
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
2880
-// in b for greater than.
2881
-//
2882
-//   r0 := (a0 > b0) ? 0xffff : 0x0
2883
-//   r1 := (a1 > b1) ? 0xffff : 0x0
2884
-//   ...
2885
-//   r7 := (a7 > b7) ? 0xffff : 0x0
2886
-//
2887
-// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
2888
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
2889
-{
2890
-   return vreinterpretq_m128i_u16(vcgtq_s16(vreinterpretq_s16_m128i(a),
2891
-                        vreinterpretq_s16_m128i(b)));
2892
-}
2893
-
2894
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
2895
-// in b for less than.
2896
-// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
2897
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
2898
-{
2899
-   return vreinterpretq_m128i_u32(vcltq_s32(vreinterpretq_s32_m128i(a),
2900
-                        vreinterpretq_s32_m128i(b)));
2901
-}
2902
-
2903
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
2904
-// in b for greater than.
2905
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
2906
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
2907
-{
2908
-   return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a),
2909
-                        vreinterpretq_s32_m128i(b)));
2910
-}
2911
-
2912
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
2913
-// in b for greater than.
2914
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
2915
-{
2916
-#if defined(__aarch64__)
2917
-   return vreinterpretq_m128i_u64(vcgtq_s64(vreinterpretq_s64_m128i(a),
2918
-                        vreinterpretq_s64_m128i(b)));
2919
-#else
2920
-   // ARMv7 lacks vcgtq_s64.
2921
-   // This is based off of Clang's SSE2 polyfill:
2922
-   // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
2923
-
2924
-   // Mask the sign bit out since we need a signed AND an unsigned comparison
2925
-   // and it is ugly to try and split them.
2926
-   int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
2927
-   int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
2928
-   int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
2929
-   // Check if a > b
2930
-   int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
2931
-   // Copy upper mask to lower mask
2932
-   // a_hi > b_hi
2933
-   int64x2_t gt_hi = vshrq_n_s64(greater, 63);
2934
-   // Copy lower mask to upper mask
2935
-   // a_lo > b_lo
2936
-   int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
2937
-   // Compare for equality
2938
-   int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
2939
-   // Copy upper mask to lower mask
2940
-   // a_hi == b_hi
2941
-   int64x2_t eq_hi = vshrq_n_s64(equal, 63);
2942
-   // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
2943
-   int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
2944
-   return vreinterpretq_m128i_s64(ret);
2945
-#endif
2946
-}
2947
-
2948
-// Compares the four 32-bit floats in a and b to check if any values are NaN.
2949
-// Ordered compare between each value returns true for "orderable" and false for
2950
-// "not orderable" (NaN).
2951
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
2952
-// also:
2953
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
2954
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
2955
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
2956
-{
2957
-   // Note: NEON does not have ordered compare builtin
2958
-   // Need to compare a eq a and b eq b to check for NaN
2959
-   // Do AND of results to get final
2960
-   uint32x4_t ceqaa =
2961
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
2962
-   uint32x4_t ceqbb =
2963
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
2964
-   return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
2965
-}
2966
-
2967
-// Compares the lower single-precision floating point scalar values of a and b
2968
-// using a less than operation. :
2969
-// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
2970
-// note!! The documentation on MSDN is incorrect!  If either of the values is a
2971
-// NAN the docs say you will get a one, but in fact, it will return a zero!!
2972
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
2973
-{
2974
-   uint32x4_t a_not_nan =
2975
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
2976
-   uint32x4_t b_not_nan =
2977
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
2978
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
2979
-   uint32x4_t a_lt_b =
2980
-       vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
2981
-   return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1
2982
-                                       : 0;
2983
-}
2984
-
2985
-// Compares the lower single-precision floating point scalar values of a and b
2986
-// using a greater than operation. :
2987
-// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
2988
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
2989
-{
2990
-   // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
2991
-   // vreinterpretq_f32_m128(b)), 0);
2992
-   uint32x4_t a_not_nan =
2993
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
2994
-   uint32x4_t b_not_nan =
2995
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
2996
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
2997
-   uint32x4_t a_gt_b =
2998
-       vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
2999
-   return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1
3000
-                                       : 0;
3001
-}
3002
-
3003
-// Compares the lower single-precision floating point scalar values of a and b
3004
-// using a less than or equal operation. :
3005
-// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
3006
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
3007
-{
3008
-   // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
3009
-   // vreinterpretq_f32_m128(b)), 0);
3010
-   uint32x4_t a_not_nan =
3011
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
3012
-   uint32x4_t b_not_nan =
3013
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
3014
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3015
-   uint32x4_t a_le_b =
3016
-       vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
3017
-   return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1
3018
-                                       : 0;
3019
-}
3020
-
3021
-// Compares the lower single-precision floating point scalar values of a and b
3022
-// using a greater than or equal operation. :
3023
-// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
3024
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
3025
-{
3026
-   // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
3027
-   // vreinterpretq_f32_m128(b)), 0);
3028
-   uint32x4_t a_not_nan =
3029
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
3030
-   uint32x4_t b_not_nan =
3031
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
3032
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3033
-   uint32x4_t a_ge_b =
3034
-       vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
3035
-   return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1
3036
-                                       : 0;
3037
-}
3038
-
3039
-// Compares the lower single-precision floating point scalar values of a and b
3040
-// using an equality operation. :
3041
-// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
3042
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
3043
-{
3044
-   // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
3045
-   // vreinterpretq_f32_m128(b)), 0);
3046
-   uint32x4_t a_not_nan =
3047
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
3048
-   uint32x4_t b_not_nan =
3049
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
3050
-   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3051
-   uint32x4_t a_eq_b =
3052
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
3053
-   return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1
3054
-                                       : 0;
3055
-}
3056
-
3057
-// Compares the lower single-precision floating point scalar values of a and b
3058
-// using an inequality operation. :
3059
-// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
3060
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
3061
-{
3062
-   // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
3063
-   // vreinterpretq_f32_m128(b)), 0);
3064
-   uint32x4_t a_not_nan =
3065
-       vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
3066
-   uint32x4_t b_not_nan =
3067
-       vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
3068
-   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
3069
-   uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(vreinterpretq_f32_m128(a),
3070
-                        vreinterpretq_f32_m128(b)));
3071
-   return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
3072
-}
3073
-
3074
-// according to the documentation, these intrinsics behave the same as the
3075
-// non-'u' versions.  We'll just alias them here.
3076
-#define _mm_ucomilt_ss _mm_comilt_ss
3077
-#define _mm_ucomile_ss _mm_comile_ss
3078
-#define _mm_ucomigt_ss _mm_comigt_ss
3079
-#define _mm_ucomige_ss _mm_comige_ss
3080
-#define _mm_ucomieq_ss _mm_comieq_ss
3081
-#define _mm_ucomineq_ss _mm_comineq_ss
3082
-
3083
-/* Conversions */
3084
-
3085
-// Converts the four single-precision, floating-point values of a to signed
3086
-// 32-bit integer values using truncate.
3087
-// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
3088
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
3089
-{
3090
-   return vreinterpretq_m128i_s32(
3091
-       vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
3092
-}
3093
-
3094
-// Converts the four signed 32-bit integer values of a to single-precision,
3095
-// floating-point values
3096
-// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
3097
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3098
-{
3099
-   return vreinterpretq_m128_f32(
3100
-       vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3101
-}
3102
-
3103
-// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
3104
-// unsigned 16-bit integers.
3105
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
3106
-{
3107
-   uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
3108
-   uint16x8_t u16x8 =
3109
-       vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
3110
-   return vreinterpretq_m128i_u16(u16x8);
3111
-}
3112
-
3113
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
3114
-// unsigned 32-bit integers.
3115
-// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
3116
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
3117
-{
3118
-   uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
3119
-   uint16x8_t u16x8 =
3120
-       vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
3121
-   uint32x4_t u32x4 =
3122
-       vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
3123
-   return vreinterpretq_m128i_u32(u32x4);
3124
-}
3125
-
3126
-// Converts the two unsigned 8-bit integers in the lower 16 bits to two
3127
-// unsigned 64-bit integers.
3128
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
3129
-{
3130
-   uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
3131
-   uint16x8_t u16x8 =
3132
-       vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
3133
-   uint32x4_t u32x4 =
3134
-       vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
3135
-   uint64x2_t u64x2 =
3136
-       vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
3137
-   return vreinterpretq_m128i_u64(u64x2);
3138
-}
3139
-
3140
-// Converts the four unsigned 8-bit integers in the lower 16 bits to four
3141
-// unsigned 32-bit integers.
3142
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
3143
-{
3144
-   int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
3145
-   int16x8_t s16x8 =
3146
-       vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
3147
-   return vreinterpretq_m128i_s16(s16x8);
3148
-}
3149
-
3150
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
3151
-// unsigned 32-bit integers.
3152
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
3153
-{
3154
-   int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
3155
-   int16x8_t s16x8 =
3156
-       vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
3157
-   int32x4_t s32x4 =
3158
-       vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
3159
-   return vreinterpretq_m128i_s32(s32x4);
3160
-}
3161
-
3162
-// Converts the two signed 8-bit integers in the lower 32 bits to four
3163
-// signed 64-bit integers.
3164
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
3165
-{
3166
-   int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
3167
-   int16x8_t s16x8 =
3168
-       vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
3169
-   int32x4_t s32x4 =
3170
-       vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
3171
-   int64x2_t s64x2 =
3172
-       vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
3173
-   return vreinterpretq_m128i_s64(s64x2);
3174
-}
3175
-
3176
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed
3177
-// 32-bit integers.
3178
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
3179
-{
3180
-   return vreinterpretq_m128i_s32(
3181
-       vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
3182
-}
3183
-
3184
-// Converts the two signed 16-bit integers in the lower 32 bits two signed
3185
-// 32-bit integers.
3186
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
3187
-{
3188
-   int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
3189
-   int32x4_t s32x4 =
3190
-       vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
3191
-   int64x2_t s64x2 =
3192
-       vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
3193
-   return vreinterpretq_m128i_s64(s64x2);
3194
-}
3195
-
3196
-// Converts the four unsigned 16-bit integers in the lower 64 bits to four
3197
-// unsigned 32-bit integers.
3198
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
3199
-{
3200
-   return vreinterpretq_m128i_u32(
3201
-       vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
3202
-}
3203
-
3204
-// Converts the two unsigned 16-bit integers in the lower 32 bits to two
3205
-// unsigned 64-bit integers.
3206
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
3207
-{
3208
-   uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
3209
-   uint32x4_t u32x4 =
3210
-       vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
3211
-   uint64x2_t u64x2 =
3212
-       vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
3213
-   return vreinterpretq_m128i_u64(u64x2);
3214
-}
3215
-
3216
-// Converts the two unsigned 32-bit integers in the lower 64 bits to two
3217
-// unsigned 64-bit integers.
3218
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
3219
-{
3220
-   return vreinterpretq_m128i_u64(
3221
-       vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
3222
-}
3223
-
3224
-// Converts the two signed 32-bit integers in the lower 64 bits to two signed
3225
-// 64-bit integers.
3226
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
3227
-{
3228
-   return vreinterpretq_m128i_s64(
3229
-       vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
3230
-}
3231
-
3232
-// Converts the four single-precision, floating-point values of a to signed
3233
-// 32-bit integer values.
3234
-//
3235
-//   r0 := (int) a0
3236
-//   r1 := (int) a1
3237
-//   r2 := (int) a2
3238
-//   r3 := (int) a3
3239
-//
3240
-// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
3241
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3242
-// does not support! It is supported on ARMv8-A however.
3243
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3244
-{
3245
-#if defined(__aarch64__)
3246
-   return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3247
-#else
3248
-   uint32x4_t signmask = vdupq_n_u32(0x80000000);
3249
-   float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3250
-                    vdupq_n_f32(0.5f)); /* +/- 0.5 */
3251
-   int32x4_t r_normal =
3252
-       vcvtq_s32_f32(vaddq_f32(vreinterpretq_f32_m128(a),
3253
-                   half)); /* round to integer: [a + 0.5]*/
3254
-   int32x4_t r_trunc = vcvtq_s32_f32(
3255
-       vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3256
-   int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3257
-       vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3258
-   int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3259
-                    vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3260
-   float32x4_t delta = vsubq_f32(
3261
-       vreinterpretq_f32_m128(a),
3262
-       vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3263
-   uint32x4_t is_delta_half =
3264
-       vceqq_f32(delta, half); /* delta == +/- 0.5 */
3265
-   return vreinterpretq_m128i_s32(
3266
-       vbslq_s32(is_delta_half, r_even, r_normal));
3267
-#endif
3268
-}
3269
-
3270
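The ARMv7 fallback above has to reconstruct the x86 default rounding mode (round half to even) by hand: round normally, truncate, and switch to the even neighbour only when the fractional part is exactly 0.5. Below is a scalar sketch of the same rule, assuming plain C and two's-complement ints; the helper name is illustrative only.

    #include <stdio.h>

    /* Round half to even, the way the NEON fallback above does it:
     * compute the normally rounded value and the truncated value, then
     * on an exact .5 tie pick the even neighbour instead. */
    static int round_half_to_even(float a)
    {
        float half   = (a < 0.0f) ? -0.5f : 0.5f;
        int   normal = (int)(a + half);         /* round half away from zero */
        int   trunc  = (int)a;                  /* round toward zero         */
        float delta  = a - (float)trunc;
        if (delta == 0.5f || delta == -0.5f)    /* exact tie                 */
            return (trunc + (trunc > 0)) & ~1;  /* nearest even integer      */
        return normal;
    }

    int main(void)
    {
        printf("%d %d %d %d\n",
               round_half_to_even(0.5f),   /* 0  */
               round_half_to_even(1.5f),   /* 2  */
               round_half_to_even(2.5f),   /* 2  */
               round_half_to_even(-2.5f)); /* -2 */
        return 0;
    }
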
-// Moves the least significant 32 bits of a to a 32-bit integer.
3271
-// https://msdn.microsoft.com/en-us/library/5z7a9642%28v=vs.90%29.aspx
3272
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
3273
-{
3274
-   return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3275
-}
3276
-
3277
-// Extracts the low order 64-bit integer from the parameter.
3278
-// https://msdn.microsoft.com/en-us/library/bb531384(v=vs.120).aspx
3279
-FORCE_INLINE uint64_t _mm_cvtsi128_si64(__m128i a)
3280
-{
3281
-   return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
3282
-}
3283
-
3284
-// Moves 32-bit integer a to the least significant 32 bits of an __m128 object,
3285
-// zero extending the upper bits.
3286
-//
3287
-//   r0 := a
3288
-//   r1 := 0x0
3289
-//   r2 := 0x0
3290
-//   r3 := 0x0
3291
-//
3292
-// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
3293
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
3294
-{
3295
-   return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
3296
-}
3297
-
3298
-// Moves 64-bit integer a to the least significant 64 bits of an __m128 object,
3299
-// zero extending the upper bits.
3300
-//
3301
-//   r0 := a
3302
-//   r1 := 0x0
3303
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
3304
-{
3305
-   return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
3306
-}
3307
-
3308
-// Applies a type cast to reinterpret four 32-bit floating point values passed
3309
-// in as a 128-bit parameter as packed 32-bit integers.
3310
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
3311
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3312
-{
3313
-   return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3314
-}
3315
-
3316
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
3317
-// 128-bit parameter as packed 32-bit floating point values.
3318
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
3319
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3320
-{
3321
-   return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3322
-}
3323
-
3324
-// Loads 128-bit value. :
3325
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
3326
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
3327
-{
3328
-   return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
3329
-}
3330
-
3331
-// Loads 128-bit value. :
3332
-// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
3333
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
3334
-{
3335
-   return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *)p));
3336
-}
3337
-
3338
-// _mm_lddqu_si128 functions the same as _mm_loadu_si128.
3339
-#define _mm_lddqu_si128 _mm_loadu_si128
3340
-
3341
-/* Miscellaneous Operations */
3342
-
3343
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
3344
-// in the sign bit.
3345
-//
3346
-//   r0 := a0 >> count
3347
-//   r1 := a1 >> count
3348
-//   ...
3349
-//   r7 := a7 >> count
3350
-//
3351
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
3352
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
3353
-{
3354
-   int64_t c = (int64_t)vget_low_s64((int64x2_t)count);
3355
-   if (c > 15)
3356
-       return _mm_cmplt_epi16(a, _mm_setzero_si128());
3357
-   return vreinterpretq_m128i_s16(
3358
-       vshlq_s16((int16x8_t)a, vdupq_n_s16(-c)));
3359
-}
3360
-
3361
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
3362
-// in the sign bit.
3363
-//
3364
-//   r0 := a0 >> count
3365
-//   r1 := a1 >> count
3366
-//   r2 := a2 >> count
3367
-//   r3 := a3 >> count
3368
-//
3369
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
3370
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
3371
-{
3372
-   int64_t c = (int64_t)vget_low_s64((int64x2_t)count);
3373
-   if (c > 31)
3374
-       return _mm_cmplt_epi32(a, _mm_setzero_si128());
3375
-   return vreinterpretq_m128i_s32(
3376
-       vshlq_s32((int32x4_t)a, vdupq_n_s32(-c)));
3377
-}
3378
-
3379
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
3380
-// saturates.
3381
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
3382
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
3383
-{
3384
-   return vreinterpretq_m128i_s8(
3385
-       vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
3386
-               vqmovn_s16(vreinterpretq_s16_m128i(b))));
3387
-}
3388
-
3389
-// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned
3390
-// integers and saturates.
3391
-//
3392
-//   r0 := UnsignedSaturate(a0)
3393
-//   r1 := UnsignedSaturate(a1)
3394
-//   ...
3395
-//   r7 := UnsignedSaturate(a7)
3396
-//   r8 := UnsignedSaturate(b0)
3397
-//   r9 := UnsignedSaturate(b1)
3398
-//   ...
3399
-//   r15 := UnsignedSaturate(b7)
3400
-//
3401
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
3402
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
3403
-{
3404
-   return vreinterpretq_m128i_u8(
3405
-       vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
3406
-               vqmovun_s16(vreinterpretq_s16_m128i(b))));
3407
-}
3408
-
3409
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
3410
-// and saturates.
3411
-//
3412
-//   r0 := SignedSaturate(a0)
3413
-//   r1 := SignedSaturate(a1)
3414
-//   r2 := SignedSaturate(a2)
3415
-//   r3 := SignedSaturate(a3)
3416
-//   r4 := SignedSaturate(b0)
3417
-//   r5 := SignedSaturate(b1)
3418
-//   r6 := SignedSaturate(b2)
3419
-//   r7 := SignedSaturate(b3)
3420
-//
3421
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
3422
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
3423
-{
3424
-   return vreinterpretq_m128i_s16(
3425
-       vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
3426
-                vqmovn_s32(vreinterpretq_s32_m128i(b))));
3427
-}
3428
-
3429
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
3430
-// integers and saturates.
3431
-//
3432
-//   r0 := UnsignedSaturate(a0)
3433
-//   r1 := UnsignedSaturate(a1)
3434
-//   r2 := UnsignedSaturate(a2)
3435
-//   r3 := UnsignedSaturate(a3)
3436
-//   r4 := UnsignedSaturate(b0)
3437
-//   r5 := UnsignedSaturate(b1)
3438
-//   r6 := UnsignedSaturate(b2)
3439
-//   r7 := UnsignedSaturate(b3)
3440
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
3441
-{
3442
-   return vreinterpretq_m128i_u16(
3443
-       vcombine_u16(vqmovn_u32(vreinterpretq_u32_m128i(a)),
3444
-                vqmovn_u32(vreinterpretq_u32_m128i(b))));
3445
-}
3446
-
3447
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
3448
-// 8 signed or unsigned 8-bit integers in b.
3449
-//
3450
-//   r0 := a0
3451
-//   r1 := b0
3452
-//   r2 := a1
3453
-//   r3 := b1
3454
-//   ...
3455
-//   r14 := a7
3456
-//   r15 := b7
3457
-//
3458
-// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
3459
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
3460
-{
3461
-#if defined(__aarch64__)
3462
-   return vreinterpretq_m128i_s8(vzip1q_s8(vreinterpretq_s8_m128i(a),
3463
-                       vreinterpretq_s8_m128i(b)));
3464
-#else
3465
-   int8x8_t a1 =
3466
-       vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
3467
-   int8x8_t b1 =
3468
-       vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
3469
-   int8x8x2_t result = vzip_s8(a1, b1);
3470
-   return vreinterpretq_m128i_s8(
3471
-       vcombine_s8(result.val[0], result.val[1]));
3472
-#endif
3473
-}
3474
-
3475
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
3476
-// lower 4 signed or unsigned 16-bit integers in b.
3477
-//
3478
-//   r0 := a0
3479
-//   r1 := b0
3480
-//   r2 := a1
3481
-//   r3 := b1
3482
-//   r4 := a2
3483
-//   r5 := b2
3484
-//   r6 := a3
3485
-//   r7 := b3
3486
-//
3487
-// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
3488
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
3489
-{
3490
-#if defined(__aarch64__)
3491
-   return vreinterpretq_m128i_s16(vzip1q_s16(vreinterpretq_s16_m128i(a),
3492
-                         vreinterpretq_s16_m128i(b)));
3493
-#else
3494
-   int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
3495
-   int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
3496
-   int16x4x2_t result = vzip_s16(a1, b1);
3497
-   return vreinterpretq_m128i_s16(
3498
-       vcombine_s16(result.val[0], result.val[1]));
3499
-#endif
3500
-}
3501
-
3502
-// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the
3503
-// lower 2 signed or unsigned 32 - bit integers in b.
3504
-//
3505
-//   r0 := a0
3506
-//   r1 := b0
3507
-//   r2 := a1
3508
-//   r3 := b1
3509
-//
3510
-// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
3511
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
3512
-{
3513
-#if defined(__aarch64__)
3514
-   return vreinterpretq_m128i_s32(vzip1q_s32(vreinterpretq_s32_m128i(a),
3515
-                         vreinterpretq_s32_m128i(b)));
3516
-#else
3517
-   int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
3518
-   int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
3519
-   int32x2x2_t result = vzip_s32(a1, b1);
3520
-   return vreinterpretq_m128i_s32(
3521
-       vcombine_s32(result.val[0], result.val[1]));
3522
-#endif
3523
-}
3524
-
3525
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
3526
-{
3527
-   int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
3528
-   int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
3529
-   return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
3530
-}
3531
-
3532
-// Selects and interleaves the lower two single-precision, floating-point values
3533
-// from a and b.
3534
-//
3535
-//   r0 := a0
3536
-//   r1 := b0
3537
-//   r2 := a1
3538
-//   r3 := b1
3539
-//
3540
-// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
3541
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
3542
-{
3543
-#if defined(__aarch64__)
3544
-   return vreinterpretq_m128_f32(vzip1q_f32(vreinterpretq_f32_m128(a),
3545
-                        vreinterpretq_f32_m128(b)));
3546
-#else
3547
-   float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
3548
-   float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
3549
-   float32x2x2_t result = vzip_f32(a1, b1);
3550
-   return vreinterpretq_m128_f32(
3551
-       vcombine_f32(result.val[0], result.val[1]));
3552
-#endif
3553
-}
3554
-
3555
-// Selects and interleaves the upper two single-precision, floating-point values
3556
-// from a and b.
3557
-//
3558
-//   r0 := a2
3559
-//   r1 := b2
3560
-//   r2 := a3
3561
-//   r3 := b3
3562
-//
3563
-// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
3564
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
3565
-{
3566
-#if defined(__aarch64__)
3567
-   return vreinterpretq_m128_f32(vzip2q_f32(vreinterpretq_f32_m128(a),
3568
-                        vreinterpretq_f32_m128(b)));
3569
-#else
3570
-   float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
3571
-   float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
3572
-   float32x2x2_t result = vzip_f32(a1, b1);
3573
-   return vreinterpretq_m128_f32(
3574
-       vcombine_f32(result.val[0], result.val[1]));
3575
-#endif
3576
-}
3577
-
3578
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
3579
-// 8 signed or unsigned 8-bit integers in b.
3580
-//
3581
-//   r0 := a8
3582
-//   r1 := b8
3583
-//   r2 := a9
3584
-//   r3 := b9
3585
-//   ...
3586
-//   r14 := a15
3587
-//   r15 := b15
3588
-//
3589
-// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
3590
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
3591
-{
3592
-#if defined(__aarch64__)
3593
-   return vreinterpretq_m128i_s8(vzip2q_s8(vreinterpretq_s8_m128i(a),
3594
-                       vreinterpretq_s8_m128i(b)));
3595
-#else
3596
-   int8x8_t a1 =
3597
-       vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
3598
-   int8x8_t b1 =
3599
-       vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
3600
-   int8x8x2_t result = vzip_s8(a1, b1);
3601
-   return vreinterpretq_m128i_s8(
3602
-       vcombine_s8(result.val[0], result.val[1]));
3603
-#endif
3604
-}
3605
-
3606
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
3607
-// upper 4 signed or unsigned 16-bit integers in b.
3608
-//
3609
-//   r0 := a4
3610
-//   r1 := b4
3611
-//   r2 := a5
3612
-//   r3 := b5
3613
-//   r4 := a6
3614
-//   r5 := b6
3615
-//   r6 := a7
3616
-//   r7 := b7
3617
-//
3618
-// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
3619
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
3620
-{
3621
-#if defined(__aarch64__)
3622
-   return vreinterpretq_m128i_s16(vzip2q_s16(vreinterpretq_s16_m128i(a),
3623
-                         vreinterpretq_s16_m128i(b)));
3624
-#else
3625
-   int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
3626
-   int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
3627
-   int16x4x2_t result = vzip_s16(a1, b1);
3628
-   return vreinterpretq_m128i_s16(
3629
-       vcombine_s16(result.val[0], result.val[1]));
3630
-#endif
3631
-}
3632
-
3633
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
3634
-// upper 2 signed or unsigned 32-bit integers in b.
3635
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
3636
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
3637
-{
3638
-#if defined(__aarch64__)
3639
-   return vreinterpretq_m128i_s32(vzip2q_s32(vreinterpretq_s32_m128i(a),
3640
-                         vreinterpretq_s32_m128i(b)));
3641
-#else
3642
-   int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
3643
-   int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
3644
-   int32x2x2_t result = vzip_s32(a1, b1);
3645
-   return vreinterpretq_m128i_s32(
3646
-       vcombine_s32(result.val[0], result.val[1]));
3647
-#endif
3648
-}
3649
-
3650
-// Interleaves the upper signed or unsigned 64-bit integer in a with the
3651
-// upper signed or unsigned 64-bit integer in b.
3652
-//
3653
-//   r0 := a1
3654
-//   r1 := b1
3655
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
3656
-{
3657
-   int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
3658
-   int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
3659
-   return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
3660
-}
3661
-
3662
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
3663
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
3664
-//
3665
-//   index[2:0] := 0
3666
-//   min[15:0] := a[15:0]
3667
-//   FOR j := 0 to 7
3668
-//       i := j*16
3669
-//       IF a[i+15:i] < min[15:0]
3670
-//           index[2:0] := j
3671
-//           min[15:0] := a[i+15:i]
3672
-//       FI
3673
-//   ENDFOR
3674
-//   dst[15:0] := min[15:0]
3675
-//   dst[18:16] := index[2:0]
3676
-//   dst[127:19] := 0
3677
-//
3678
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16&expand=3789
3679
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
3680
-{
3681
-   __m128i dst;
3682
-   uint16_t min, idx = 0;
3683
-   // Find the minimum value
3684
-#if defined(__aarch64__)
3685
-   min = vminvq_u16(vreinterpretq_u16_m128i(a));
3686
-#else
3687
-   __m64i tmp;
3688
-   tmp = vreinterpret_m64i_u16(
3689
-       vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
3690
-            vget_high_u16(vreinterpretq_u16_m128i(a))));
3691
-   tmp = vreinterpret_m64i_u16(vpmin_u16(vreinterpret_u16_m64i(tmp),
3692
-                         vreinterpret_u16_m64i(tmp)));
3693
-   tmp = vreinterpret_m64i_u16(vpmin_u16(vreinterpret_u16_m64i(tmp),
3694
-                         vreinterpret_u16_m64i(tmp)));
3695
-   min = vget_lane_u16(vreinterpret_u16_m64i(tmp), 0);
3696
-#endif
3697
-   // Get the index of the minimum value
3698
-   int i;
3699
-   for (i = 0; i < 8; i++) {
3700
-       if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
3701
-           idx = (uint16_t)i;
3702
-           break;
3703
-       }
3704
-       a = _mm_srli_si128(a, 2);
3705
-   }
3706
-   // Generate result
3707
-   dst = _mm_setzero_si128();
3708
-   dst = vreinterpretq_m128i_u16(
3709
-       vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
3710
-   dst = vreinterpretq_m128i_u16(
3711
-       vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
3712
-   return dst;
3713
-}
3714
-
3715
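For reference, the horizontal minimum-plus-index operation emulated above reduces to a plain scan in scalar code. A minimal sketch with illustrative names (minpos_u16 is not an sse2neon symbol):

    #include <stdint.h>
    #include <stdio.h>

    /* Scan eight 16-bit lanes, keep the smallest value and the index of
     * its first occurrence; the remaining result bits stay zero. */
    static void minpos_u16(const uint16_t lanes[8], uint16_t *min, uint16_t *idx)
    {
        *min = lanes[0];
        *idx = 0;
        for (int i = 1; i < 8; i++) {
            if (lanes[i] < *min) {
                *min = lanes[i];
                *idx = (uint16_t)i;
            }
        }
    }

    int main(void)
    {
        uint16_t v[8] = {9, 4, 7, 4, 12, 3, 3, 8};
        uint16_t min, idx;
        minpos_u16(v, &min, &idx);
        printf("min=%u idx=%u\n", min, idx); /* min=3 idx=5 */
        return 0;
    }
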
-// shift to right
3716
-// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
3717
-// http://blog.csdn.net/hemmingway/article/details/44828303
3718
-// Clang requires a macro here, as it is extremely picky about c being a
3719
-// literal.
3720
-#define _mm_alignr_epi8(a, b, c) \
3721
-   ((__m128i)vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
3722
-
3723
-// Extracts the selected signed or unsigned 8-bit integer from a and zero
3724
-// extends.
3725
-// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
3726
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
3727
-
3728
-// Inserts the least significant 8 bits of b into the selected 8-bit integer
3729
-// of a.
3730
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
3731
-//                                      __constrange(0,16) int imm)
3732
-#define _mm_insert_epi8(a, b, imm)                                             \
3733
-   __extension__({                                                        \
3734
-       vreinterpretq_m128i_s8(                                        \
3735
-           vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
3736
-   })
3737
-
3738
-// Extracts the selected signed or unsigned 16-bit integer from a and zero
3739
-// extends.
3740
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
3741
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
3742
-#define _mm_extract_epi16(a, imm) \
3743
-   vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
3744
-
3745
-// Inserts the least significant 16 bits of b into the selected 16-bit integer
3746
-// of a.
3747
-// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
3748
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
3749
-//                                       __constrange(0,8) int imm)
3750
-#define _mm_insert_epi16(a, b, imm)                               \
3751
-   __extension__({                                           \
3752
-       vreinterpretq_m128i_s16(vsetq_lane_s16(           \
3753
-           (b), vreinterpretq_s16_m128i(a), (imm))); \
3754
-   })
3755
-
3756
-// Extracts the selected signed or unsigned 32-bit integer from a and zero
3757
-// extends.
3758
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
3759
-#define _mm_extract_epi32(a, imm) \
3760
-   vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
3761
-
3762
-// Extracts the selected single-precision (32-bit) floating-point from a.
3763
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
3764
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
3765
-
3766
-// Inserts the least significant 32 bits of b into the selected 32-bit integer
3767
-// of a.
3768
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
3769
-//                                       __constrange(0,4) int imm)
3770
-#define _mm_insert_epi32(a, b, imm)                               \
3771
-   __extension__({                                           \
3772
-       vreinterpretq_m128i_s32(vsetq_lane_s32(           \
3773
-           (b), vreinterpretq_s32_m128i(a), (imm))); \
3774
-   })
3775
-
3776
-// Extracts the selected signed or unsigned 64-bit integer from a and zero
3777
-// extends.
3778
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
3779
-#define _mm_extract_epi64(a, imm) \
3780
-   vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
3781
-
3782
-// Inserts the least significant 64 bits of b into the selected 64-bit integer
3783
-// of a.
3784
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
3785
-//                                       __constrange(0,2) int imm)
3786
-#define _mm_insert_epi64(a, b, imm)                               \
3787
-   __extension__({                                           \
3788
-       vreinterpretq_m128i_s64(vsetq_lane_s64(           \
3789
-           (b), vreinterpretq_s64_m128i(a), (imm))); \
3790
-   })
3791
-
3792
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
3793
-// return that count in dst.
3794
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
3795
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
3796
-{
3797
-#if defined(__aarch64__)
3798
-#if __has_builtin(__builtin_popcount)
3799
-   return __builtin_popcount(a);
3800
-#else
3801
-   return (int)vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t)a)));
3802
-#endif
3803
-#else
3804
-   uint32_t count = 0;
3805
-   uint8x8_t input_val, count8x8_val;
3806
-   uint16x4_t count16x4_val;
3807
-   uint32x2_t count32x2_val;
3808
-
3809
-   input_val = vld1_u8((uint8_t *)&a);
3810
-   count8x8_val = vcnt_u8(input_val);
3811
-   count16x4_val = vpaddl_u8(count8x8_val);
3812
-   count32x2_val = vpaddl_u16(count16x4_val);
3813
-
3814
-   vst1_u32(&count, count32x2_val);
3815
-   return count;
3816
-#endif
3817
-}
3818
-
3819
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
3820
-// return that count in dst.
3821
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
3822
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
3823
-{
3824
-#if defined(__aarch64__)
3825
-#if __has_builtin(__builtin_popcountll)
3826
-   return __builtin_popcountll(a);
3827
-#else
3828
-   return (int64_t)vaddlv_u8(vcnt_u8(vcreate_u8(a)));
3829
-#endif
3830
-#else
3831
-   uint64_t count = 0;
3832
-   uint8x8_t input_val, count8x8_val;
3833
-   uint16x4_t count16x4_val;
3834
-   uint32x2_t count32x2_val;
3835
-   uint64x1_t count64x1_val;
3836
-
3837
-   input_val = vld1_u8((uint8_t *)&a);
3838
-   count8x8_val = vcnt_u8(input_val);
3839
-   count16x4_val = vpaddl_u8(count8x8_val);
3840
-   count32x2_val = vpaddl_u16(count16x4_val);
3841
-   count64x1_val = vpaddl_u32(count32x2_val);
3842
-   vst1_u64(&count, count64x1_val);
3843
-   return count;
3844
-#endif
3845
-}
3846
-
3847
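The NEON paths above sum per-byte bit counts with vcnt/vpaddl; as a portable cross-check, the same result can be obtained with Kernighan's loop. A small sketch, plain C only, with an illustrative function name:

    #include <stdint.h>
    #include <stdio.h>

    /* Each iteration clears the lowest set bit, so the loop runs once
     * per set bit -- a simple scalar reference for the popcount above. */
    static int popcount32(uint32_t x)
    {
        int n = 0;
        while (x) {
            x &= x - 1;
            n++;
        }
        return n;
    }

    int main(void)
    {
        printf("%d %d\n", popcount32(0xFFu), popcount32(0x80000001u)); /* 8 2 */
        return 0;
    }
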
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
3848
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
3849
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
3850
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS&expand=5949
3851
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)   \
3852
-   do {                                        \
3853
-       __m128 tmp0, tmp1, tmp2, tmp3;      \
3854
-       tmp0 = _mm_unpacklo_ps(row0, row1); \
3855
-       tmp2 = _mm_unpacklo_ps(row2, row3); \
3856
-       tmp1 = _mm_unpackhi_ps(row0, row1); \
3857
-       tmp3 = _mm_unpackhi_ps(row2, row3); \
3858
-       row0 = _mm_movelh_ps(tmp0, tmp2);   \
3859
-       row1 = _mm_movehl_ps(tmp2, tmp0);   \
3860
-       row2 = _mm_movelh_ps(tmp1, tmp3);   \
3861
-       row3 = _mm_movehl_ps(tmp3, tmp1);   \
3862
-   } while (0)
3863
-
3864
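The shuffle sequence in _MM_TRANSPOSE4_PS above is an in-register 4x4 transpose; the scalar equivalent below may help when checking its effect. The helper is illustrative and not part of the removed header.

    #include <stdio.h>

    /* Rows become columns: the unpacklo/unpackhi + movelh/movehl chain
     * above expresses exactly this swap with 4-lane shuffles. */
    static void transpose4x4(float m[4][4])
    {
        for (int r = 0; r < 4; r++)
            for (int c = r + 1; c < 4; c++) {
                float t = m[r][c];
                m[r][c] = m[c][r];
                m[c][r] = t;
            }
    }

    int main(void)
    {
        float m[4][4] = {{1, 2, 3, 4}, {5, 6, 7, 8},
                         {9, 10, 11, 12}, {13, 14, 15, 16}};
        transpose4x4(m);
        printf("%.0f %.0f %.0f %.0f\n",
               m[0][0], m[0][1], m[0][2], m[0][3]); /* 1 5 9 13 */
        return 0;
    }
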
-/* Crypto Extensions */
3865
-
3866
-#if defined(__ARM_FEATURE_CRYPTO)
3867
-// Wraps vmull_p64
3868
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
3869
-{
3870
-   poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
3871
-   poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
3872
-   return vreinterpretq_u64_p128(vmull_p64(a, b));
3873
-}
3874
-#else // ARMv7 polyfill
3875
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
3876
-//
3877
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
3878
-// 64-bit->128-bit polynomial multiply.
3879
-//
3880
-// It needs some work and is somewhat slow, but it is still faster than all
3881
-// known scalar methods.
3882
-//
3883
-// Algorithm adapted to C from
3884
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
3885
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
3886
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
3887
-// (https://hal.inria.fr/hal-01506572)
3888
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
3889
-{
3890
-   poly8x8_t a = vreinterpret_p8_u64(_a);
3891
-   poly8x8_t b = vreinterpret_p8_u64(_b);
3892
-
3893
-   // Masks
3894
-   uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
3895
-                   vcreate_u8(0x00000000ffffffff));
3896
-   uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
3897
-                   vcreate_u8(0x0000000000000000));
3898
-
3899
-   // Do the multiplies, rotating with vext to get all combinations
3900
-   uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
3901
-   uint8x16_t e = vreinterpretq_u8_p16(
3902
-       vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
3903
-   uint8x16_t f = vreinterpretq_u8_p16(
3904
-       vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
3905
-   uint8x16_t g = vreinterpretq_u8_p16(
3906
-       vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
3907
-   uint8x16_t h = vreinterpretq_u8_p16(
3908
-       vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
3909
-   uint8x16_t i = vreinterpretq_u8_p16(
3910
-       vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
3911
-   uint8x16_t j = vreinterpretq_u8_p16(
3912
-       vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
3913
-   uint8x16_t k = vreinterpretq_u8_p16(
3914
-       vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4
3915
-
3916
-   // Add cross products
3917
-   uint8x16_t l = veorq_u8(e, f); // L = E + F
3918
-   uint8x16_t m = veorq_u8(g, h); // M = G + H
3919
-   uint8x16_t n = veorq_u8(i, j); // N = I + J
3920
-
3921
-   // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
3922
-   // instructions.
3923
-#if defined(__aarch64__)
3924
-   uint8x16_t lm_p0 = vreinterpretq_u8_u64(
3925
-       vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
3926
-   uint8x16_t lm_p1 = vreinterpretq_u8_u64(
3927
-       vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
3928
-   uint8x16_t nk_p0 = vreinterpretq_u8_u64(
3929
-       vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
3930
-   uint8x16_t nk_p1 = vreinterpretq_u8_u64(
3931
-       vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
3932
-#else
3933
-   uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
3934
-   uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
3935
-   uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
3936
-   uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
3937
-#endif
3938
-   // t0 = (L) (P0 + P1) << 8
3939
-   // t1 = (M) (P2 + P3) << 16
3940
-   uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
3941
-   uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
3942
-   uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
3943
-
3944
-   // t2 = (N) (P4 + P5) << 24
3945
-   // t3 = (K) (P6 + P7) << 32
3946
-   uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
3947
-   uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
3948
-   uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
3949
-
3950
-   // De-interleave
3951
-#if defined(__aarch64__)
3952
-   uint8x16_t t0 = vreinterpretq_u8_u64(vuzp1q_u64(
3953
-       vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
3954
-   uint8x16_t t1 = vreinterpretq_u8_u64(vuzp2q_u64(
3955
-       vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
3956
-   uint8x16_t t2 = vreinterpretq_u8_u64(vuzp1q_u64(
3957
-       vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
3958
-   uint8x16_t t3 = vreinterpretq_u8_u64(vuzp2q_u64(
3959
-       vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
3960
-#else
3961
-   uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
3962
-   uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
3963
-   uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
3964
-   uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
3965
-#endif
3966
-   // Shift the cross products
3967
-   uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
3968
-   uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
3969
-   uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
3970
-   uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
3971
-
3972
-   // Accumulate the products
3973
-   uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
3974
-   uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
3975
-   uint8x16_t mix = veorq_u8(d, cross1);
3976
-   uint8x16_t r = veorq_u8(mix, cross2);
3977
-   return vreinterpretq_u64_u8(r);
3978
-}
3979
-#endif // ARMv7 polyfill
3980
-
3981
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
3982
-{
3983
-   uint64x2_t a = vreinterpretq_u64_m128i(_a);
3984
-   uint64x2_t b = vreinterpretq_u64_m128i(_b);
3985
-   switch (imm & 0x11) {
3986
-   case 0x00:
3987
-       return vreinterpretq_m128i_u64(
3988
-           _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
3989
-   case 0x01:
3990
-       return vreinterpretq_m128i_u64(
3991
-           _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
3992
-   case 0x10:
3993
-       return vreinterpretq_m128i_u64(
3994
-           _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
3995
-   case 0x11:
3996
-       return vreinterpretq_m128i_u64(_sse2neon_vmull_p64(
3997
-           vget_high_u64(a), vget_high_u64(b)));
3998
-   default:
3999
-       abort();
4000
-   }
4001
-}
4002
-
4003
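Both the crypto-extension path and the ARMv7 polyfill above compute a 64x64 -> 128-bit carry-less (GF(2)) multiplication. A bitwise scalar reference follows, assuming nothing beyond C99; clmul64 is an illustrative name, not an intrinsic.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* XOR a shifted copy of a into the result for every set bit of b:
     * polynomial multiplication over GF(2), i.e. addition without carries. */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
    {
        *lo = 0;
        *hi = 0;
        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                *lo ^= a << i;
                if (i)
                    *hi ^= a >> (64 - i);
            }
        }
    }

    int main(void)
    {
        uint64_t lo, hi;
        clmul64(0x3, 0x5, &lo, &hi);
        printf("%016" PRIx64 " %016" PRIx64 "\n", hi, lo); /* 0 ... 0f */
        return 0;
    }
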
-#if !defined(__ARM_FEATURE_CRYPTO) && defined(__aarch64__)
4004
-// In the absence of crypto extensions, implement aesenc using regular neon
4005
-// intrinsics instead. See:
4006
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
4007
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
4008
-// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
4009
-// for more information Reproduced with permission of the author.
4010
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
4011
-{
4012
-   static const uint8_t crypto_aes_sbox[256] = {
4013
-       0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01,
4014
-       0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d,
4015
-       0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
4016
-       0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
4017
-       0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, 0x04, 0xc7,
4018
-       0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
4019
-       0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e,
4020
-       0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
4021
-       0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
4022
-       0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb,
4023
-       0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c,
4024
-       0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
4025
-       0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c,
4026
-       0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d,
4027
-       0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
4028
-       0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
4029
-       0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3,
4030
-       0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
4031
-       0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a,
4032
-       0xae, 0x08, 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
4033
-       0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
4034
-       0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9,
4035
-       0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9,
4036
-       0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
4037
-       0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99,
4038
-       0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
4039
-   static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
4040
-                        0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
4041
-                        0xc, 0x1, 0x6, 0xb};
4042
-   static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6,
4043
-                      0x7, 0x4, 0x9, 0xa, 0xb, 0x8,
4044
-                      0xd, 0xe, 0xf, 0xc};
4045
-
4046
-   uint8x16_t v;
4047
-   uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
4048
-
4049
-   // shift rows
4050
-   w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
4051
-
4052
-   // sub bytes
4053
-   v = vqtbl4q_u8(vld1q_u8_x4(crypto_aes_sbox), w);
4054
-   v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x40), w - 0x40);
4055
-   v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0x80), w - 0x80);
4056
-   v = vqtbx4q_u8(v, vld1q_u8_x4(crypto_aes_sbox + 0xc0), w - 0xc0);
4057
-
4058
-   // mix columns
4059
-   w = (v << 1) ^ (uint8x16_t)(((int8x16_t)v >> 7) & 0x1b);
4060
-   w ^= (uint8x16_t)vrev32q_u16((uint16x8_t)v);
4061
-   w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
4062
-
4063
-   //  add round key
4064
-   return vreinterpretq_m128i_u8(w) ^ RoundKey;
4065
-}
4066
-#elif defined(__ARM_FEATURE_CRYPTO)
4067
-// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
4068
-// AESMC and then manually applying the real key as an xor operation This
4069
-// unfortunately means an additional xor op; the compiler should be able to
4070
-// optimise this away for repeated calls however See
4071
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
4072
-// for more details.
4073
-inline __m128i _mm_aesenc_si128(__m128i a, __m128i b)
4074
-{
4075
-   return vreinterpretq_m128i_u8(
4076
-       vaesmcq_u8(
4077
-           vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
4078
-       vreinterpretq_u8_m128i(b));
4079
-}
4080
-#endif
4081
-
4082
-/* Streaming Extensions */
4083
-
4084
-// Guarantees that every preceding store is globally visible before any
4085
-// subsequent store.
4086
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
4087
-FORCE_INLINE void _mm_sfence(void)
4088
-{
4089
-   __sync_synchronize();
4090
-}
4091
-
4092
-// Stores the data in a to the address p without polluting the caches.  If the
4093
-// cache line containing address p is already in the cache, the cache will be
4094
-// updated.Address p must be 16 - byte aligned.
4095
-// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
4096
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
4097
-{
4098
-#if __has_builtin(__builtin_nontemporal_store)
4099
-   __builtin_nontemporal_store(a, p);
4100
-#else
4101
-   vst1q_s64((int64_t *)p, vreinterpretq_s64_m128i(a));
4102
-#endif
4103
-}
4104
-
4105
-// Cache line containing p is flushed and invalidated from all caches in the
4106
-// coherency domain. :
4107
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
4108
-FORCE_INLINE void _mm_clflush(void const *p)
4109
-{
4110
-   (void)p;
4111
-   // no corollary for Neon?
4112
-}
4113
-
4114
-// Allocate aligned blocks of memory.
4115
-// https://software.intel.com/en-us/
4116
-//         cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
4117
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
4118
-{
4119
-   void *ptr;
4120
-   if (align == 1)
4121
-       return malloc(size);
4122
-   if (align == 2 || (sizeof(void *) == 8 && align == 4))
4123
-       align = sizeof(void *);
4124
-   if (!posix_memalign(&ptr, align, size))
4125
-       return ptr;
4126
-   return NULL;
4127
-}
4128
-
4129
-FORCE_INLINE void _mm_free(void *addr)
4130
-{
4131
-   free(addr);
4132
-}
4133
-
4134
-// Starting with the initial value in crc, accumulates a CRC32 value for
4135
-// unsigned 8-bit integer v.
4136
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
4137
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
4138
-{
4139
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
4140
-   __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
4141
-                : [c] "+r"(crc)
4142
-                : [v] "r"(v));
4143
-#else
4144
-   crc ^= v;
4145
-   for (int bit = 0; bit < 8; bit++) {
4146
-       if (crc & 1)
4147
-           crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
4148
-       else
4149
-           crc = (crc >> 1);
4150
-   }
4151
-#endif
4152
-   return crc;
4153
-}
4154
-
4155
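The bitwise fallback above uses the reflected Castagnoli polynomial 0x82F63B78, i.e. CRC32-C rather than the zlib CRC-32. A self-contained scalar sketch built from that per-byte update; crc32c_buf and the init/final-XOR convention are illustrative additions, not part of the removed header.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Per-byte bitwise CRC32-C update, same loop as the fallback above. */
    static uint32_t crc32c_byte(uint32_t crc, uint8_t v)
    {
        crc ^= v;
        for (int bit = 0; bit < 8; bit++)
            crc = (crc & 1) ? (crc >> 1) ^ UINT32_C(0x82F63B78) : crc >> 1;
        return crc;
    }

    /* CRC32-C of a buffer with the usual all-ones init and final inversion. */
    static uint32_t crc32c_buf(const void *data, size_t len)
    {
        const uint8_t *p = (const uint8_t *)data;
        uint32_t crc = 0xFFFFFFFFu;
        for (size_t i = 0; i < len; i++)
            crc = crc32c_byte(crc, p[i]);
        return crc ^ 0xFFFFFFFFu;
    }

    int main(void)
    {
        const char *msg = "123456789";
        printf("%08" PRIx32 "\n", crc32c_buf(msg, strlen(msg))); /* e3069283 */
        return 0;
    }
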
-// Starting with the initial value in crc, accumulates a CRC32 value for
4156
-// unsigned 16-bit integer v.
4157
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
4158
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
4159
-{
4160
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
4161
-   __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
4162
-                : [c] "+r"(crc)
4163
-                : [v] "r"(v));
4164
-#else
4165
-   crc = _mm_crc32_u8(crc, v & 0xff);
4166
-   crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
4167
-#endif
4168
-   return crc;
4169
-}
4170
-
4171
-// Starting with the initial value in crc, accumulates a CRC32 value for
4172
-// unsigned 32-bit integer v.
4173
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
4174
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
4175
-{
4176
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
4177
-   __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
4178
-                : [c] "+r"(crc)
4179
-                : [v] "r"(v));
4180
-#else
4181
-   crc = _mm_crc32_u16(crc, v & 0xffff);
4182
-   crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
4183
-#endif
4184
-   return crc;
4185
-}
4186
-
4187
-// Starting with the initial value in crc, accumulates a CRC32 value for
4188
-// unsigned 64-bit integer v.
4189
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
4190
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
4191
-{
4192
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
4193
-   __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
4194
-                : [c] "+r"(crc)
4195
-                : [v] "r"(v));
4196
-#else
4197
-   crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
4198
-   crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
4199
-#endif
4200
-   return crc;
4201
-}
4202
-
4203
-#if defined(__GNUC__) || defined(__clang__)
4204
-#pragma pop_macro("ALIGN_STRUCT")
4205
-#pragma pop_macro("FORCE_INLINE")
4206
-#endif
4207
-
4208
-#endif
4209
obs-studio-26.1.0.tar.xz/.github/workflows/main.yml -> obs-studio-26.1.1.tar.xz/.github/workflows/main.yml Changed
153
 
1
@@ -24,7 +24,7 @@
2
     runs-on: [macos-latest]
3
     env:
4
       MIN_MACOS_VERSION: '10.13'
5
-      MACOS_DEPS_VERSION: '2020-12-11'
6
+      MACOS_DEPS_VERSION: '2020-12-22'
7
       VLC_VERSION: '3.0.8'
8
       SPARKLE_VERSION: '1.23.0'
9
       QT_VERSION: '5.15.2'
10
@@ -54,13 +54,17 @@
11
         shell: bash
12
         run: |
13
           if [ -d /usr/local/opt/openssl@1.0.2t ]; then
14
-              brew uninstall openssl@1.0.2t
15
-              brew untap local/openssl
16
+            brew uninstall openssl@1.0.2t
17
+            brew untap local/openssl
18
           fi
19
 
20
           if [ -d /usr/local/opt/python@2.7.17 ]; then
21
-              brew uninstall python@2.7.17
22
-              brew untap local/python2
23
+            brew uninstall python@2.7.17
24
+            brew untap local/python2
25
+          fi
26
+
27
+          if [ -d /usr/local/opt/speexdsp ]; then
28
+            brew unlink speexdsp
29
           fi
30
           brew bundle --file ./CI/scripts/macos/Brewfile
31
       - name: 'Restore Chromium Embedded Framework from cache'
32
@@ -136,7 +140,8 @@
33
         run: |
34
           mkdir ./build
35
           cd ./build
36
-          cmake -DENABLE_UNIT_TESTS=YES -DENABLE_SPARKLE_UPDATER=ON -DDISABLE_PYTHON=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=${{ env.MIN_MACOS_VERSION }} -DQTDIR="/tmp/obsdeps" -DSWIGDIR="/tmp/obsdeps" -DDepsPath="/tmp/obsdeps" -DVLCPath="${{ github.workspace }}/cmbuild/vlc-${{ env.VLC_VERSION }}" -DENABLE_VLC=ON -DBUILD_BROWSER=ON -DBROWSER_DEPLOY=ON -DWITH_RTMPS=ON -DCEF_ROOT_DIR="${{ github.workspace }}/cmbuild/cef_binary_${{ env.CEF_BUILD_VERSION }}_macosx64" ..
37
+          LEGACY_BROWSER="$(test "${{ env.CEF_BUILD_VERSION }}" -le 3770 && echo "ON" || echo "OFF")"
38
+          cmake -DENABLE_UNIT_TESTS=YES -DENABLE_SPARKLE_UPDATER=ON -DDISABLE_PYTHON=ON -DCMAKE_OSX_DEPLOYMENT_TARGET=${{ env.MIN_MACOS_VERSION }} -DQTDIR="/tmp/obsdeps" -DSWIGDIR="/tmp/obsdeps" -DDepsPath="/tmp/obsdeps" -DVLCPath="${{ github.workspace }}/cmbuild/vlc-${{ env.VLC_VERSION }}" -DENABLE_VLC=ON -DBUILD_BROWSER=ON -DBROWSER_LEGACY=$LEGACY_BROWSER -DWITH_RTMPS=ON -DCEF_ROOT_DIR="${{ github.workspace }}/cmbuild/cef_binary_${{ env.CEF_BUILD_VERSION }}_macosx64" ..
39
       - name: 'Build'
40
         shell: bash
41
         working-directory: ${{ github.workspace }}/build
42
@@ -169,9 +174,16 @@
43
           mkdir -p OBS.app/Contents/MacOS
44
           mkdir OBS.app/Contents/PlugIns
45
           mkdir OBS.app/Contents/Resources
46
+          mkdir OBS.app/Contents/Frameworks
47
 
48
           cp rundir/RelWithDebInfo/bin/obs ./OBS.app/Contents/MacOS
49
           cp rundir/RelWithDebInfo/bin/obs-ffmpeg-mux ./OBS.app/Contents/MacOS
50
+          if ! [ "${{ env.CEF_BUILD_VERSION }}" -le 3770 ]; then
51
+            cp -R "rundir/RelWithDebInfo/bin/OBS Helper.app" "./OBS.app/Contents/Frameworks/OBS Helper.app"
52
+            cp -R "rundir/RelWithDebInfo/bin/OBS Helper (GPU).app" "./OBS.app/Contents/Frameworks/OBS Helper (GPU).app"
53
+            cp -R "rundir/RelWithDebInfo/bin/OBS Helper (Plugin).app" "./OBS.app/Contents/Frameworks/OBS Helper (Plugin).app"
54
+            cp -R "rundir/RelWithDebInfo/bin/OBS Helper (Renderer).app" "./OBS.app/Contents/Frameworks/OBS Helper (Renderer).app"
55
+          fi
56
           cp rundir/RelWithDebInfo/bin/libobsglad.0.dylib ./OBS.app/Contents/MacOS
57
           cp -R rundir/RelWithDebInfo/data ./OBS.app/Contents/Resources
58
           cp ../CI/scripts/macos/app/AppIcon.icns ./OBS.app/Contents/Resources
59
@@ -185,35 +197,45 @@
60
             rm -rf ./OBS.app/Contents/Resources/data/obs-scripting/
61
           fi
62
 
63
+          BUNDLE_PLUGINS=(
64
+            ./OBS.app/Contents/PlugIns/coreaudio-encoder.so
65
+            ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so
66
+            ./OBS.app/Contents/PlugIns/decklink-captions.so
67
+            ./OBS.app/Contents/PlugIns/frontend-tools.so
68
+            ./OBS.app/Contents/PlugIns/image-source.so
69
+            ./OBS.app/Contents/PlugIns/mac-avcapture.so
70
+            ./OBS.app/Contents/PlugIns/mac-capture.so
71
+            ./OBS.app/Contents/PlugIns/mac-decklink.so
72
+            ./OBS.app/Contents/PlugIns/mac-syphon.so
73
+            ./OBS.app/Contents/PlugIns/mac-vth264.so
74
+            ./OBS.app/Contents/PlugIns/mac-virtualcam.so
75
+            ./OBS.app/Contents/PlugIns/obs-browser.so
76
+            ./OBS.app/Contents/PlugIns/obs-ffmpeg.so
77
+            ./OBS.app/Contents/PlugIns/obs-filters.so
78
+            ./OBS.app/Contents/PlugIns/obs-transitions.so
79
+            ./OBS.app/Contents/PlugIns/obs-vst.so
80
+            ./OBS.app/Contents/PlugIns/rtmp-services.so
81
+            ./OBS.app/Contents/MacOS/obs-ffmpeg-mux
82
+            ./OBS.app/Contents/MacOS/obslua.so
83
+            ./OBS.app/Contents/PlugIns/obs-x264.so
84
+            ./OBS.app/Contents/PlugIns/text-freetype2.so
85
+            ./OBS.app/Contents/PlugIns/obs-outputs.so
86
+          )
87
+
88
+          if ! [ "${{ env.CEF_BUILD_VERSION }}" -le 3770 ]; then
89
           ../CI/scripts/macos/app/dylibBundler -cd -of -a ./OBS.app -q -f \
90
             -s ./OBS.app/Contents/MacOS \
91
             -s "${{ github.workspace }}/cmbuild/sparkle/Sparkle.framework" \
92
             -s ./rundir/RelWithDebInfo/bin \
93
-            -x ./OBS.app/Contents/PlugIns/coreaudio-encoder.so \
94
-            -x ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so \
95
-            -x ./OBS.app/Contents/PlugIns/decklink-captions.so \
96
-            -x ./OBS.app/Contents/PlugIns/frontend-tools.so \
97
-            -x ./OBS.app/Contents/PlugIns/image-source.so \
98
-            -x ./OBS.app/Contents/PlugIns/linux-jack.so \
99
-            -x ./OBS.app/Contents/PlugIns/mac-avcapture.so \
100
-            -x ./OBS.app/Contents/PlugIns/mac-capture.so \
101
-            -x ./OBS.app/Contents/PlugIns/mac-decklink.so \
102
-            -x ./OBS.app/Contents/PlugIns/mac-syphon.so \
103
-            -x ./OBS.app/Contents/PlugIns/mac-vth264.so \
104
-            -x ./OBS.app/Contents/PlugIns/mac-virtualcam.so \
105
-            -x ./OBS.app/Contents/PlugIns/obs-browser.so \
106
-            -x ./OBS.app/Contents/PlugIns/obs-browser-page \
107
-            -x ./OBS.app/Contents/PlugIns/obs-ffmpeg.so \
108
-            -x ./OBS.app/Contents/PlugIns/obs-filters.so \
109
-            -x ./OBS.app/Contents/PlugIns/obs-transitions.so \
110
-            -x ./OBS.app/Contents/PlugIns/obs-vst.so \
111
-            -x ./OBS.app/Contents/PlugIns/rtmp-services.so \
112
-            -x ./OBS.app/Contents/MacOS/obs-ffmpeg-mux \
113
-            -x ./OBS.app/Contents/MacOS/obslua.so \
114
-            -x ./OBS.app/Contents/PlugIns/obs-x264.so \
115
-            -x ./OBS.app/Contents/PlugIns/text-freetype2.so \
116
-            -x ./OBS.app/Contents/PlugIns/obs-libfdk.so \
117
-            -x ./OBS.app/Contents/PlugIns/obs-outputs.so
118
+            $(echo "${BUNDLE_PLUGINS[@]/#/-x }")
119
+          else
120
+          ../CI/scripts/macos/app/dylibBundler -cd -of -a ./OBS.app -q -f \
121
+            -s ./OBS.app/Contents/MacOS \
122
+            -s "${{ github.workspace }}/cmbuild/sparkle/Sparkle.framework" \
123
+            -s ./rundir/RelWithDebInfo/bin \
124
+            $(echo "${BUNDLE_PLUGINS[@]/#/-x }") \
125
+            -x ./OBS.app/Contents/PlugIns/obs-browser-page
126
+          fi
127
 
128
           mv ./libobs-opengl/libobs-opengl.so ./OBS.app/Contents/Frameworks
129
 
130
@@ -250,12 +272,22 @@
131
           codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libswiftshader_libEGL.dylib"
132
           codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libGLESv2.dylib"
133
           codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libswiftshader_libGLESv2.dylib"
134
+          if ! [ "${{ env.CEF_BUILD_VERSION }}" -le 3770 ]; then
135
+            codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libvk_swiftshader.dylib"
136
+          fi
137
           codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" --deep "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework"
138
 
139
           codesign --force --options runtime --deep --sign "${SIGN_IDENTITY:--}" "./OBS.app/Contents/Resources/data/obs-mac-virtualcam.plugin"
140
 
141
           codesign --force --options runtime --entitlements "../CI/scripts/macos/app/entitlements.plist" --sign "${SIGN_IDENTITY:--}" --deep ./OBS.app
142
 
143
+          if ! [ "${{ env.CEF_BUILD_VERSION }}" -le 3770 ]; then
144
+            codesign --force --options runtime --sign "${SIGN_IDENTITY:--}" --deep "./OBS.app/Contents/Frameworks/OBS Helper.app"
145
+            codesign --force --options runtime --entitlements "../CI/scripts/macos/helpers/helper-gpu-entitlements.plist" --sign "${SIGN_IDENTITY:--}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (GPU).app"
146
+            codesign --force --options runtime --entitlements "../CI/scripts/macos/helpers/helper-plugin-entitlements.plist" --sign "${SIGN_IDENTITY:--}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (Plugin).app"
147
+            codesign --force --options runtime --entitlements "../CI/scripts/macos/helpers/helper-renderer-entitlements.plist" --sign "${SIGN_IDENTITY:--}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (Renderer).app"
148
+          fi
149
+
150
           codesign -dvv ./OBS.app
151
       - name: 'Package'
152
         if: success() && (github.event_name != 'pull_request' || env.SEEKING_TESTERS == '1')
153
obs-studio-26.1.0.tar.xz/CI/full-build-macos.sh -> obs-studio-26.1.1.tar.xz/CI/full-build-macos.sh Changed
162
 
1
@@ -47,6 +47,7 @@
2
 CI_SPARKLE_VERSION=$(cat ${CI_WORKFLOW} | sed -En "s/[ ]+SPARKLE_VERSION: '([0-9\.]+)'/\1/p")
3
 CI_QT_VERSION=$(cat ${CI_WORKFLOW} | sed -En "s/[ ]+QT_VERSION: '([0-9\.]+)'/\1/p" | head -1)
4
 CI_MIN_MACOS_VERSION=$(cat ${CI_WORKFLOW} | sed -En "s/[ ]+MIN_MACOS_VERSION: '([0-9\.]+)'/\1/p")
5
+NPROC="${NPROC:-$(sysctl -n hw.ncpu)}"
6
 
7
 BUILD_DEPS=(
8
     "obs-deps ${MACOS_DEPS_VERSION:-${CI_DEPS_VERSION}}"
9
@@ -226,7 +227,7 @@
10
         -DCMAKE_OSX_DEPLOYMENT_TARGET=${MIN_MACOS_VERSION:-${CI_MIN_MACOS_VERSION}} \
11
         ..
12
     step "Build..."
13
-    make -j4
14
+    make -j${NPROC}
15
     if [ ! -d libcef_dll ]; then mkdir libcef_dll; fi
16
 }
17
 
18
@@ -277,7 +278,7 @@
19
         -DDepsPath="/tmp/obsdeps" \
20
         -DVLCPath="${DEPS_BUILD_DIR}/vlc-${VLC_VERSION:-${CI_VLC_VERSION}}" \
21
         -DBUILD_BROWSER=ON \
22
-        -DBROWSER_DEPLOY=ON \
23
+        -DBROWSER_LEGACY="$(test "${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}" -le 3770 && echo "ON" || echo "OFF")" \
24
         -DWITH_RTMPS=ON \
25
         -DCEF_ROOT_DIR="${DEPS_BUILD_DIR}/cef_binary_${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}_macosx64" \
26
         -DCMAKE_BUILD_TYPE="${BUILD_CONFIG}" \
27
@@ -288,7 +289,7 @@
28
 run_obs_build() {
29
     ensure_dir "${CHECKOUT_DIR}/${BUILD_DIR}"
30
     hr "Build OBS..."
31
-    make -j4
32
+    make -j${NPROC}
33
 }
34
 
35
 ## OBS BUNDLE AS MACOS APPLICATION ##
36
@@ -303,37 +304,47 @@
37
     hr "Bundle dylibs for macOS application"
38
 
39
     step "Run dylibBundler.."
40
-    ${CI_SCRIPTS}/app/dylibbundler -cd -of -a ./OBS.app -q -f \
41
-        -s ./OBS.app/Contents/MacOS \
42
-        -s "${DEPS_BUILD_DIR}/sparkle/Sparkle.framework" \
43
-        -s ./rundir/${BUILD_CONFIG}/bin/ \
44
-        -x ./OBS.app/Contents/PlugIns/coreaudio-encoder.so \
45
-        -x ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so \
46
-        -x ./OBS.app/Contents/PlugIns/decklink-captions.so \
47
-        -x ./OBS.app/Contents/PlugIns/frontend-tools.so \
48
-        -x ./OBS.app/Contents/PlugIns/image-source.so \
49
-        -x ./OBS.app/Contents/PlugIns/linux-jack.so \
50
-        -x ./OBS.app/Contents/PlugIns/mac-avcapture.so \
51
-        -x ./OBS.app/Contents/PlugIns/mac-capture.so \
52
-        -x ./OBS.app/Contents/PlugIns/mac-decklink.so \
53
-        -x ./OBS.app/Contents/PlugIns/mac-syphon.so \
54
-        -x ./OBS.app/Contents/PlugIns/mac-vth264.so \
55
-        -x ./OBS.app/Contents/PlugIns/mac-virtualcam.so \
56
-        -x ./OBS.app/Contents/PlugIns/obs-browser.so \
57
-        -x ./OBS.app/Contents/PlugIns/obs-browser-page \
58
-        -x ./OBS.app/Contents/PlugIns/obs-ffmpeg.so \
59
-        -x ./OBS.app/Contents/PlugIns/obs-filters.so \
60
-        -x ./OBS.app/Contents/PlugIns/obs-transitions.so \
61
-        -x ./OBS.app/Contents/PlugIns/obs-vst.so \
62
-        -x ./OBS.app/Contents/PlugIns/rtmp-services.so \
63
-        -x ./OBS.app/Contents/MacOS/obs-ffmpeg-mux \
64
-        -x ./OBS.app/Contents/MacOS/obslua.so \
65
-        -x ./OBS.app/Contents/PlugIns/obs-x264.so \
66
-        -x ./OBS.app/Contents/PlugIns/text-freetype2.so \
67
-        -x ./OBS.app/Contents/PlugIns/obs-libfdk.so \
68
-        -x ./OBS.app/Contents/PlugIns/obs-outputs.so
69
-    step "Move libobs-opengl to final destination"
70
 
71
+    BUNDLE_PLUGINS=(
72
+        ./OBS.app/Contents/PlugIns/coreaudio-encoder.so
73
+        ./OBS.app/Contents/PlugIns/decklink-ouput-ui.so
74
+        ./OBS.app/Contents/PlugIns/decklink-captions.so
75
+        ./OBS.app/Contents/PlugIns/frontend-tools.so
76
+        ./OBS.app/Contents/PlugIns/image-source.so
77
+        ./OBS.app/Contents/PlugIns/mac-avcapture.so
78
+        ./OBS.app/Contents/PlugIns/mac-capture.so
79
+        ./OBS.app/Contents/PlugIns/mac-decklink.so
80
+        ./OBS.app/Contents/PlugIns/mac-syphon.so
81
+        ./OBS.app/Contents/PlugIns/mac-vth264.so
82
+        ./OBS.app/Contents/PlugIns/mac-virtualcam.so
83
+        ./OBS.app/Contents/PlugIns/obs-browser.so
84
+        ./OBS.app/Contents/PlugIns/obs-ffmpeg.so
85
+        ./OBS.app/Contents/PlugIns/obs-filters.so
86
+        ./OBS.app/Contents/PlugIns/obs-transitions.so
87
+        ./OBS.app/Contents/PlugIns/obs-vst.so
88
+        ./OBS.app/Contents/PlugIns/rtmp-services.so
89
+        ./OBS.app/Contents/MacOS/obs-ffmpeg-mux
90
+        ./OBS.app/Contents/MacOS/obslua.so
91
+        ./OBS.app/Contents/PlugIns/obs-x264.so
92
+        ./OBS.app/Contents/PlugIns/text-freetype2.so
93
+        ./OBS.app/Contents/PlugIns/obs-outputs.so
94
+        )
95
+    if ! [ "${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}" -le 3770 ]; then
96
+        ${CI_SCRIPTS}/app/dylibbundler -cd -of -a ./OBS.app -q -f \
97
+            -s ./OBS.app/Contents/MacOS \
98
+            -s "${DEPS_BUILD_DIR}/sparkle/Sparkle.framework" \
99
+            -s ./rundir/${BUILD_CONFIG}/bin/ \
100
+            $(echo "${BUNDLE_PLUGINS[@]/#/-x }")
101
+    else
102
+        ${CI_SCRIPTS}/app/dylibbundler -cd -of -a ./OBS.app -q -f \
103
+            -s ./OBS.app/Contents/MacOS \
104
+            -s "${DEPS_BUILD_DIR}/sparkle/Sparkle.framework" \
105
+            -s ./rundir/${BUILD_CONFIG}/bin/ \
106
+            $(echo "${BUNDLE_PLUGINS[@]/#/-x }") \
107
+            -x ./OBS.app/Contents/PlugIns/obs-browser-page
108
+    fi
109
+
110
+    step "Move libobs-opengl to final destination"
111
     if [ -f "./libobs-opengl/libobs-opengl.so" ]; then
112
         cp ./libobs-opengl/libobs-opengl.so ./OBS.app/Contents/Frameworks
113
     else
114
@@ -379,10 +390,17 @@
115
     mkdir -p OBS.app/Contents/MacOS
116
     mkdir OBS.app/Contents/PlugIns
117
     mkdir OBS.app/Contents/Resources
118
+    mkdir OBS.app/Contents/Frameworks
119
 
120
     cp rundir/${BUILD_CONFIG}/bin/obs ./OBS.app/Contents/MacOS
121
     cp rundir/${BUILD_CONFIG}/bin/obs-ffmpeg-mux ./OBS.app/Contents/MacOS
122
     cp rundir/${BUILD_CONFIG}/bin/libobsglad.0.dylib ./OBS.app/Contents/MacOS
123
+    if ! [ "${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}" -le 3770 ]; then
124
+        cp -R "rundir/${BUILD_CONFIG}/bin/OBS Helper.app" "./OBS.app/Contents/Frameworks/OBS Helper.app"
125
+        cp -R "rundir/${BUILD_CONFIG}/bin/OBS Helper (GPU).app" "./OBS.app/Contents/Frameworks/OBS Helper (GPU).app"
126
+        cp -R "rundir/${BUILD_CONFIG}/bin/OBS Helper (Plugin).app" "./OBS.app/Contents/Frameworks/OBS Helper (Plugin).app"
127
+        cp -R "rundir/${BUILD_CONFIG}/bin/OBS Helper (Renderer).app" "./OBS.app/Contents/Frameworks/OBS Helper (Renderer).app"
128
+    fi
129
     cp -R rundir/${BUILD_CONFIG}/data ./OBS.app/Contents/Resources
130
     cp ${CI_SCRIPTS}/app/AppIcon.icns ./OBS.app/Contents/Resources
131
     cp -R rundir/${BUILD_CONFIG}/obs-plugins/ ./OBS.app/Contents/PlugIns
132
@@ -506,7 +524,10 @@
133
     codesign --force --options runtime --sign "${CODESIGN_IDENT}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libswiftshader_libEGL.dylib"
134
     codesign --force --options runtime --sign "${CODESIGN_IDENT}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libGLESv2.dylib"
135
     codesign --force --options runtime --sign "${CODESIGN_IDENT}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libswiftshader_libGLESv2.dylib"
136
-    codesign --force --options runtime --sign "${CODESIGN_IDENT}" --deep "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework"
137
+    if ! [ "${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}" -le 3770 ]; then
138
+        codesign --force --options runtime --sign "${CODESIGN_IDENT}" "./OBS.app/Contents/Frameworks/Chromium Embedded Framework.framework/Libraries/libvk_swiftshader.dylib"
139
+    fi
140
+
141
     echo -n "${COLOR_RESET}"
142
 
143
     step "Code-sign DAL Plugin..."
144
@@ -518,6 +539,17 @@
145
     echo -n "${COLOR_ORANGE}"
146
     codesign --force --options runtime --entitlements "${CI_SCRIPTS}/app/entitlements.plist" --sign "${CODESIGN_IDENT}" --deep ./OBS.app
147
     echo -n "${COLOR_RESET}"
148
+
149
+    if ! [ "${CEF_BUILD_VERSION:-${CI_CEF_VERSION}}" -le 3770 ]; then
150
+        step "Code-sign CEF helper apps..."
151
+        echo -n "${COLOR_ORANGE}"
152
+        codesign --force --options runtime --sign "${CODESIGN_IDENT}" --deep "./OBS.app/Contents/Frameworks/OBS Helper.app"
153
+        codesign --force --options runtime --entitlements "${CI_SCRIPTS}/helpers/helper-gpu-entitlements.plist" --sign "${CODESIGN_IDENT}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (GPU).app"
154
+        codesign --force --options runtime --entitlements "${CI_SCRIPTS}/helpers/helper-plugin-entitlements.plist" --sign "${CODESIGN_IDENT}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (Plugin).app"
155
+        codesign --force --options runtime --entitlements "${CI_SCRIPTS}/helpers/helper-renderer-entitlements.plist" --sign "${CODESIGN_IDENT}" --deep "./OBS.app/Contents/Frameworks/OBS Helper (Renderer).app"
156
+        echo -n "${COLOR_RESET}"
157
+    fi
158
+
159
     step "Check code-sign result..."
160
     codesign -dvv ./OBS.app
161
 }
162
obs-studio-26.1.0.tar.xz/CI/scripts/macos/Brewfile -> obs-studio-26.1.1.tar.xz/CI/scripts/macos/Brewfile Changed
11
 
1
@@ -1,8 +1,5 @@
2
 tap "akeru-inc/tap"
3
-brew "jack"
4
-brew "speexdsp"
5
 brew "cmake"
6
 brew "freetype"
7
-brew "fdk-aac"
8
 brew "cmocka"
9
 brew "akeru-inc/tap/xcnotary"
10
\ No newline at end of file
11
obs-studio-26.1.1.tar.xz/CI/scripts/macos/helpers Added
2
 
1
+(directory)
2
obs-studio-26.1.1.tar.xz/CI/scripts/macos/helpers/helper-gpu-entitlements.plist Added
11
 
1
@@ -0,0 +1,8 @@
2
+<?xml version="1.0" encoding="UTF-8"?>
3
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4
+<plist version="1.0">
5
+<dict>
6
+    <key>com.apple.security.cs.allow-jit</key>
7
+    <true/>
8
+</dict>
9
+</plist>
10
\ No newline at end of file
11
obs-studio-26.1.1.tar.xz/CI/scripts/macos/helpers/helper-plugin-entitlements.plist Added
13
 
1
@@ -0,0 +1,10 @@
2
+<?xml version="1.0" encoding="UTF-8"?>
3
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4
+<plist version="1.0">
5
+<dict>
6
+    <key>com.apple.security.cs.allow-unsigned-executable-memory</key>
7
+    <true/>
8
+    <key>com.apple.security.cs.disable-library-validation</key>
9
+    <true/>
10
+</dict>
11
+</plist>
12
\ No newline at end of file
13
obs-studio-26.1.1.tar.xz/CI/scripts/macos/helpers/helper-renderer-entitlements.plist Added
11
 
1
@@ -0,0 +1,8 @@
2
+<?xml version="1.0" encoding="UTF-8"?>
3
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4
+<plist version="1.0">
5
+<dict>
6
+    <key>com.apple.security.cs.allow-jit</key>
7
+    <true/>
8
+</dict>
9
+</plist>
10
\ No newline at end of file
11
obs-studio-26.1.0.tar.xz/CMakeLists.txt -> obs-studio-26.1.1.tar.xz/CMakeLists.txt Changed
20
 
1
@@ -123,17 +123,14 @@
2
 endif ()
3
 
4
 if(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "(i[3-6]86|x86|x64|x86_64|amd64|e2k)")
5
-   set(NEEDS_SIMDE "0")
6
    if(NOT MSVC)
7
        set(ARCH_SIMD_FLAGS "-mmmx" "-msse" "-msse2")
8
    endif()
9
 elseif(LOWERCASE_CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64(le)?")
10
-   set(NEEDS_SIMDE "0")
11
+   set(ARCH_SIMD_DEFINES "-DNO_WARN_X86_INTRINSICS")
12
    set(ARCH_SIMD_FLAGS "-mvsx")
13
    add_compile_definitions(NO_WARN_X86_INTRINSICS)
14
 else()
15
-   set(NEEDS_SIMDE "1")
16
-   add_definitions(-DNEEDS_SIMDE=1)
17
    if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
18
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSIMDE_ENABLE_OPENMP")
19
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DSIMDE_ENABLE_OPENMP")
20
obs-studio-26.1.0.tar.xz/UI/data/locale.ini -> obs-studio-26.1.1.tar.xz/UI/data/locale.ini Changed
7
 
1
@@ -179,4 +179,4 @@
2
 Name=Esperanto
3
 
4
 [kab-KAB]
5
-Name=Taglizit
6
+Name=Taqbaylit
7
obs-studio-26.1.0.tar.xz/UI/data/themes/Acri.qss -> obs-studio-26.1.1.tar.xz/UI/data/themes/Acri.qss Changed
12
 
1
@@ -317,8 +317,8 @@
2
     max-height: 40px;
3
 }
4
 
5
-#contextContainer QPushButton[themeID2=contextBarButton] {
6
-    padding: 0px;
7
+#contextContainer QPushButton {
8
+    padding: 0px 12px;
9
 }
10
 
11
 QPushButton#sourcePropertiesButton {
12
obs-studio-26.1.0.tar.xz/UI/installer/mp-installer.nsi -> obs-studio-26.1.1.tar.xz/UI/installer/mp-installer.nsi Changed
9
 
1
@@ -124,6 +124,7 @@
2
    ClearErrors
3
    GetDLLVersion "vcruntime140.DLL" $R0 $R1
4
    GetDLLVersion "msvcp140.DLL" $R0 $R1
5
+   GetDLLVersion "msvcp140_1.DLL" $R0 $R1
6
    IfErrors vs2019Missing_32 vs2019OK_32
7
    vs2019Missing_32:
8
        MessageBox MB_YESNO|MB_ICONEXCLAMATION "Your system is missing runtime components that ${APPNAME} requires. Would you like to download them?" IDYES vs2019true_32 IDNO vs2019false_32
9
obs-studio-26.1.0.tar.xz/UI/win-update/updater/updater.cpp -> obs-studio-26.1.1.tar.xz/UI/win-update/updater/updater.cpp Changed
119
 
1
@@ -298,6 +298,8 @@
2
            } else {
3
                DeleteFile(outputPath.c_str());
4
            }
5
+           if (state == STATE_INSTALL_FAILED)
6
+               DeleteFile(tempPath.c_str());
7
        } else if (state == STATE_DOWNLOADED) {
8
            DeleteFile(tempPath.c_str());
9
        }
10
@@ -337,7 +339,10 @@
11
 
12
 bool DownloadWorkerThread()
13
 {
14
-   const DWORD tlsProtocols = WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2;
15
+   const DWORD tlsProtocols = WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_2 |
16
+                  WINHTTP_FLAG_SECURE_PROTOCOL_TLS1_3;
17
+
18
+   const DWORD enableHTTP2Flag = WINHTTP_PROTOCOL_FLAG_HTTP2;
19
 
20
    HttpHandle hSession = WinHttpOpen(L"OBS Studio Updater/2.1",
21
                      WINHTTP_ACCESS_TYPE_DEFAULT_PROXY,
22
@@ -352,6 +357,9 @@
23
    WinHttpSetOption(hSession, WINHTTP_OPTION_SECURE_PROTOCOLS,
24
             (LPVOID)&tlsProtocols, sizeof(tlsProtocols));
25
 
26
+   WinHttpSetOption(hSession, WINHTTP_OPTION_ENABLE_HTTP_PROTOCOL,
27
+            (LPVOID)&enableHTTP2Flag, sizeof(enableHTTP2Flag));
28
+
29
    HttpHandle hConnect = WinHttpConnect(hSession,
30
                         L"cdn-fastly.obsproject.com",
31
                         INTERNET_DEFAULT_HTTPS_PORT, 0);
32
@@ -784,6 +792,41 @@
33
    }
34
 }
35
 
36
+static bool MoveInUseFileAway(update_t &file)
37
+{
38
+   _TCHAR deleteMeName[MAX_PATH];
39
+   _TCHAR randomStr[MAX_PATH];
40
+
41
+   BYTE junk[40];
42
+   BYTE hash[BLAKE2_HASH_LENGTH];
43
+
44
+   CryptGenRandom(hProvider, sizeof(junk), junk);
45
+   blake2b(hash, sizeof(hash), junk, sizeof(junk), NULL, 0);
46
+   HashToString(hash, randomStr);
47
+   randomStr[8] = 0;
48
+
49
+   StringCbCopy(deleteMeName, sizeof(deleteMeName),
50
+            file.outputPath.c_str());
51
+
52
+   StringCbCat(deleteMeName, sizeof(deleteMeName), L".");
53
+   StringCbCat(deleteMeName, sizeof(deleteMeName), randomStr);
54
+   StringCbCat(deleteMeName, sizeof(deleteMeName), L".deleteme");
55
+
56
+   if (MoveFile(file.outputPath.c_str(), deleteMeName)) {
57
+
58
+       if (MyCopyFile(deleteMeName, file.outputPath.c_str())) {
59
+           MoveFileEx(deleteMeName, NULL,
60
+                  MOVEFILE_DELAY_UNTIL_REBOOT);
61
+
62
+           return true;
63
+       } else {
64
+           MoveFile(deleteMeName, file.outputPath.c_str());
65
+       }
66
+   }
67
+
68
+   return false;
69
+}
70
+
71
 static bool UpdateFile(update_t &file)
72
 {
73
    wchar_t oldFileRenamedPath[MAX_PATH];
74
@@ -836,6 +879,9 @@
75
 
76
        int error_code;
77
        bool installed_ok;
78
+       bool already_tried_to_move = false;
79
+
80
+   retryAfterMovingFile:
81
 
82
        if (file.patchable) {
83
            error_code = ApplyPatch(file.tempPath.c_str(),
84
@@ -875,15 +921,23 @@
85
            int is_sharing_violation =
86
                (error_code == ERROR_SHARING_VIOLATION);
87
 
88
-           if (is_sharing_violation)
89
+           if (is_sharing_violation) {
90
+               if (!already_tried_to_move) {
91
+                   already_tried_to_move = true;
92
+
93
+                   if (MoveInUseFileAway(file))
94
+                       goto retryAfterMovingFile;
95
+               }
96
+
97
                Status(L"Update failed: %s is still in use.  "
98
                       L"Close all "
99
                       L"programs and try again.",
100
                       curFileName);
101
-           else
102
+           } else {
103
                Status(L"Update failed: Couldn't update %s "
104
                       L"(error %d)",
105
                       curFileName, GetLastError());
106
+           }
107
 
108
            file.state = STATE_INSTALL_FAILED;
109
            return false;
110
@@ -1390,7 +1444,7 @@
111
    /* ------------------------------------- *
112
     * Download Updates                      */
113
 
114
-   if (!RunDownloadWorkers(2))
115
+   if (!RunDownloadWorkers(4))
116
        return false;
117
 
118
    if ((size_t)completedUpdates != updates.size()) {
119
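Editor's note on the updater hunks above: they combine two techniques — opting the WinHTTP session into TLS 1.3 and HTTP/2 via WinHttpSetOption, and recovering from ERROR_SHARING_VIOLATION by renaming the locked file aside, copying it back, and scheduling the renamed original for deletion at the next reboot. The following is a minimal, hypothetical C sketch of that rename-aside pattern using the same Win32 calls; the helper name and the plain ".old.deleteme" suffix are illustrative only (the real updater derives a random suffix from a BLAKE2 hash and has fuller error handling).

    #include <windows.h>
    #include <wchar.h>

    /* Hypothetical sketch: move a locked file out of the way so a new
     * copy can be written to its original path. Returns nonzero on
     * success; error handling is reduced to the bare minimum. */
    static int move_in_use_file_away(const wchar_t *path)
    {
        wchar_t aside[MAX_PATH];
        swprintf(aside, MAX_PATH, L"%ls.old.deleteme", path);

        /* Renaming usually succeeds even while the file is open/mapped. */
        if (!MoveFileW(path, aside))
            return 0;

        /* Put a copy back so running code keeps working, then ask the
         * OS to remove the renamed original on the next reboot. */
        if (CopyFileW(aside, path, FALSE)) {
            MoveFileExW(aside, NULL, MOVEFILE_DELAY_UNTIL_REBOOT);
            return 1;
        }

        /* Copy failed: restore the original name and report failure. */
        MoveFileW(aside, path);
        return 0;
    }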
obs-studio-26.1.0.tar.xz/UI/window-basic-auto-config.cpp -> obs-studio-26.1.1.tar.xz/UI/window-basic-auto-config.cpp Changed
41
 
1
@@ -373,8 +373,6 @@
2
    if (!wiz->customServer) {
3
        if (wiz->serviceName == "Twitch")
4
            wiz->service = AutoConfig::Service::Twitch;
5
-       else if (wiz->serviceName == "Smashcast")
6
-           wiz->service = AutoConfig::Service::Smashcast;
7
        else
8
            wiz->service = AutoConfig::Service::Other;
9
    } else {
10
@@ -504,7 +502,7 @@
11
        return;
12
 
13
    std::string service = QT_TO_UTF8(ui->service->currentText());
14
-   bool regionBased = service == "Twitch" || service == "Smashcast";
15
+   bool regionBased = service == "Twitch";
16
    bool testBandwidth = ui->doBandwidthTest->isChecked();
17
    bool custom = IsCustomService();
18
 
19
@@ -928,21 +926,6 @@
20
        } else if (regionOther) {
21
            return true;
22
        }
23
-   } else if (service == Service::Smashcast) {
24
-       if (strcmp(server, "Default") == 0) {
25
-           return true;
26
-       } else if (astrcmp_n(server, "US-West:", 8) == 0 ||
27
-              astrcmp_n(server, "US-East:", 8) == 0) {
28
-           return regionUS;
29
-       } else if (astrcmp_n(server, "EU-", 3) == 0) {
30
-           return regionEU;
31
-       } else if (astrcmp_n(server, "South Korea:", 12) == 0 ||
32
-              astrcmp_n(server, "Asia:", 5) == 0 ||
33
-              astrcmp_n(server, "China:", 6) == 0) {
34
-           return regionAsia;
35
-       } else if (regionOther) {
36
-           return true;
37
-       }
38
    } else {
39
        return true;
40
    }
41
obs-studio-26.1.0.tar.xz/UI/window-basic-auto-config.hpp -> obs-studio-26.1.1.tar.xz/UI/window-basic-auto-config.hpp Changed
9
 
1
@@ -38,7 +38,6 @@
2
 
3
    enum class Service {
4
        Twitch,
5
-       Smashcast,
6
        Other,
7
    };
8
 
9
obs-studio-26.1.0.tar.xz/UI/window-basic-main.cpp -> obs-studio-26.1.1.tar.xz/UI/window-basic-main.cpp Changed
16
 
1
@@ -1218,6 +1218,14 @@
2
    uint32_t cx = primaryScreen->size().width();
3
    uint32_t cy = primaryScreen->size().height();
4
 
5
+#ifdef SUPPORTS_FRACTIONAL_SCALING
6
+   cx *= devicePixelRatioF();
7
+   cy *= devicePixelRatioF();
8
+#elif
9
+   cx *= devicePixelRatio();
10
+   cy *= devicePixelRatio();
11
+#endif
12
+
13
    bool oldResolutionDefaults = config_get_bool(
14
        App()->GlobalConfig(), "General", "Pre19Defaults");
15
 
16
obs-studio-26.1.0.tar.xz/cmake/Modules/FindLibcurl.cmake -> obs-studio-26.1.1.tar.xz/cmake/Modules/FindLibcurl.cmake Changed
70
 
1
@@ -36,29 +36,45 @@
2
    PATH_SUFFIXES
3
        include)
4
 
5
-find_library(CURL_LIB
6
-   NAMES ${_CURL_LIBRARIES} curl libcurl
7
-   HINTS
8
-       ENV curlPath${_lib_suffix}
9
-       ENV curlPath
10
-       ENV DepsPath${_lib_suffix}
11
-       ENV DepsPath
12
-       ${curlPath${_lib_suffix}}
13
-       ${curlPath}
14
-       ${DepsPath${_lib_suffix}}
15
-       ${DepsPath}
16
-       ${_CURL_LIBRARY_DIRS}
17
-   PATHS
18
-       /usr/lib /usr/local/lib /opt/local/lib /sw/lib
19
-   PATH_SUFFIXES
20
-       lib${_lib_suffix} lib
21
-       libs${_lib_suffix} libs
22
-       bin${_lib_suffix} bin
23
-       ../lib${_lib_suffix} ../lib
24
-       ../libs${_lib_suffix} ../libs
25
-       ../bin${_lib_suffix} ../bin
26
-       "build/Win${_lib_suffix}/VC12/DLL Release - DLL Windows SSPI"
27
-       "../build/Win${_lib_suffix}/VC12/DLL Release - DLL Windows SSPI")
28
+if(APPLE)
29
+   find_library(CURL_LIB
30
+       NAMES ${_CURL_LIBRARIES} curl libcurl
31
+       HINTS
32
+           ENV curlPath${_lib_suffix}
33
+           ENV curlPath
34
+           ENV DepsPath${_lib_suffix}
35
+           ENV DepsPath
36
+           ${curlPath${_lib_suffix}}
37
+           ${curlPath}
38
+           ${DepsPath${_lib_suffix}}
39
+           ${DepsPath}
40
+           ${_CURL_LIBRARY_DIRS}
41
+       )
42
+else()
43
+   find_library(CURL_LIB
44
+       NAMES ${_CURL_LIBRARIES} curl libcurl
45
+       HINTS
46
+           ENV curlPath${_lib_suffix}
47
+           ENV curlPath
48
+           ENV DepsPath${_lib_suffix}
49
+           ENV DepsPath
50
+           ${curlPath${_lib_suffix}}
51
+           ${curlPath}
52
+           ${DepsPath${_lib_suffix}}
53
+           ${DepsPath}
54
+           ${_CURL_LIBRARY_DIRS}
55
+       PATHS
56
+           /usr/lib /usr/local/lib /opt/local/lib /sw/lib
57
+       PATH_SUFFIXES
58
+           lib${_lib_suffix} lib
59
+           libs${_lib_suffix} libs
60
+           bin${_lib_suffix} bin
61
+           ../lib${_lib_suffix} ../lib
62
+           ../libs${_lib_suffix} ../libs
63
+           ../bin${_lib_suffix} ../bin
64
+           "build/Win${_lib_suffix}/VC12/DLL Release - DLL Windows SSPI"
65
+           "../build/Win${_lib_suffix}/VC12/DLL Release - DLL Windows SSPI")
66
+endif()
67
 
68
 include(FindPackageHandleStandardArgs)
69
 find_package_handle_standard_args(Libcurl DEFAULT_MSG CURL_LIB CURL_INCLUDE_DIR)
70
obs-studio-26.1.0.tar.xz/docs/sphinx/reference-frontend-api.rst -> obs-studio-26.1.1.tar.xz/docs/sphinx/reference-frontend-api.rst Changed
17
 
1
@@ -454,6 +454,15 @@
2
 
3
 ---------------------------------------
4
 
5
+.. function:: void obs_frontend_open_projector(const char *type, int monitor, const char *geometry, const char *name)
6
+
7
+   :param type:     "Preview", "Source", "Scene", "StudioProgram", or "Multiview" (case insensitive).
8
+   :param monitor:  Monitor to open the projector on. If -1, opens a window.
9
+   :param geometry: If *monitor* is -1, size and position of the projector window. Encoded in Base64 using Qt's geometry encoding.
10
+   :param name:     If *type* is "Source" or "Scene", name of the source or scene to be displayed.
11
+
12
+---------------------------------------
13
+
14
 .. function:: void obs_frontend_save(void)
15
 
16
    Saves the current scene collection.
17
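Editor's note: the newly documented obs_frontend_open_projector() belongs to the frontend API, so it is callable from UI plugins and scripts. A minimal C sketch of such a call follows; passing -1 for the monitor opens a windowed projector, and the NULL geometry/name arguments are an assumption on my part — the excerpt above only describes those parameters for the windowed and "Source"/"Scene" cases and does not state that NULL is accepted.

    #include <obs-frontend-api.h>

    /* Open the preview as a floating projector window.
     * monitor = -1 requests a window instead of a fullscreen monitor;
     * geometry and name are left NULL here (assumed to mean "defaults"). */
    static void open_preview_projector(void)
    {
        obs_frontend_open_projector("Preview", -1, NULL, NULL);
    }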
obs-studio-26.1.0.tar.xz/libobs/CMakeLists.txt -> obs-studio-26.1.1.tar.xz/libobs/CMakeLists.txt Changed
64
 
1
@@ -188,20 +188,8 @@
2
        util/pipe-posix.c
3
        util/platform-nix.c)
4
 
5
-   if(NEEDS_SIMDE)
6
-       set(libobs_PLATFORM_HEADERS
7
-           util/simde/check.h
8
-           util/simde/hedley.h
9
-           util/simde/mmx.h
10
-           util/simde/simde-arch.h
11
-           util/simde/simde-common.h
12
-           util/simde/sse.h
13
-           util/simde/sse2.h
14
-           util/threading-posix.h)
15
-   else()
16
-       set(libobs_PLATFORM_HEADERS
17
-           util/threading-posix.h)
18
-   endif()
19
+   set(libobs_PLATFORM_HEADERS
20
+       util/threading-posix.h)
21
 
22
    if(HAVE_PULSEAUDIO)
23
        set(libobs_audio_monitoring_HEADERS
24
@@ -369,7 +357,6 @@
25
 set(libobs_util_HEADERS
26
    util/curl/curl-helper.h
27
    util/sse-intrin.h
28
-   util/sse2neon.h
29
    util/array-serializer.h
30
    util/file-serializer.h
31
    util/utf8.h
32
@@ -419,6 +406,20 @@
33
    obs-video-gpu-encode.c
34
    obs-video.c)
35
 set(libobs_libobs_HEADERS
36
+   util/simde/check.h
37
+   util/simde/debug-trap.h
38
+   util/simde/hedley.h
39
+   util/simde/simde-align.h
40
+   util/simde/simde-arch.h
41
+   util/simde/simde-common.h
42
+   util/simde/simde-constify.h
43
+   util/simde/simde-detect-clang.h
44
+   util/simde/simde-diagnostic.h
45
+   util/simde/simde-features.h
46
+   util/simde/simde-math.h
47
+   util/simde/x86/mmx.h
48
+   util/simde/x86/sse2.h
49
+   util/simde/x86/sse.h
50
    ${libobs_PLATFORM_HEADERS}
51
    obs-audio-controls.h
52
    obs-defs.h
53
@@ -499,6 +500,10 @@
54
    PUBLIC
55
        HAVE_OBSCONFIG_H)
56
 
57
+target_compile_definitions(libobs
58
+   PUBLIC
59
+   ${ARCH_SIMD_DEFINES})
60
+
61
 target_compile_options(libobs
62
    PUBLIC
63
    ${ARCH_SIMD_FLAGS})
64
obs-studio-26.1.0.tar.xz/libobs/media-io/media-remux.c -> obs-studio-26.1.1.tar.xz/libobs/media-io/media-remux.c Changed
10
 
1
@@ -227,7 +227,7 @@
2
 
3
            /* Treat "Invalid data found when processing input" and
4
             * "Invalid argument" as non-fatal */
5
-           if (ret == AVERROR_INVALIDDATA || ret == EINVAL)
6
+           if (ret == AVERROR_INVALIDDATA || ret == -EINVAL)
7
                continue;
8
 
9
            break;
10
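Editor's note: the one-character fix above matters because FFmpeg reports errors as negative values. AVERROR(EINVAL) expands to -EINVAL for ordinary errno values, so comparing the return code against a bare positive EINVAL could never match and the "treat as non-fatal" branch was effectively dead. A small illustrative C snippet:

    #include <errno.h>
    #include <libavutil/error.h>

    /* FFmpeg convention: zero or positive means success/progress,
     * errors are negative. AVERROR(e) is -e for plain errno values,
     * so the two checks below are equivalent, while "ret == EINVAL"
     * would never be true for an error return. */
    static int is_nonfatal(int ret)
    {
        return ret == AVERROR_INVALIDDATA || ret == AVERROR(EINVAL);
        /* ...which is the same test as: ret == -EINVAL */
    }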
obs-studio-26.1.0.tar.xz/libobs/obs-config.h -> obs-studio-26.1.1.tar.xz/libobs/obs-config.h Changed
10
 
1
@@ -41,7 +41,7 @@
2
  *
3
  * Reset to zero each major or minor version
4
  */
5
-#define LIBOBS_API_PATCH_VER 0
6
+#define LIBOBS_API_PATCH_VER 1
7
 
8
 #define MAKE_SEMANTIC_VERSION(major, minor, patch) \
9
    ((major << 24) | (minor << 16) | patch)
10
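Editor's note: the patch-level bump feeds MAKE_SEMANTIC_VERSION, which packs major/minor/patch into a single 32-bit value so version checks reduce to integer comparisons. A quick illustration using only the macro shown above:

    #include <stdio.h>

    #define MAKE_SEMANTIC_VERSION(major, minor, patch) \
        ((major << 24) | (minor << 16) | patch)

    int main(void)
    {
        /* 26.1.1 -> 0x1A010001; any 26.1.x compares greater than 26.1.0 */
        unsigned v = MAKE_SEMANTIC_VERSION(26, 1, 1);
        printf("0x%08X\n", v);
        return v >= MAKE_SEMANTIC_VERSION(26, 1, 0) ? 0 : 1;
    }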
obs-studio-26.1.0.tar.xz/libobs/obs-scene.c -> obs-studio-26.1.1.tar.xz/libobs/obs-scene.c Changed
65
 
1
@@ -951,19 +951,12 @@
2
 }
3
 
4
 static void apply_scene_item_audio_actions(struct obs_scene_item *item,
5
-                      float **p_buf, uint64_t ts,
6
+                      float *buf, uint64_t ts,
7
                       size_t sample_rate)
8
 {
9
    bool cur_visible = item->visible;
10
    uint64_t frame_num = 0;
11
    size_t deref_count = 0;
12
-   float *buf = NULL;
13
-
14
-   if (p_buf) {
15
-       if (!*p_buf)
16
-           *p_buf = malloc(AUDIO_OUTPUT_FRAMES * sizeof(float));
17
-       buf = *p_buf;
18
-   }
19
 
20
    pthread_mutex_lock(&item->actions_mutex);
21
 
22
@@ -1010,7 +1003,7 @@
23
    }
24
 }
25
 
26
-static bool apply_scene_item_volume(struct obs_scene_item *item, float **buf,
27
+static bool apply_scene_item_volume(struct obs_scene_item *item, float *buf,
28
                    uint64_t ts, size_t sample_rate)
29
 {
30
    bool actions_pending;
31
@@ -1074,7 +1067,7 @@
32
                   size_t sample_rate)
33
 {
34
    uint64_t timestamp = 0;
35
-   float *buf = NULL;
36
+   float buf[AUDIO_OUTPUT_FRAMES];
37
    struct obs_source_audio_mix child_audio;
38
    struct obs_scene *scene = data;
39
    struct obs_scene_item *item;
40
@@ -1113,7 +1106,7 @@
41
        size_t pos, count;
42
        bool apply_buf;
43
 
44
-       apply_buf = apply_scene_item_volume(item, &buf, timestamp,
45
+       apply_buf = apply_scene_item_volume(item, buf, timestamp,
46
                            sample_rate);
47
 
48
        if (obs_source_audio_pending(item->source)) {
49
@@ -1159,7 +1152,6 @@
50
    *ts_out = timestamp;
51
    audio_unlock(scene);
52
 
53
-   free(buf);
54
    return true;
55
 }
56
 
57
@@ -1300,6 +1292,7 @@
58
    }
59
 
60
    obs_sceneitem_set_crop(dst, &src->crop);
61
+   obs_sceneitem_set_locked(dst, src->locked);
62
 
63
    if (defer_texture_update) {
64
        os_atomic_set_bool(&dst->update_transform, true);
65
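Editor's note: besides removing the heap allocation from the audio path, the hunk above fixes scene-item duplication so the per-item "locked" flag travels with the copy, just as the crop already did. From plugin code the equivalent would look roughly like the sketch below; this uses the public accessors rather than the internal struct field the duplication code touches directly, so treat it as an approximation.

    #include <obs.h>

    /* Copy the user-visible lock state from one scene item to another.
     * obs_sceneitem_locked()/obs_sceneitem_set_locked() are the public
     * accessors for the flag the duplication path now preserves. */
    static void copy_lock_state(obs_sceneitem_t *dst, obs_sceneitem_t *src)
    {
        obs_sceneitem_set_locked(dst, obs_sceneitem_locked(src));
    }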
obs-studio-26.1.0.tar.xz/libobs/obs-source.c -> obs-studio-26.1.1.tar.xz/libobs/obs-source.c Changed
19
 
1
@@ -4532,7 +4532,7 @@
2
 static void apply_audio_actions(obs_source_t *source, size_t channels,
3
                size_t sample_rate)
4
 {
5
-   float *vol_data = malloc(sizeof(float) * AUDIO_OUTPUT_FRAMES);
6
+   float vol_data[AUDIO_OUTPUT_FRAMES];
7
    float cur_vol = get_source_volume(source, source->audio_ts);
8
    size_t frame_num = 0;
9
 
10
@@ -4573,8 +4573,6 @@
11
        if ((source->audio_mixers & (1 << mix)) != 0)
12
            multiply_vol_data(source, mix, channels, vol_data);
13
    }
14
-
15
-   free(vol_data);
16
 }
17
 
18
 static void apply_audio_volume(obs_source_t *source, uint32_t mixers,
19
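Editor's note: this hunk and the obs-scene.c one above replace a per-call malloc/free pair with a fixed-size local array. That is safe because the length is the compile-time constant AUDIO_OUTPUT_FRAMES (1024 in current libobs, i.e. 4 KB of floats) and the buffer never outlives the call, so the hot audio path avoids allocator traffic and the failure/leak cases disappear. A generic sketch of the pattern with hypothetical names:

    #include <stddef.h>

    #define FRAMES 1024  /* stands in for AUDIO_OUTPUT_FRAMES */

    /* Before: float *vol = malloc(sizeof(float) * FRAMES); ... free(vol);
     * After: a stack buffer of known, small size does the same job. */
    static void apply_gain(float *samples, size_t count, float gain)
    {
        float vol[FRAMES];

        if (count > FRAMES)
            count = FRAMES;

        for (size_t i = 0; i < count; i++)
            vol[i] = gain;
        for (size_t i = 0; i < count; i++)
            samples[i] *= vol[i];
    }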
obs-studio-26.1.0.tar.xz/libobs/obsconfig.h.in -> obs-studio-26.1.1.tar.xz/libobs/obsconfig.h.in Changed
9
 
1
@@ -18,7 +18,6 @@
2
 #define HAVE_DBUS @HAVE_DBUS@
3
 #define HAVE_PULSEAUDIO @HAVE_PULSEAUDIO@
4
 #define USE_XINPUT @USE_XINPUT@
5
-#define NEEDS_SIMDE @NEEDS_SIMDE@
6
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE_6L 6
7
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE_7GE 7
8
 #define LIBOBS_IMAGEMAGICK_DIR_STYLE @LIBOBS_IMAGEMAGICK_DIR_STYLE@
9
obs-studio-26.1.0.tar.xz/libobs/util/simde/README.libobs -> obs-studio-26.1.1.tar.xz/libobs/util/simde/README.libobs Changed
10
 
1
@@ -1,5 +1,5 @@
2
-This is a slightly modified version of https://github.com/nemequ/simde/commit/cafec4b952fa5a31a51a10326f97c2e7c9067771
3
-sse{,2}.h and mmx.h was moved down from the original "x86" subdirectory,
4
-subsequently the '#include "../simde-common.h"' line in mmx.h was changed to '#include "simde-common.h"'
5
+This is a slightly modified version of the simde directory in
6
+https://github.com/simd-everywhere/simde/commit/c3d7abfaba6729a8b11d09a314b34a4db628911d
7
+Unused files have removed.
8
 
9
 Then the code was reformatted using the "formatcode.sh" script in the root of this repository.
10
obs-studio-26.1.0.tar.xz/libobs/util/simde/check.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/check.h Changed
9
 
1
@@ -18,6 +18,7 @@
2
 #endif
3
 
4
 #include "hedley.h"
5
+#include "simde-diagnostic.h"
6
 #include <stdint.h>
7
 
8
 #if !defined(_WIN32)
9
obs-studio-26.1.0.tar.xz/libobs/util/simde/hedley.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/hedley.h Changed
564
 
1
@@ -10,11 +10,11 @@
2
  * SPDX-License-Identifier: CC0-1.0
3
  */
4
 
5
-#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 12)
6
+#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 14)
7
 #if defined(HEDLEY_VERSION)
8
 #undef HEDLEY_VERSION
9
 #endif
10
-#define HEDLEY_VERSION 12
11
+#define HEDLEY_VERSION 14
12
 
13
 #if defined(HEDLEY_STRINGIFY_EX)
14
 #undef HEDLEY_STRINGIFY_EX
15
@@ -36,6 +36,16 @@
16
 #endif
17
 #define HEDLEY_CONCAT(a, b) HEDLEY_CONCAT_EX(a, b)
18
 
19
+#if defined(HEDLEY_CONCAT3_EX)
20
+#undef HEDLEY_CONCAT3_EX
21
+#endif
22
+#define HEDLEY_CONCAT3_EX(a, b, c) a##b##c
23
+
24
+#if defined(HEDLEY_CONCAT3)
25
+#undef HEDLEY_CONCAT3
26
+#endif
27
+#define HEDLEY_CONCAT3(a, b, c) HEDLEY_CONCAT3_EX(a, b, c)
28
+
29
 #if defined(HEDLEY_VERSION_ENCODE)
30
 #undef HEDLEY_VERSION_ENCODE
31
 #endif
32
@@ -80,17 +90,17 @@
33
 #if defined(HEDLEY_MSVC_VERSION)
34
 #undef HEDLEY_MSVC_VERSION
35
 #endif
36
-#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000)
37
+#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL)
38
 #define HEDLEY_MSVC_VERSION                                        \
39
    HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000,            \
40
                  (_MSC_FULL_VER % 10000000) / 100000, \
41
                  (_MSC_FULL_VER % 100000) / 100)
42
-#elif defined(_MSC_FULL_VER)
43
+#elif defined(_MSC_FULL_VER) && !defined(__ICL)
44
 #define HEDLEY_MSVC_VERSION                                      \
45
    HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000,           \
46
                  (_MSC_FULL_VER % 1000000) / 10000, \
47
                  (_MSC_FULL_VER % 10000) / 10)
48
-#elif defined(_MSC_VER)
49
+#elif defined(_MSC_VER) && !defined(__ICL)
50
 #define HEDLEY_MSVC_VERSION \
51
    HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0)
52
 #endif
53
@@ -98,7 +108,7 @@
54
 #if defined(HEDLEY_MSVC_VERSION_CHECK)
55
 #undef HEDLEY_MSVC_VERSION_CHECK
56
 #endif
57
-#if !defined(_MSC_VER)
58
+#if !defined(HEDLEY_MSVC_VERSION)
59
 #define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) (0)
60
 #elif defined(_MSC_VER) && (_MSC_VER >= 1400)
61
 #define HEDLEY_MSVC_VERSION_CHECK(major, minor, patch) \
62
@@ -114,11 +124,12 @@
63
 #if defined(HEDLEY_INTEL_VERSION)
64
 #undef HEDLEY_INTEL_VERSION
65
 #endif
66
-#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE)
67
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && \
68
+   !defined(__ICL)
69
 #define HEDLEY_INTEL_VERSION                                                  \
70
    HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, \
71
                  __INTEL_COMPILER_UPDATE)
72
-#elif defined(__INTEL_COMPILER)
73
+#elif defined(__INTEL_COMPILER) && !defined(__ICL)
74
 #define HEDLEY_INTEL_VERSION \
75
    HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0)
76
 #endif
77
@@ -133,6 +144,25 @@
78
 #define HEDLEY_INTEL_VERSION_CHECK(major, minor, patch) (0)
79
 #endif
80
 
81
+#if defined(HEDLEY_INTEL_CL_VERSION)
82
+#undef HEDLEY_INTEL_CL_VERSION
83
+#endif
84
+#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && \
85
+   defined(__ICL)
86
+#define HEDLEY_INTEL_CL_VERSION \
87
+   HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0)
88
+#endif
89
+
90
+#if defined(HEDLEY_INTEL_CL_VERSION_CHECK)
91
+#undef HEDLEY_INTEL_CL_VERSION_CHECK
92
+#endif
93
+#if defined(HEDLEY_INTEL_CL_VERSION)
94
+#define HEDLEY_INTEL_CL_VERSION_CHECK(major, minor, patch) \
95
+   (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch))
96
+#else
97
+#define HEDLEY_INTEL_CL_VERSION_CHECK(major, minor, patch) (0)
98
+#endif
99
+
100
 #if defined(HEDLEY_PGI_VERSION)
101
 #undef HEDLEY_PGI_VERSION
102
 #endif
103
@@ -788,6 +818,68 @@
104
    HEDLEY_GCC_VERSION_CHECK(major, minor, patch)
105
 #endif
106
 
107
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
108
+   defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||  \
109
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                     \
110
+   HEDLEY_IAR_VERSION_CHECK(8, 0, 0) ||                        \
111
+   HEDLEY_PGI_VERSION_CHECK(18, 4, 0) ||                       \
112
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                        \
113
+   HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                       \
114
+   HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) ||                   \
115
+   HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) ||                   \
116
+   HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) ||                  \
117
+   HEDLEY_TI_CL6X_VERSION_CHECK(7, 0, 0) ||                    \
118
+   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                    \
119
+   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) ||                   \
120
+   HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) ||                       \
121
+   HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) ||                     \
122
+   HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) ||                     \
123
+   (HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR))
124
+#define HEDLEY_PRAGMA(value) _Pragma(#value)
125
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
126
+#define HEDLEY_PRAGMA(value) __pragma(value)
127
+#else
128
+#define HEDLEY_PRAGMA(value)
129
+#endif
130
+
131
+#if defined(HEDLEY_DIAGNOSTIC_PUSH)
132
+#undef HEDLEY_DIAGNOSTIC_PUSH
133
+#endif
134
+#if defined(HEDLEY_DIAGNOSTIC_POP)
135
+#undef HEDLEY_DIAGNOSTIC_POP
136
+#endif
137
+#if defined(__clang__)
138
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
139
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
140
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
141
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
142
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
143
+#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
144
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
145
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
146
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) || \
147
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
148
+#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
149
+#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
150
+#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
151
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
152
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
153
+#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||       \
154
+   HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \
155
+   HEDLEY_TI_CL430_VERSION_CHECK(4, 4, 0) || \
156
+   HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) ||  \
157
+   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||  \
158
+   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
159
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
160
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
161
+#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
162
+#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
163
+#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
164
+#else
165
+#define HEDLEY_DIAGNOSTIC_PUSH
166
+#define HEDLEY_DIAGNOSTIC_POP
167
+#endif
168
+
169
 /* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for
170
    HEDLEY INTERNAL USE ONLY.  API subject to change without notice. */
171
 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_)
172
@@ -796,11 +888,20 @@
173
 #if defined(__cplusplus)
174
 #if HEDLEY_HAS_WARNING("-Wc++98-compat")
175
 #if HEDLEY_HAS_WARNING("-Wc++17-extensions")
176
+#if HEDLEY_HAS_WARNING("-Wc++1z-extensions")
177
+#define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)                  \
178
+   HEDLEY_DIAGNOSTIC_PUSH                                             \
179
+   _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") _Pragma(    \
180
+       "clang diagnostic ignored \"-Wc++17-extensions\"")         \
181
+       _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \
182
+           xpr HEDLEY_DIAGNOSTIC_POP
183
+#else
184
 #define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)                  \
185
    HEDLEY_DIAGNOSTIC_PUSH                                             \
186
    _Pragma("clang diagnostic ignored \"-Wc++98-compat\"")             \
187
        _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \
188
            xpr HEDLEY_DIAGNOSTIC_POP
189
+#endif
190
 #else
191
 #define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr)      \
192
    HEDLEY_DIAGNOSTIC_PUSH                                 \
193
@@ -861,74 +962,14 @@
194
 #elif HEDLEY_IAR_VERSION_CHECK(8, 3, 0)
195
 #define HEDLEY_CPP_CAST(T, expr) \
196
    HEDLEY_DIAGNOSTIC_PUSH   \
197
-   _Pragma("diag_suppress=Pe137") HEDLEY_DIAGNOSTIC_POP #else
198
+   _Pragma("diag_suppress=Pe137") HEDLEY_DIAGNOSTIC_POP
199
+#else
200
 #define HEDLEY_CPP_CAST(T, expr) ((T)(expr))
201
 #endif
202
 #else
203
 #define HEDLEY_CPP_CAST(T, expr) (expr)
204
 #endif
205
 
206
-#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \
207
-   defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||  \
208
-   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                     \
209
-   HEDLEY_IAR_VERSION_CHECK(8, 0, 0) ||                        \
210
-   HEDLEY_PGI_VERSION_CHECK(18, 4, 0) ||                       \
211
-   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                        \
212
-   HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                       \
213
-   HEDLEY_TI_ARMCL_VERSION_CHECK(4, 7, 0) ||                   \
214
-   HEDLEY_TI_CL430_VERSION_CHECK(2, 0, 1) ||                   \
215
-   HEDLEY_TI_CL2000_VERSION_CHECK(6, 1, 0) ||                  \
216
-   HEDLEY_TI_CL6X_VERSION_CHECK(7, 0, 0) ||                    \
217
-   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                    \
218
-   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0) ||                   \
219
-   HEDLEY_CRAY_VERSION_CHECK(5, 0, 0) ||                       \
220
-   HEDLEY_TINYC_VERSION_CHECK(0, 9, 17) ||                     \
221
-   HEDLEY_SUNPRO_VERSION_CHECK(8, 0, 0) ||                     \
222
-   (HEDLEY_IBM_VERSION_CHECK(10, 1, 0) && defined(__C99_PRAGMA_OPERATOR))
223
-#define HEDLEY_PRAGMA(value) _Pragma(#value)
224
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
225
-#define HEDLEY_PRAGMA(value) __pragma(value)
226
-#else
227
-#define HEDLEY_PRAGMA(value)
228
-#endif
229
-
230
-#if defined(HEDLEY_DIAGNOSTIC_PUSH)
231
-#undef HEDLEY_DIAGNOSTIC_PUSH
232
-#endif
233
-#if defined(HEDLEY_DIAGNOSTIC_POP)
234
-#undef HEDLEY_DIAGNOSTIC_POP
235
-#endif
236
-#if defined(__clang__)
237
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push")
238
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop")
239
-#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
240
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
241
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
242
-#elif HEDLEY_GCC_VERSION_CHECK(4, 6, 0)
243
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push")
244
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop")
245
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
246
-#define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push))
247
-#define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop))
248
-#elif HEDLEY_ARM_VERSION_CHECK(5, 6, 0)
249
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push")
250
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("pop")
251
-#elif HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||       \
252
-   HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) || \
253
-   HEDLEY_TI_CL430_VERSION_CHECK(4, 4, 0) || \
254
-   HEDLEY_TI_CL6X_VERSION_CHECK(8, 1, 0) ||  \
255
-   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||  \
256
-   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
257
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push")
258
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop")
259
-#elif HEDLEY_PELLES_VERSION_CHECK(2, 90, 0)
260
-#define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)")
261
-#define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)")
262
-#else
263
-#define HEDLEY_DIAGNOSTIC_PUSH
264
-#define HEDLEY_DIAGNOSTIC_POP
265
-#endif
266
-
267
 #if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED)
268
 #undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
269
 #endif
270
@@ -938,6 +979,12 @@
271
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
272
 #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
273
    _Pragma("warning(disable:1478 1786)")
274
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
275
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
276
+   __pragma(warning(disable : 1478 1786))
277
+#elif HEDLEY_PGI_VERSION_CHECK(20, 7, 0)
278
+#define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED \
279
+   _Pragma("diag_suppress 1215,1216,1444,1445")
280
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
281
 #define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444")
282
 #elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
283
@@ -985,6 +1032,9 @@
284
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
285
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
286
    _Pragma("warning(disable:161)")
287
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
288
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \
289
+   __pragma(warning(disable : 161))
290
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
291
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675")
292
 #elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
293
@@ -1018,9 +1068,15 @@
294
 #elif HEDLEY_INTEL_VERSION_CHECK(17, 0, 0)
295
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
296
    _Pragma("warning(disable:1292)")
297
+#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
298
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
299
+   __pragma(warning(disable : 1292))
300
 #elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)
301
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
302
    __pragma(warning(disable : 5030))
303
+#elif HEDLEY_PGI_VERSION_CHECK(20, 7, 0)
304
+#define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
305
+   _Pragma("diag_suppress 1097,1098")
306
 #elif HEDLEY_PGI_VERSION_CHECK(17, 10, 0)
307
 #define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES \
308
    _Pragma("diag_suppress 1097")
309
@@ -1061,13 +1117,11 @@
310
 #if defined(HEDLEY_DEPRECATED_FOR)
311
 #undef HEDLEY_DEPRECATED_FOR
312
 #endif
313
-#if defined(__cplusplus) && (__cplusplus >= 201402L)
314
-#define HEDLEY_DEPRECATED(since)                      \
315
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
316
-       [[deprecated("Since " #since)]])
317
-#define HEDLEY_DEPRECATED_FOR(since, replacement)     \
318
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
319
-       [[deprecated("Since " #since "; use " #replacement)]])
320
+#if HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) || \
321
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
322
+#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since))
323
+#define HEDLEY_DEPRECATED_FOR(since, replacement) \
324
+   __declspec(deprecated("Since " #since "; use " #replacement))
325
 #elif HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) || \
326
    HEDLEY_GCC_VERSION_CHECK(4, 5, 0) ||                     \
327
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                  \
328
@@ -1083,6 +1137,13 @@
329
    __attribute__((__deprecated__("Since " #since)))
330
 #define HEDLEY_DEPRECATED_FOR(since, replacement) \
331
    __attribute__((__deprecated__("Since " #since "; use " #replacement)))
332
+#elif defined(__cplusplus) && (__cplusplus >= 201402L)
333
+#define HEDLEY_DEPRECATED(since)                      \
334
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
335
+       [[deprecated("Since " #since)]])
336
+#define HEDLEY_DEPRECATED_FOR(since, replacement)     \
337
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
338
+       [[deprecated("Since " #since "; use " #replacement)]])
339
 #elif HEDLEY_HAS_ATTRIBUTE(deprecated) || HEDLEY_GCC_VERSION_CHECK(3, 1, 0) || \
340
    HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                   \
341
    HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                                  \
342
@@ -1103,12 +1164,9 @@
343
 #define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__))
344
 #define HEDLEY_DEPRECATED_FOR(since, replacement) \
345
    __attribute__((__deprecated__))
346
-#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
347
-#define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " #since))
348
-#define HEDLEY_DEPRECATED_FOR(since, replacement) \
349
-   __declspec(deprecated("Since " #since "; use " #replacement))
350
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
351
-   HEDLEY_PELLES_VERSION_CHECK(6, 50, 0)
352
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) ||    \
353
+   HEDLEY_PELLES_VERSION_CHECK(6, 50, 0) || \
354
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
355
 #define HEDLEY_DEPRECATED(since) __declspec(deprecated)
356
 #define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated)
357
 #elif HEDLEY_IAR_VERSION_CHECK(8, 0, 0)
358
@@ -1136,17 +1194,7 @@
359
 #if defined(HEDLEY_WARN_UNUSED_RESULT_MSG)
360
 #undef HEDLEY_WARN_UNUSED_RESULT_MSG
361
 #endif
362
-#if (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
363
-#define HEDLEY_WARN_UNUSED_RESULT \
364
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
365
-#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
366
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
367
-#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
368
-#define HEDLEY_WARN_UNUSED_RESULT \
369
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
370
-#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
371
-   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
372
-#elif HEDLEY_HAS_ATTRIBUTE(warn_unused_result) ||                          \
373
+#if HEDLEY_HAS_ATTRIBUTE(warn_unused_result) ||                            \
374
    HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                               \
375
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
376
    HEDLEY_TI_VERSION_CHECK(15, 12, 0) ||                              \
377
@@ -1169,6 +1217,16 @@
378
 #define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
379
 #define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
380
    __attribute__((__warn_unused_result__))
381
+#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L)
382
+#define HEDLEY_WARN_UNUSED_RESULT \
383
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
384
+#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
385
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]])
386
+#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard)
387
+#define HEDLEY_WARN_UNUSED_RESULT \
388
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
389
+#define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) \
390
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]])
391
 #elif defined(_Check_return_) /* SAL */
392
 #define HEDLEY_WARN_UNUSED_RESULT _Check_return_
393
 #define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_
394
@@ -1222,7 +1280,8 @@
395
 #define HEDLEY_NO_RETURN __attribute__((__noreturn__))
396
 #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
397
 #define HEDLEY_NO_RETURN _Pragma("does_not_return")
398
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
399
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
400
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
401
 #define HEDLEY_NO_RETURN __declspec(noreturn)
402
 #elif HEDLEY_TI_CL6X_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
403
 #define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;")
404
@@ -1252,7 +1311,9 @@
405
 #if defined(HEDLEY_ASSUME)
406
 #undef HEDLEY_ASSUME
407
 #endif
408
-#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
409
+#if HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) ||     \
410
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
411
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
412
 #define HEDLEY_ASSUME(expr) __assume(expr)
413
 #elif HEDLEY_HAS_BUILTIN(__builtin_assume)
414
 #define HEDLEY_ASSUME(expr) __builtin_assume(expr)
415
@@ -1389,7 +1450,8 @@
416
 #if HEDLEY_HAS_BUILTIN(__builtin_unpredictable)
417
 #define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr))
418
 #endif
419
-#if HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) || \
420
+#if (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && \
421
+     !defined(HEDLEY_PGI_VERSION)) ||                         \
422
    HEDLEY_GCC_VERSION_CHECK(9, 0, 0)
423
 #define HEDLEY_PREDICT(expr, value, probability) \
424
    __builtin_expect_with_probability((expr), (value), (probability))
425
@@ -1399,7 +1461,8 @@
426
    __builtin_expect_with_probability(!!(expr), 0, (probability))
427
 #define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1)
428
 #define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
429
-#elif HEDLEY_HAS_BUILTIN(__builtin_expect) ||                              \
430
+#elif (HEDLEY_HAS_BUILTIN(__builtin_expect) &&                             \
431
+       !defined(HEDLEY_INTEL_CL_VERSION)) ||                               \
432
    HEDLEY_GCC_VERSION_CHECK(3, 0, 0) ||                               \
433
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
434
    (HEDLEY_SUNPRO_VERSION_CHECK(5, 15, 0) && defined(__cplusplus)) || \
435
@@ -1476,7 +1539,8 @@
436
 #define HEDLEY_MALLOC __attribute__((__malloc__))
437
 #elif HEDLEY_SUNPRO_VERSION_CHECK(5, 10, 0)
438
 #define HEDLEY_MALLOC _Pragma("returns_new_memory")
439
-#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0)
440
+#elif HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) || \
441
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
442
 #define HEDLEY_MALLOC __declspec(restrict)
443
 #else
444
 #define HEDLEY_MALLOC
445
@@ -1557,6 +1621,7 @@
446
 #elif HEDLEY_GCC_VERSION_CHECK(3, 1, 0) ||                                 \
447
    HEDLEY_MSVC_VERSION_CHECK(14, 0, 0) ||                             \
448
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
449
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) ||                       \
450
    HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
451
    HEDLEY_IBM_VERSION_CHECK(10, 1, 0) ||                              \
452
    HEDLEY_PGI_VERSION_CHECK(17, 10, 0) ||                             \
453
@@ -1581,13 +1646,14 @@
454
 #define HEDLEY_INLINE inline
455
 #elif defined(HEDLEY_GCC_VERSION) || HEDLEY_ARM_VERSION_CHECK(6, 2, 0)
456
 #define HEDLEY_INLINE __inline__
457
-#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) ||       \
458
-   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||       \
459
-   HEDLEY_TI_ARMCL_VERSION_CHECK(5, 1, 0) ||  \
460
-   HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) ||  \
461
-   HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) || \
462
-   HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||   \
463
-   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||   \
464
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) ||         \
465
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) || \
466
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||         \
467
+   HEDLEY_TI_ARMCL_VERSION_CHECK(5, 1, 0) ||    \
468
+   HEDLEY_TI_CL430_VERSION_CHECK(3, 1, 0) ||    \
469
+   HEDLEY_TI_CL2000_VERSION_CHECK(6, 2, 0) ||   \
470
+   HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||     \
471
+   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||     \
472
    HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
473
 #define HEDLEY_INLINE __inline
474
 #else
475
@@ -1619,7 +1685,8 @@
476
    HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||    \
477
    HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
478
 #define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE
479
-#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0)
480
+#elif HEDLEY_MSVC_VERSION_CHECK(12, 0, 0) || \
481
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
482
 #define HEDLEY_ALWAYS_INLINE __forceinline
483
 #elif defined(__cplusplus) && (HEDLEY_TI_ARMCL_VERSION_CHECK(5, 2, 0) ||  \
484
                   HEDLEY_TI_CL430_VERSION_CHECK(4, 3, 0) ||  \
485
@@ -1658,7 +1725,8 @@
486
    HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                           \
487
    HEDLEY_TI_CLPRU_VERSION_CHECK(2, 1, 0)
488
 #define HEDLEY_NEVER_INLINE __attribute__((__noinline__))
489
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0)
490
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 10, 0) || \
491
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
492
 #define HEDLEY_NEVER_INLINE __declspec(noinline)
493
 #elif HEDLEY_PGI_VERSION_CHECK(10, 2, 0)
494
 #define HEDLEY_NEVER_INLINE _Pragma("noinline")
495
@@ -1711,7 +1779,9 @@
496
 #if HEDLEY_HAS_ATTRIBUTE(nothrow) || HEDLEY_GCC_VERSION_CHECK(3, 3, 0) || \
497
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
498
 #define HEDLEY_NO_THROW __attribute__((__nothrow__))
499
-#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) || HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
500
+#elif HEDLEY_MSVC_VERSION_CHECK(13, 1, 0) ||         \
501
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0) || \
502
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0)
503
 #define HEDLEY_NO_THROW __declspec(nothrow)
504
 #else
505
 #define HEDLEY_NO_THROW
506
@@ -1720,8 +1790,7 @@
507
 #if defined(HEDLEY_FALL_THROUGH)
508
 #undef HEDLEY_FALL_THROUGH
509
 #endif
510
-#if HEDLEY_GNUC_HAS_ATTRIBUTE(fallthrough, 7, 0, 0) && \
511
-   !defined(HEDLEY_PGI_VERSION)
512
+#if HEDLEY_HAS_ATTRIBUTE(fallthrough) || HEDLEY_GCC_VERSION_CHECK(7, 0, 0)
513
 #define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__))
514
 #elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang, fallthrough)
515
 #define HEDLEY_FALL_THROUGH \
516
@@ -1866,12 +1935,14 @@
517
 #endif
518
 #if !defined(__cplusplus) &&                                             \
519
    ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
520
-    HEDLEY_HAS_FEATURE(c_static_assert) ||                          \
521
+    (HEDLEY_HAS_FEATURE(c_static_assert) &&                         \
522
+     !defined(HEDLEY_INTEL_CL_VERSION)) ||                          \
523
     HEDLEY_GCC_VERSION_CHECK(6, 0, 0) ||                            \
524
     HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert))
525
 #define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
526
 #elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
527
-   HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
528
+   HEDLEY_MSVC_VERSION_CHECK(16, 0, 0) ||              \
529
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
530
 #define HEDLEY_STATIC_ASSERT(expr, message)           \
531
    HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
532
        static_assert(expr, message))
533
@@ -1930,7 +2001,8 @@
534
    HEDLEY_PGI_VERSION_CHECK(18, 4, 0) || \
535
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
536
 #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg)
537
-#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0)
538
+#elif HEDLEY_MSVC_VERSION_CHECK(15, 0, 0) || \
539
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
540
 #define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg))
541
 #else
542
 #define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg)
543
@@ -1970,6 +2042,8 @@
544
 #endif
545
 #if HEDLEY_HAS_ATTRIBUTE(flag_enum)
546
 #define HEDLEY_FLAGS __attribute__((__flag_enum__))
547
+#else
548
+#define HEDLEY_FLAGS
549
 #endif
550
 
551
 #if defined(HEDLEY_FLAGS_CAST)
552
@@ -1989,8 +2063,9 @@
553
 #if defined(HEDLEY_EMPTY_BASES)
554
 #undef HEDLEY_EMPTY_BASES
555
 #endif
556
-#if HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \
557
-   !HEDLEY_MSVC_VERSION_CHECK(20, 0, 0)
558
+#if (HEDLEY_MSVC_VERSION_CHECK(19, 0, 23918) && \
559
+     !HEDLEY_MSVC_VERSION_CHECK(20, 0, 0)) ||   \
560
+   HEDLEY_INTEL_CL_VERSION_CHECK(2021, 1, 0)
561
 #define HEDLEY_EMPTY_BASES __declspec(empty_bases)
562
 #else
563
 #define HEDLEY_EMPTY_BASES
564
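Editor's note: the Hedley update above mainly teaches the version-detection macros about the Intel oneAPI "classic" Windows compiler (__ICL) and reorders a few fallbacks; the macros keep their usual shape. A small C example of how code built on these macros is typically used — the macro names are taken from the diff, the function names are made up:

    #include "hedley.h"

    HEDLEY_DEPRECATED_FOR(26.1, new_api)
    static int old_api(void) { return 0; }

    static int new_api(void) { return 1; }

    int call_both(void)
    {
        /* Locally silence the deprecation warning around the legacy
         * call, whichever compiler-specific pragma Hedley selected. */
        HEDLEY_DIAGNOSTIC_PUSH
        HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED
        int a = old_api();
        HEDLEY_DIAGNOSTIC_POP

        return a + new_api();
    }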
obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-align.h Added
483
 
1
@@ -0,0 +1,481 @@
2
+/* Alignment
3
+ * Created by Evan Nemerson <evan@nemerson.com>
4
+ *
5
+ *   To the extent possible under law, the authors have waived all
6
+ *   copyright and related or neighboring rights to this code.  For
7
+ *   details, see the Creative Commons Zero 1.0 Universal license at
8
+ *   <https://creativecommons.org/publicdomain/zero/1.0/>
9
+ *
10
+ * SPDX-License-Identifier: CC0-1.0
11
+ *
12
+ **********************************************************************
13
+ *
14
+ * This is portability layer which should help iron out some
15
+ * differences across various compilers, as well as various versions of
16
+ * C and C++.
17
+ *
18
+ * It was originally developed for SIMD Everywhere
19
+ * (<https://github.com/simd-everywhere/simde>), but since its only
20
+ * dependency is Hedley (<https://nemequ.github.io/hedley>, also CC0)
21
+ * it can easily be used in other projects, so please feel free to do
22
+ * so.
23
+ *
24
+ * If you do use this in your project, please keep a link to SIMDe in
25
+ * your code to remind you where to report any bugs and/or check for
26
+ * updated versions.
27
+ *
28
+ * # API Overview
29
+ *
30
+ * The API has several parts, and most macros have a few variations.
31
+ * There are APIs for declaring aligned fields/variables, optimization
32
+ * hints, and run-time alignment checks.
33
+ *
34
+ * Briefly, macros ending with "_TO" take numeric values and are great
35
+ * when you know the value you would like to use.  Macros ending with
36
+ * "_LIKE", on the other hand, accept a type and are used when you want
37
+ * to use the alignment of a type instead of hardcoding a value.
38
+ *
39
+ * Documentation for each section of the API is inline.
40
+ *
41
+ * True to form, MSVC is the main problem and imposes several
42
+ * limitations on the effectiveness of the APIs.  Detailed descriptions
43
+ * of the limitations of each macro are inline, but in general:
44
+ *
45
+ *  * On C11+ or C++11+, code written using this API will work.  The
46
+ *    ASSUME macros may or may not generate a hint to the compiler, but
47
+ *    that is only an optimization issue and will not actually cause
48
+ *    failures.
49
+ *  * If you're using pretty much any compiler other than MSVC,
50
+ *    everything should basically work as well as in C11/C++11.
51
+ */
52
+
53
+#if !defined(SIMDE_ALIGN_H)
54
+#define SIMDE_ALIGN_H
55
+
56
+#include "hedley.h"
57
+
58
+/* I know this seems a little silly, but some non-hosted compilers
59
+ * don't have stddef.h, so we try to accommodate them. */
60
+#if !defined(SIMDE_ALIGN_SIZE_T_)
61
+#if defined(__SIZE_TYPE__)
62
+#define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__
63
+#elif defined(__SIZE_T_TYPE__)
64
+#define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__
65
+#elif defined(__cplusplus)
66
+#include <cstddef>
67
+#define SIMDE_ALIGN_SIZE_T_ size_t
68
+#else
69
+#include <stddef.h>
70
+#define SIMDE_ALIGN_SIZE_T_ size_t
71
+#endif
72
+#endif
73
+
74
+#if !defined(SIMDE_ALIGN_INTPTR_T_)
75
+#if defined(__INTPTR_TYPE__)
76
+#define SIMDE_ALIGN_INTPTR_T_ __INTPTR_TYPE__
77
+#elif defined(__PTRDIFF_TYPE__)
78
+#define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_TYPE__
79
+#elif defined(__PTRDIFF_T_TYPE__)
80
+#define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_T_TYPE__
81
+#elif defined(__cplusplus)
82
+#include <cstddef>
83
+#define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
84
+#else
85
+#include <stddef.h>
86
+#define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t
87
+#endif
88
+#endif
89
+
90
+#if defined(SIMDE_ALIGN_DEBUG)
91
+#if defined(__cplusplus)
92
+#include <cstdio>
93
+#else
94
+#include <stdio.h>
95
+#endif
96
+#endif
97
+
98
+/* SIMDE_ALIGN_OF(Type)
99
+ *
100
+ * The SIMDE_ALIGN_OF macro works like alignof, or _Alignof, or
101
+ * __alignof, or __alignof__, or __ALIGNOF__, depending on the compiler.
102
+ * It isn't defined everywhere (only when the compiler has some alignof-
103
+ * like feature we can use to implement it), but it should work in most
104
+ * modern compilers, as well as C11 and C++11.
105
+ *
106
+ * If we can't find an implementation for SIMDE_ALIGN_OF then the macro
107
+ * will not be defined, so if you can handle that situation sensibly
108
+ * you may need to sprinkle some ifdefs into your code.
109
+ */
110
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
111
+   (0 && HEDLEY_HAS_FEATURE(c_alignof))
112
+#define SIMDE_ALIGN_OF(Type) _Alignof(Type)
113
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
114
+   (0 && HEDLEY_HAS_FEATURE(cxx_alignof))
115
+#define SIMDE_ALIGN_OF(Type) alignof(Type)
116
+#elif HEDLEY_GCC_VERSION_CHECK(2, 95, 0) ||                                    \
117
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                                   \
118
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                                \
119
+   HEDLEY_SUNPRO_VERSION_CHECK(5, 13, 0) ||                               \
120
+   HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                                \
121
+   HEDLEY_PGI_VERSION_CHECK(19, 10, 0) ||                                 \
122
+   HEDLEY_CRAY_VERSION_CHECK(10, 0, 0) ||                                 \
123
+   HEDLEY_TI_ARMCL_VERSION_CHECK(16, 9, 0) ||                             \
124
+   HEDLEY_TI_CL2000_VERSION_CHECK(16, 9, 0) ||                            \
125
+   HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||                               \
126
+   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                               \
127
+   HEDLEY_TI_CL430_VERSION_CHECK(16, 9, 0) ||                             \
128
+   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2) || defined(__IBM__ALIGNOF__) || \
129
+   defined(__clang__)
130
+#define SIMDE_ALIGN_OF(Type) __alignof__(Type)
131
+#elif HEDLEY_IAR_VERSION_CHECK(8, 40, 0)
132
+#define SIMDE_ALIGN_OF(Type) __ALIGNOF__(Type)
133
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0)
134
+/* Probably goes back much further, but MS takes down their old docs.
135
+   * If you can verify that this works in earlier versions please let
136
+   * me know! */
137
+#define SIMDE_ALIGN_OF(Type) __alignof(Type)
138
+#endif
139
+
140
+/* SIMDE_ALIGN_MAXIMUM:
141
+ *
142
+ * This is the maximum alignment that the compiler supports.  You can
143
+ * define the value prior to including SIMDe if necessary, but in that
144
+ * case *please* submit an issue so we can add the platform to the
145
+ * detection code.
146
+ *
147
+ * Most compilers are okay with types which are aligned beyond what
148
+ * they think is the maximum, as long as the alignment is a power
149
+ * of two.  MSVC is the exception (of course), so we need to cap the
150
+ * alignment requests at values that the implementation supports.
151
+ *
152
+ * XL C/C++ will accept values larger than 16 (which is the alignment
153
+ * of an AltiVec vector), but will not reliably align to the larger
154
+ * value, so we cap the value at 16 there.
155
+ *
156
+ * If the compiler accepts any power-of-two value within reason then
157
+ * this macro should be left undefined, and the SIMDE_ALIGN_CAP
158
+ * macro will just return the value passed to it. */
159
+#if !defined(SIMDE_ALIGN_MAXIMUM)
160
+#if defined(HEDLEY_MSVC_VERSION)
161
+#if defined(_M_IX86) || defined(_M_AMD64)
162
+#if HEDLEY_MSVC_VERSION_CHECK(19, 14, 0)
163
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 64
164
+#elif HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
165
+/* VS 2010 is really a guess based on Wikipedia; if anyone can
166
+         * test with old VS versions I'd really appreciate it. */
167
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 32
168
+#else
169
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
170
+#endif
171
+#elif defined(_M_ARM) || defined(_M_ARM64)
172
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 8
173
+#endif
174
+#elif defined(HEDLEY_IBM_VERSION)
175
+#define SIMDE_ALIGN_PLATFORM_MAXIMUM 16
176
+#endif
177
+#endif
178
+
179
+/* You can mostly ignore these; they're intended for internal use.
180
+ * If you do need to use them please let me know; if they fulfill
181
+ * a common use case I'll probably drop the trailing underscore
182
+ * and make them part of the public API. */
183
+#if defined(SIMDE_ALIGN_PLATFORM_MAXIMUM)
184
+#if SIMDE_ALIGN_PLATFORM_MAXIMUM >= 64
185
+#define SIMDE_ALIGN_64_ 64
186
+#define SIMDE_ALIGN_32_ 32
187
+#define SIMDE_ALIGN_16_ 16
188
+#define SIMDE_ALIGN_8_ 8
189
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 32
190
+#define SIMDE_ALIGN_64_ 32
191
+#define SIMDE_ALIGN_32_ 32
192
+#define SIMDE_ALIGN_16_ 16
193
+#define SIMDE_ALIGN_8_ 8
194
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 16
195
+#define SIMDE_ALIGN_64_ 16
196
+#define SIMDE_ALIGN_32_ 16
197
+#define SIMDE_ALIGN_16_ 16
198
+#define SIMDE_ALIGN_8_ 8
199
+#elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 8
200
+#define SIMDE_ALIGN_64_ 8
201
+#define SIMDE_ALIGN_32_ 8
202
+#define SIMDE_ALIGN_16_ 8
203
+#define SIMDE_ALIGN_8_ 8
204
+#else
205
+#error Max alignment expected to be >= 8
206
+#endif
207
+#else
208
+#define SIMDE_ALIGN_64_ 64
209
+#define SIMDE_ALIGN_32_ 32
210
+#define SIMDE_ALIGN_16_ 16
211
+#define SIMDE_ALIGN_8_ 8
212
+#endif
213
+
214
+/**
215
+ * SIMDE_ALIGN_CAP(Alignment)
216
+ *
217
+ * Returns the minimum of Alignment or SIMDE_ALIGN_MAXIMUM.
218
+ */
219
+#if defined(SIMDE_ALIGN_MAXIMUM)
220
+#define SIMDE_ALIGN_CAP(Alignment)                      \
221
+   (((Alignment) < (SIMDE_ALIGN_PLATFORM_MAXIMUM)) \
222
+        ? (Alignment)                          \
223
+        : (SIMDE_ALIGN_PLATFORM_MAXIMUM))
224
+#else
225
+#define SIMDE_ALIGN_CAP(Alignment) (Alignment)
226
+#endif
227
+
228
+/* SIMDE_ALIGN_TO(Alignment)
229
+ *
230
+ * SIMDE_ALIGN_TO is used to declare types or variables.  It basically
231
+ * maps to the align attribute in most compilers, the align declspec
232
+ * in MSVC, or _Alignas/alignas in C11/C++11.
233
+ *
234
+ * Example:
235
+ *
236
+ *   struct i32x4 {
237
+ *     SIMDE_ALIGN_TO(16) int32_t values[4];
238
+ *   }
239
+ *
240
+ * Limitations:
241
+ *
242
+ * MSVC requires that the Alignment parameter be numeric; you can't do
243
+ * something like `SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(int))`.  This is
244
+ * unfortunate because that's really how the LIKE macros are
245
+ * implemented, and I am not aware of a way to get anything like this
246
+ * to work without using the C11/C++11 keywords.
247
+ *
248
+ * It also means that we can't use SIMDE_ALIGN_CAP to limit the
249
+ * alignment to the value specified, which MSVC also requires, so on
250
+ * MSVC you should use the `SIMDE_ALIGN_TO_8/16/32/64` macros instead.
251
+ * They work like `SIMDE_ALIGN_TO(SIMDE_ALIGN_CAP(Alignment))` would,
252
+ * but should be safe to use on MSVC.
253
+ *
254
+ * All this is to say that, if you want your code to work on MSVC, you
255
+ * should use the SIMDE_ALIGN_TO_8/16/32/64 macros below instead of
256
+ * SIMDE_ALIGN_TO(8/16/32/64).
257
+ */
258
+#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \
259
+   HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||                              \
260
+   HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||                              \
261
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
262
+   HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||                              \
263
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
264
+   HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                            \
265
+   HEDLEY_TI_ARMCL_VERSION_CHECK(16, 9, 0) ||                         \
266
+   HEDLEY_TI_CL2000_VERSION_CHECK(16, 9, 0) ||                        \
267
+   HEDLEY_TI_CL6X_VERSION_CHECK(8, 0, 0) ||                           \
268
+   HEDLEY_TI_CL7X_VERSION_CHECK(1, 2, 0) ||                           \
269
+   HEDLEY_TI_CL430_VERSION_CHECK(16, 9, 0) ||                         \
270
+   HEDLEY_TI_CLPRU_VERSION_CHECK(2, 3, 2)
271
+#define SIMDE_ALIGN_TO(Alignment) \
272
+   __attribute__((__aligned__(SIMDE_ALIGN_CAP(Alignment))))
273
+#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))
274
+#define SIMDE_ALIGN_TO(Alignment) _Alignas(SIMDE_ALIGN_CAP(Alignment))
275
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L))
276
+#define SIMDE_ALIGN_TO(Alignment) alignas(SIMDE_ALIGN_CAP(Alignment))
277
+#elif defined(HEDLEY_MSVC_VERSION)
278
+#define SIMDE_ALIGN_TO(Alignment) __declspec(align(Alignment))
279
+/* Unfortunately MSVC can't handle __declspec(align(__alignof(Type)));
280
+   * the alignment passed to the declspec has to be an integer. */
281
+#define SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE
282
+#endif
283
+#define SIMDE_ALIGN_TO_64 SIMDE_ALIGN_TO(SIMDE_ALIGN_64_)
284
+#define SIMDE_ALIGN_TO_32 SIMDE_ALIGN_TO(SIMDE_ALIGN_32_)
285
+#define SIMDE_ALIGN_TO_16 SIMDE_ALIGN_TO(SIMDE_ALIGN_16_)
286
+#define SIMDE_ALIGN_TO_8 SIMDE_ALIGN_TO(SIMDE_ALIGN_8_)
287
+
288
+/* SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)
289
+ *
290
+ * SIMDE_ALIGN_ASSUME_TO is semantically similar to C++20's
291
+ * std::assume_aligned, or __builtin_assume_aligned.  It tells the
292
+ * compiler to assume that the provided pointer is aligned to an
293
+ * `Alignment`-byte boundary.
294
+ *
295
+ * If you define SIMDE_ALIGN_DEBUG prior to including this header then
296
+ * SIMDE_ALIGN_ASSUME_TO will turn into a runtime check.   We don't
297
+ * integrate with NDEBUG in this header, but it may be a good idea to
298
+ * put something like this in your code:
299
+ *
300
+ *   #if !defined(NDEBUG)
301
+ *     #define SIMDE_ALIGN_DEBUG
302
+ *   #endif
303
+ *   #include <.../simde-align.h>
304
+ */
305
+#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
306
+   HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
307
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)                   \
308
+   HEDLEY_REINTERPRET_CAST(                                              \
309
+       __typeof__(Pointer),                                          \
310
+       __builtin_assume_aligned(                                     \
311
+           HEDLEY_CONST_CAST(                                    \
312
+               void *, HEDLEY_REINTERPRET_CAST(const void *, \
313
+                               Pointer)),    \
314
+           Alignment))
315
+#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
316
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)           \
317
+   (__extension__({                                              \
318
+       __typeof__(Pointer) simde_assume_aligned_t_ = (Pointer); \
319
+       __assume_aligned(simde_assume_aligned_t_, Alignment); \
320
+       simde_assume_aligned_t_;                              \
321
+   }))
322
+#elif defined(__cplusplus) && (__cplusplus > 201703L)
323
+#include <memory>
324
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
325
+   std::assume_aligned<Alignment>(Pointer)
326
+#else
327
+#if defined(__cplusplus)
328
+template<typename T>
329
+HEDLEY_ALWAYS_INLINE static T *
330
+simde_align_assume_to_unchecked(T *ptr, const size_t alignment)
331
+#else
332
+HEDLEY_ALWAYS_INLINE static void *
333
+simde_align_assume_to_unchecked(void *ptr, const size_t alignment)
334
+#endif
335
+{
336
+   HEDLEY_ASSUME((HEDLEY_REINTERPRET_CAST(size_t, (ptr)) %
337
+              SIMDE_ALIGN_CAP(alignment)) == 0);
338
+   return ptr;
339
+}
340
+#if defined(__cplusplus)
341
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \
342
+   simde_align_assume_to_unchecked((Pointer), (Alignment))
343
+#else
344
+#define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)                \
345
+   simde_align_assume_to_unchecked(                                   \
346
+       HEDLEY_CONST_CAST(void *, HEDLEY_REINTERPRET_CAST(         \
347
+                         const void *, Pointer)), \
348
+       (Alignment))
349
+#endif
350
+#endif
351
+
352
+#if !defined(SIMDE_ALIGN_DEBUG)
353
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) \
354
+   SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment)
355
+#else
356
+#include <stdio.h>
357
+#if defined(__cplusplus)
358
+template<typename T>
359
+static HEDLEY_ALWAYS_INLINE T *
360
+simde_align_assume_to_checked_uncapped(T *ptr, const size_t alignment,
361
+                      const char *file, int line,
362
+                      const char *ptrname)
363
+#else
364
+static HEDLEY_ALWAYS_INLINE void *
365
+simde_align_assume_to_checked_uncapped(void *ptr, const size_t alignment,
366
+                      const char *file, int line,
367
+                      const char *ptrname)
368
+#endif
369
+{
370
+   if (HEDLEY_UNLIKELY(
371
+           (HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) %
372
+            HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_,
373
+                   SIMDE_ALIGN_CAP(alignment))) != 0)) {
374
+       fprintf(stderr,
375
+           "%s:%d: alignment check failed for `%s' (%p %% %u == %u)\n",
376
+           file, line, ptrname,
377
+           HEDLEY_REINTERPRET_CAST(const void *, ptr),
378
+           HEDLEY_STATIC_CAST(unsigned int,
379
+                      SIMDE_ALIGN_CAP(alignment)),
380
+           HEDLEY_STATIC_CAST(
381
+               unsigned int,
382
+               HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_,
383
+                           (ptr)) %
384
+                   HEDLEY_STATIC_CAST(
385
+                       SIMDE_ALIGN_INTPTR_T_,
386
+                       SIMDE_ALIGN_CAP(alignment))));
387
+   }
388
+
389
+   return ptr;
390
+}
391
+
392
+#if defined(__cplusplus)
393
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)                      \
394
+   simde_align_assume_to_checked_uncapped((Pointer), (Alignment), \
395
+                          __FILE__, __LINE__, #Pointer)
396
+#else
397
+#define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment)                          \
398
+   simde_align_assume_to_checked_uncapped(                            \
399
+       HEDLEY_CONST_CAST(void *, HEDLEY_REINTERPRET_CAST(         \
400
+                         const void *, Pointer)), \
401
+       (Alignment), __FILE__, __LINE__, #Pointer)
402
+#endif
403
+#endif
404
+
405
+/* SIMDE_ALIGN_LIKE(Type)
406
+ * SIMDE_ALIGN_LIKE_#(Type)
407
+ *
408
+ * The SIMDE_ALIGN_LIKE macros are similar to the SIMDE_ALIGN_TO macros
409
+ * except instead of an integer they take a type; basically, it's just
410
+ * a more convenient way to do something like:
411
+ *
412
+ *   SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
413
+ *
414
+ * The versions with a numeric suffix will fall back on using a numeric
415
+ * value in the event we can't use SIMDE_ALIGN_OF(Type).  This is
416
+ * mainly for MSVC, where __declspec(align()) can't handle anything
417
+ * other than hard-coded numeric values.
418
+ */
419
+#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_TO) && \
420
+   !defined(SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE)
421
+#define SIMDE_ALIGN_LIKE(Type) SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type))
422
+#define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_LIKE(Type)
423
+#define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_LIKE(Type)
424
+#define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_LIKE(Type)
425
+#define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_LIKE(Type)
426
+#else
427
+#define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_TO_64
428
+#define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_TO_32
429
+#define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_TO_16
430
+#define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_TO_8
431
+#endif
432
+
433
+/* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type)
434
+ *
435
+ * This is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a
436
+ * type instead of a numeric value. */
437
+#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO)
438
+#define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) \
439
+   SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type))
440
+#endif
441
+
442
+/* SIMDE_ALIGN_CAST(Type, Pointer)
443
+ *
444
+ * SIMDE_ALIGN_CAST is like C++'s reinterpret_cast, but it will try
445
+ * to silence warnings that some compilers may produce if you try
446
+ * to assign to a type with increased alignment requirements.
447
+ *
448
+ * Note that it does *not* actually attempt to tell the compiler that
449
+ * the pointer is aligned like the destination should be; that's the
450
+ * job of the next macro.  This macro is necessary for stupid APIs
451
+ * like _mm_loadu_si128 where the input is a __m128i* but the function
452
+ * is specifically for data which isn't necessarily aligned to
453
+ * _Alignof(__m128i).
454
+ */
455
+#if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || \
456
+   HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
457
+#define SIMDE_ALIGN_CAST(Type, Pointer)                                 \
458
+   (__extension__({                                                \
459
+       HEDLEY_DIAGNOSTIC_PUSH                                  \
460
+       _Pragma("GCC diagnostic ignored \"-Wcast-align\"")      \
461
+           Type simde_r_ =                                 \
462
+               HEDLEY_REINTERPRET_CAST(Type, Pointer); \
463
+       HEDLEY_DIAGNOSTIC_POP                                   \
464
+       simde_r_;                                               \
465
+   }))
466
+#else
467
+#define SIMDE_ALIGN_CAST(Type, Pointer) HEDLEY_REINTERPRET_CAST(Type, Pointer)
468
+#endif
469
+
470
+/* SIMDE_ALIGN_ASSUME_CAST(Type, Pointer)
471
+ *
472
+ * This is sort of like a combination of a reinterpret_cast and a
473
+ * SIMDE_ALIGN_ASSUME_LIKE.  It uses SIMDE_ALIGN_ASSUME_LIKE to tell
474
+ * the compiler that the pointer is aligned like the specified type
475
+ * and casts the pointer to the specified type while suppressing any
476
+ * warnings from the compiler about casting to a type with greater
477
+ * alignment requirements.
478
+ */
479
+#define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) \
480
+   SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type)
481
+
482
+#endif /* !defined(SIMDE_ALIGN_H) */
483
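
simde-align.h is a new header; the simde-common.h diff further down removes the old SIMDE_ALIGN/SIMDE_ALIGN_AS/SIMDE_ASSUME_ALIGNED helpers and includes this file instead. A minimal sketch of the new API, assuming a compiler with C11 _Alignas or __attribute__((aligned)) support (the example_* names are illustrative, not part of SIMDe):

    #include <stdint.h>
    #include "simde-align.h"

    /* 16-byte aligned storage; the numeric _16 variant is what works on MSVC,
     * where __declspec(align()) cannot take SIMDE_ALIGN_OF(...). */
    typedef struct {
        SIMDE_ALIGN_TO_16 int32_t values[4];
    } example_i32x4;

    static int32_t example_first_lane(void *p)
    {
        /* Cast, then promise the compiler the pointer is aligned like the type. */
        example_i32x4 *v = SIMDE_ALIGN_ASSUME_CAST(example_i32x4 *, p);
        return v->values[0];
    }
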
obs-studio-26.1.0.tar.xz/libobs/util/simde/simde-arch.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-arch.h Changed
74
 
1
@@ -27,14 +27,14 @@
2
  * an undefined macro being used (e.g., GCC with -Wundef).
3
  *
4
  * This was originally created for SIMDe
5
- * <https://github.com/nemequ/simde> (hence the prefix), but this
6
+ * <https://github.com/simd-everywhere/simde> (hence the prefix), but this
7
  * header has no dependencies and may be used anywhere.  It is
8
  * originally based on information from
9
  * <https://sourceforge.net/p/predef/wiki/Architectures/>, though it
10
  * has been enhanced with additional information.
11
  *
12
  * If you improve this file, or find a bug, please file the issue at
13
- * <https://github.com/nemequ/simde/issues>.  If you copy this into
14
+ * <https://github.com/simd-everywhere/simde/issues>.  If you copy this into
15
  * your project, even if you change the prefix, please keep the links
16
  * to SIMDe intact so others know where to report issues, submit
17
  * enhancements, and find the latest version. */
18
@@ -70,7 +70,7 @@
19
 /* AMD64 / x86_64
20
    <https://en.wikipedia.org/wiki/X86-64> */
21
 #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \
22
-   defined(__x86_64) || defined(_M_X66) || defined(_M_AMD64)
23
+   defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
24
 #define SIMDE_ARCH_AMD64 1000
25
 #endif
26
 
27
@@ -125,6 +125,9 @@
28
 #define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM
29
 #endif
30
 #endif
31
+#if defined(__ARM_FEATURE_SVE)
32
+#define SIMDE_ARCH_ARM_SVE
33
+#endif
34
 
35
 /* Blackfin
36
    <https://en.wikipedia.org/wiki/Blackfin> */
37
@@ -276,6 +279,12 @@
38
 #define SIMDE_ARCH_X86_AVX 1
39
 #endif
40
 #endif
41
+#if defined(__AVX512VP2INTERSECT__)
42
+#define SIMDE_ARCH_X86_AVX512VP2INTERSECT 1
43
+#endif
44
+#if defined(__AVX512VBMI__)
45
+#define SIMDE_ARCH_X86_AVX512VBMI 1
46
+#endif
47
 #if defined(__AVX512BW__)
48
 #define SIMDE_ARCH_X86_AVX512BW 1
49
 #endif
50
@@ -294,6 +303,12 @@
51
 #if defined(__GFNI__)
52
 #define SIMDE_ARCH_X86_GFNI 1
53
 #endif
54
+#if defined(__PCLMUL__)
55
+#define SIMDE_ARCH_X86_PCLMUL 1
56
+#endif
57
+#if defined(__VPCLMULQDQ__)
58
+#define SIMDE_ARCH_X86_VPCLMULQDQ 1
59
+#endif
60
 #endif
61
 
62
 /* Itanium
63
@@ -363,6 +378,10 @@
64
 #define SIMDE_ARCH_MIPS_CHECK(version) (0)
65
 #endif
66
 
67
+#if defined(__mips_loongson_mmi)
68
+#define SIMDE_ARCH_MIPS_LOONGSON_MMI 1
69
+#endif
70
+
71
 /* Matsushita MN10300
72
    <https://en.wikipedia.org/wiki/MN103> */
73
 #if defined(__MN10300__) || defined(__mn10300__)
74
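
The simde-arch.h changes are additive feature detection plus one fix: new macros for ARM SVE (SIMDE_ARCH_ARM_SVE), AVX-512 VBMI and VP2INTERSECT, PCLMUL/VPCLMULQDQ and MIPS Loongson MMI, and the MSVC AMD64 check now tests _M_X64 instead of the misspelled _M_X66. Consumers branch on these macros at preprocessing time; a short sketch (the EXAMPLE_* names are invented for illustration):

    #include "simde-arch.h"

    #if defined(SIMDE_ARCH_X86_PCLMUL)
    #  define EXAMPLE_HAVE_CARRYLESS_MULTIPLY 1   /* carry-less multiply paths available */
    #elif defined(SIMDE_ARCH_ARM_SVE)
    #  define EXAMPLE_HAVE_SVE 1                  /* scalable-vector paths available */
    #else
    #  define EXAMPLE_SCALAR_FALLBACK 1           /* portable fallback */
    #endif
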
obs-studio-26.1.0.tar.xz/libobs/util/simde/simde-common.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-common.h Changed
674
 
1
@@ -30,63 +30,104 @@
2
 #include "hedley.h"
3
 
4
 #define SIMDE_VERSION_MAJOR 0
5
-#define SIMDE_VERSION_MINOR 5
6
-#define SIMDE_VERSION_MICRO 0
7
+#define SIMDE_VERSION_MINOR 7
8
+#define SIMDE_VERSION_MICRO 1
9
 #define SIMDE_VERSION                                                   \
10
    HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, \
11
                  SIMDE_VERSION_MICRO)
12
 
13
-#include "simde-arch.h"
14
-#include "simde-features.h"
15
-#include "simde-diagnostic.h"
16
-
17
 #include <stddef.h>
18
 #include <stdint.h>
19
 
20
-#if HEDLEY_HAS_ATTRIBUTE(aligned) || HEDLEY_GCC_VERSION_CHECK(2, 95, 0) || \
21
-   HEDLEY_CRAY_VERSION_CHECK(8, 4, 0) ||                              \
22
-   HEDLEY_IBM_VERSION_CHECK(11, 1, 0) ||                              \
23
-   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                            \
24
-   HEDLEY_PGI_VERSION_CHECK(19, 4, 0) ||                              \
25
-   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||                               \
26
-   HEDLEY_TINYC_VERSION_CHECK(0, 9, 24) ||                            \
27
-   HEDLEY_TI_VERSION_CHECK(8, 1, 0)
28
-#define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
29
-#elif defined(_MSC_VER) && !(defined(_M_ARM) && !defined(_M_ARM64))
30
-#define SIMDE_ALIGN(alignment) __declspec(align(alignment))
31
-#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
32
-#define SIMDE_ALIGN(alignment) _Alignas(alignment)
33
-#elif defined(__cplusplus) && (__cplusplus >= 201103L)
34
-#define SIMDE_ALIGN(alignment) alignas(alignment)
35
-#else
36
-#define SIMDE_ALIGN(alignment)
37
-#endif
38
-
39
-#if HEDLEY_GNUC_VERSION_CHECK(2, 95, 0) ||   \
40
-   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) || \
41
-   HEDLEY_IBM_VERSION_CHECK(11, 1, 0)
42
-#define SIMDE_ALIGN_OF(T) (__alignof__(T))
43
-#elif (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
44
-   HEDLEY_HAS_FEATURE(c11_alignof)
45
-#define SIMDE_ALIGN_OF(T) (_Alignof(T))
46
-#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
47
-   HEDLEY_HAS_FEATURE(cxx_alignof)
48
-#define SIMDE_ALIGN_OF(T) (alignof(T))
49
-#endif
50
-
51
-#if defined(SIMDE_ALIGN_OF)
52
-#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(SIMDE_ALIGN_OF(T))
53
-#else
54
-#define SIMDE_ALIGN_AS(N, T) SIMDE_ALIGN(N)
55
+#include "simde-detect-clang.h"
56
+#include "simde-arch.h"
57
+#include "simde-features.h"
58
+#include "simde-diagnostic.h"
59
+#include "simde-math.h"
60
+#include "simde-constify.h"
61
+#include "simde-align.h"
62
+
63
+/* In some situations, SIMDe has to make large performance sacrifices
64
+ * for small increases in how faithfully it reproduces an API, but
65
+ * only a relatively small number of users will actually need the API
66
+ * to be completely accurate.  The SIMDE_FAST_* options can be used to
67
+ * disable these trade-offs.
68
+ *
69
+ * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or
70
+ * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to
71
+ * enable some optimizations.  Using -ffast-math and/or
72
+ * -ffinite-math-only will also enable the relevant options.  If you
73
+ * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */
74
+
75
+/* Most programs avoid NaNs by never passing values which can result in
76
+ * a NaN; for example, if you only pass non-negative values to the sqrt
77
+ * functions, it won't generate a NaN.  On some platforms, similar
78
+ * functions handle NaNs differently; for example, the _mm_min_ps SSE
79
+ * function will return 0.0 if you pass it (0.0, NaN), but the NEON
80
+ * vminq_f32 function will return NaN.  Making them behave like one
81
+ * another is expensive; it requires generating a mask of all lanes
82
+ * with NaNs, then performing the operation (e.g., vminq_f32), then
83
+ * blending together the result with another vector using the mask.
84
+ *
85
+ * If you don't want SIMDe to worry about the differences between how
86
+ * NaNs are handled on the two platforms, define this (or pass
87
+ * -ffinite-math-only) */
88
+#if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && \
89
+   defined(__FAST_MATH__)
90
+#define SIMDE_FAST_MATH
91
+#endif
92
+
93
+#if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS)
94
+#if defined(SIMDE_FAST_MATH)
95
+#define SIMDE_FAST_NANS
96
+#elif defined(__FINITE_MATH_ONLY__)
97
+#if __FINITE_MATH_ONLY__
98
+#define SIMDE_FAST_NANS
99
+#endif
100
+#endif
101
+#endif
102
+
103
+/* Many functions are defined as using the current rounding mode
104
+ * (i.e., the SIMD version of fegetround()) when converting to
105
+ * an integer.  For example, _mm_cvtpd_epi32.  Unfortunately,
106
+ * on some platforms (such as ARMv8+ where round-to-nearest is
107
+ * always used, regardless of the FPSCR register) this means we
108
+ * have to first query the current rounding mode, then choose
109
+ * the proper function (round,
110
+ * ceil, floor, etc.) */
111
+#if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && \
112
+   defined(SIMDE_FAST_MATH)
113
+#define SIMDE_FAST_ROUND_MODE
114
+#endif
115
+
116
+/* This controls how ties are rounded.  For example, does 10.5 round to
117
+ * 10 or 11?  IEEE 754 specifies round-towards-even, but ARMv7 (for
118
+ * example) doesn't support it and it must be emulated (which is rather
119
+ * slow).  If you're okay with just using the default for whatever arch
120
+ * you're on, you should definitely define this.
121
+ *
122
+ * Note that we don't use this macro to avoid correct implementations
123
+ * in functions which are explicitly about rounding (such as vrnd* on
124
+ * NEON, _mm_round_* on x86, etc.); it is only used for code where
125
+ * rounding is a component in another function, and even then it isn't
126
+ * usually a problem since such functions will use the current rounding
127
+ * mode. */
128
+#if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && \
129
+   defined(SIMDE_FAST_MATH)
130
+#define SIMDE_FAST_ROUND_TIES
131
+#endif
132
+
133
+/* For functions which convert from one type to another (mostly from
134
+ * floating point to integer types), sometimes we need to do a range
135
+ * check and potentially return a different result if the value
136
+ * falls outside that range.  Skipping this check can provide a
137
+ * performance boost, at the expense of faithfulness to the API we're
138
+ * emulating. */
139
+#if !defined(SIMDE_FAST_CONVERSION_RANGE) && \
140
+   !defined(SIMDE_NO_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_MATH)
141
+#define SIMDE_FAST_CONVERSION_RANGE
142
 #endif
143
 
144
-#define simde_assert_aligned(alignment, val)                                \
145
-   simde_assert_int(HEDLEY_REINTERPRET_CAST(                           \
146
-                uintptr_t, HEDLEY_REINTERPRET_CAST(        \
147
-                           const void *, (val))) % \
148
-                (alignment),                               \
149
-            ==, 0)
150
-
151
 #if HEDLEY_HAS_BUILTIN(__builtin_constant_p) ||                             \
152
    HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||                                \
153
    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) ||                             \
154
@@ -102,15 +143,21 @@
155
 #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated())
156
 #endif
157
 
158
-/* diagnose_if + __builtin_constant_p was broken until clang 9,
159
- * which is when __FILE_NAME__ was added. */
160
-#if defined(SIMDE_CHECK_CONSTANT_) && defined(__FILE_NAME__)
161
+#if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT)
162
+#if defined(SIMDE_CHECK_CONSTANT_) &&                \
163
+   SIMDE_DETECT_CLANG_VERSION_CHECK(9, 0, 0) && \
164
+   (!defined(__apple_build_version__) ||        \
165
+    ((__apple_build_version__ < 11000000) ||    \
166
+     (__apple_build_version__ >= 12000000)))
167
 #define SIMDE_REQUIRE_CONSTANT(arg)                    \
168
    HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), \
169
               "`" #arg "' must be constant")
170
 #else
171
 #define SIMDE_REQUIRE_CONSTANT(arg)
172
 #endif
173
+#else
174
+#define SIMDE_REQUIRE_CONSTANT(arg)
175
+#endif
176
 
177
 #define SIMDE_REQUIRE_RANGE(arg, min, max)                         \
178
    HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), \
179
@@ -120,39 +167,20 @@
180
    SIMDE_REQUIRE_CONSTANT(arg)                 \
181
    SIMDE_REQUIRE_RANGE(arg, min, max)
182
 
183
-/* SIMDE_ASSUME_ALIGNED allows you to (try to) tell the compiler
184
- * that a pointer is aligned to an `alignment`-byte boundary. */
185
-#if HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \
186
-   HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
187
-#define SIMDE_ASSUME_ALIGNED(alignment, v)     \
188
-   HEDLEY_REINTERPRET_CAST(__typeof__(v), \
189
-               __builtin_assume_aligned(v, alignment))
190
-#elif defined(__cplusplus) && (__cplusplus > 201703L)
191
-#define SIMDE_ASSUME_ALIGNED(alignment, v) std::assume_aligned<alignment>(v)
192
-#elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
193
-#define SIMDE_ASSUME_ALIGNED(alignment, v)                            \
194
-   (__extension__({                                              \
195
-       __typeof__(v) simde_assume_aligned_t_ = (v);          \
196
-       __assume_aligned(simde_assume_aligned_t_, alignment); \
197
-       simde_assume_aligned_t_;                              \
198
-   }))
199
-#else
200
-#define SIMDE_ASSUME_ALIGNED(alignment, v) (v)
201
-#endif
202
-
203
-/* SIMDE_ALIGN_CAST allows you to convert to a type with greater
204
- * aligment requirements without triggering a warning. */
205
-#if HEDLEY_HAS_WARNING("-Wcast-align")
206
-#define SIMDE_ALIGN_CAST(T, v)                                       \
207
-   (__extension__({                                             \
208
-       HEDLEY_DIAGNOSTIC_PUSH                               \
209
-       _Pragma("clang diagnostic ignored \"-Wcast-align\"") \
210
-           T simde_r_ = HEDLEY_REINTERPRET_CAST(T, v);  \
211
-       HEDLEY_DIAGNOSTIC_POP                                \
212
-       simde_r_;                                            \
213
-   }))
214
-#else
215
-#define SIMDE_ALIGN_CAST(T, v) HEDLEY_REINTERPRET_CAST(T, v)
216
+/* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty
217
+ * fallback if we can't find an implementation; instead we have to
218
+ * check if SIMDE_STATIC_ASSERT is defined before using it. */
219
+#if !defined(__cplusplus) &&                                             \
220
+   ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \
221
+    HEDLEY_HAS_FEATURE(c_static_assert) ||                          \
222
+    HEDLEY_GCC_VERSION_CHECK(6, 0, 0) ||                            \
223
+    HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || defined(_Static_assert))
224
+#define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message)
225
+#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) || \
226
+   HEDLEY_MSVC_VERSION_CHECK(16, 0, 0)
227
+#define SIMDE_STATIC_ASSERT(expr, message)            \
228
+   HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_( \
229
+       static_assert(expr, message))
230
 #endif
231
 
232
 #if (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \
233
@@ -170,6 +198,7 @@
234
 
235
     * SIMDE_VECTOR - Declaring a vector.
236
     * SIMDE_VECTOR_OPS - basic operations (binary and unary).
237
+    * SIMDE_VECTOR_NEGATE - negating a vector
238
     * SIMDE_VECTOR_SCALAR - For binary operators, the second argument
239
         can be a scalar, in which case the result is as if that scalar
240
         had been broadcast to all lanes of a vector.
241
@@ -182,11 +211,13 @@
242
 #if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
243
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
244
 #define SIMDE_VECTOR_OPS
245
+#define SIMDE_VECTOR_NEGATE
246
 #define SIMDE_VECTOR_SCALAR
247
 #define SIMDE_VECTOR_SUBSCRIPT
248
 #elif HEDLEY_INTEL_VERSION_CHECK(16, 0, 0)
249
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
250
 #define SIMDE_VECTOR_OPS
251
+#define SIMDE_VECTOR_NEGATE
252
 /* ICC only supports SIMDE_VECTOR_SCALAR for constants */
253
 #define SIMDE_VECTOR_SUBSCRIPT
254
 #elif HEDLEY_GCC_VERSION_CHECK(4, 1, 0) || HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
255
@@ -197,8 +228,9 @@
256
 #elif HEDLEY_HAS_ATTRIBUTE(vector_size)
257
 #define SIMDE_VECTOR(size) __attribute__((__vector_size__(size)))
258
 #define SIMDE_VECTOR_OPS
259
+#define SIMDE_VECTOR_NEGATE
260
 #define SIMDE_VECTOR_SUBSCRIPT
261
-#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) /* clang 4.0 */
262
+#if SIMDE_DETECT_CLANG_VERSION_CHECK(5, 0, 0)
263
 #define SIMDE_VECTOR_SCALAR
264
 #endif
265
 #endif
266
@@ -281,27 +313,34 @@
267
 #endif
268
 
269
 #if defined(SIMDE_ENABLE_OPENMP)
270
-#define SIMDE_VECTORIZE _Pragma("omp simd")
271
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd)
272
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l))
273
+#if defined(__clang__)
274
+#define SIMDE_VECTORIZE_REDUCTION(r)                              \
275
+   HEDLEY_DIAGNOSTIC_PUSH                                    \
276
+   _Pragma("clang diagnostic ignored \"-Wsign-conversion\"") \
277
+       HEDLEY_PRAGMA(omp simd reduction(r)) HEDLEY_DIAGNOSTIC_POP
278
+#else
279
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r))
280
+#endif
281
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a))
282
 #elif defined(SIMDE_ENABLE_CILKPLUS)
283
-#define SIMDE_VECTORIZE _Pragma("simd")
284
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd)
285
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l))
286
 #define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r))
287
 #define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a))
288
 #elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION)
289
-#define SIMDE_VECTORIZE _Pragma("clang loop vectorize(enable)")
290
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable))
291
 #define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l))
292
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
293
 #define SIMDE_VECTORIZE_ALIGNED(a)
294
 #elif HEDLEY_GCC_VERSION_CHECK(4, 9, 0)
295
-#define SIMDE_VECTORIZE _Pragma("GCC ivdep")
296
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep)
297
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
298
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
299
 #define SIMDE_VECTORIZE_ALIGNED(a)
300
 #elif HEDLEY_CRAY_VERSION_CHECK(5, 0, 0)
301
-#define SIMDE_VECTORIZE _Pragma("_CRI ivdep")
302
+#define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep)
303
 #define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE
304
 #define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE
305
 #define SIMDE_VECTORIZE_ALIGNED(a)
306
@@ -350,20 +389,10 @@
307
    HEDLEY_DIAGNOSTIC_POP
308
 #endif
309
 
310
-#if HEDLEY_HAS_WARNING("-Wpedantic")
311
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
312
-   _Pragma("clang diagnostic ignored \"-Wpedantic\"")
313
-#elif defined(HEDLEY_GCC_VERSION)
314
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128 \
315
-   _Pragma("GCC diagnostic ignored \"-Wpedantic\"")
316
-#else
317
-#define SIMDE_DIAGNOSTIC_DISABLE_INT128
318
-#endif
319
-
320
 #if defined(__SIZEOF_INT128__)
321
 #define SIMDE_HAVE_INT128_
322
 HEDLEY_DIAGNOSTIC_PUSH
323
-SIMDE_DIAGNOSTIC_DISABLE_INT128
324
+SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
325
 typedef __int128 simde_int128;
326
 typedef unsigned __int128 simde_uint128;
327
 HEDLEY_DIAGNOSTIC_POP
328
@@ -488,39 +517,6 @@
329
 #endif
330
 typedef SIMDE_FLOAT64_TYPE simde_float64;
331
 
332
-/* Whether to assume that the compiler can auto-vectorize reasonably
333
-   well.  This will cause SIMDe to attempt to compose vector
334
-   operations using more simple vector operations instead of minimize
335
-   serial work.
336
-
337
-   As an example, consider the _mm_add_ss(a, b) function from SSE,
338
-   which returns { a0 + b0, a1, a2, a3 }.  This pattern is repeated
339
-   for other operations (sub, mul, etc.).
340
-
341
-   The naïve implementation would result in loading a0 and b0, adding
342
-   them into a temporary variable, then splicing that value into a new
343
-   vector with the remaining elements from a.
344
-
345
-   On platforms which support vectorization, it's generally faster to
346
-   simply perform the operation on the entire vector to avoid having
347
-   to move data between SIMD registers and non-SIMD registers.
348
-   Basically, instead of the temporary variable being (a0 + b0) it
349
-   would be a vector of (a + b), which is then combined with a to form
350
-   the result.
351
-
352
-   By default, SIMDe will prefer the pure-vector versions if we detect
353
-   a vector ISA extension, but this can be overridden by defining
354
-   SIMDE_NO_ASSUME_VECTORIZATION.  You can also define
355
-   SIMDE_ASSUME_VECTORIZATION if you want to force SIMDe to use the
356
-   vectorized version. */
357
-#if !defined(SIMDE_NO_ASSUME_VECTORIZATION) && \
358
-   !defined(SIMDE_ASSUME_VECTORIZATION)
359
-#if defined(__SSE__) || defined(__ARM_NEON) || defined(__mips_msa) || \
360
-   defined(__ALTIVEC__) || defined(__wasm_simd128__)
361
-#define SIMDE_ASSUME_VECTORIZATION
362
-#endif
363
-#endif
364
-
365
 #if HEDLEY_HAS_WARNING("-Wbad-function-cast")
366
 #define SIMDE_CONVERT_FTOI(T, v)                                    \
367
    HEDLEY_DIAGNOSTIC_PUSH                                      \
368
@@ -530,11 +526,18 @@
369
 #define SIMDE_CONVERT_FTOI(T, v) ((T)(v))
370
 #endif
371
 
372
+/* TODO: detect compilers which support this outside of C11 mode */
373
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
374
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
375
-   (_Generic((value), to : (value), from : ((to)(value))))
376
+   _Generic((value), to                            \
377
+        : (value), default                     \
378
+        : (_Generic((value), from              \
379
+                : ((to)(value)))))
380
 #define SIMDE_CHECKED_STATIC_CAST(to, from, value) \
381
-   (_Generic((value), to : (value), from : ((to)(value))))
382
+   _Generic((value), to                       \
383
+        : (value), default                \
384
+        : (_Generic((value), from         \
385
+                : ((to)(value)))))
386
 #else
387
 #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) \
388
    HEDLEY_REINTERPRET_CAST(to, value)
389
@@ -564,7 +567,7 @@
390
 #if defined(__STDC_HOSTED__)
391
 #define SIMDE_STDC_HOSTED __STDC_HOSTED__
392
 #else
393
-#if defined(HEDLEY_PGI_VERSION_CHECK) || defined(HEDLEY_MSVC_VERSION_CHECK)
394
+#if defined(HEDLEY_PGI_VERSION) || defined(HEDLEY_MSVC_VERSION)
395
 #define SIMDE_STDC_HOSTED 1
396
 #else
397
 #define SIMDE_STDC_HOSTED 0
398
@@ -572,23 +575,34 @@
399
 #endif
400
 
401
 /* Try to deal with environments without a standard library. */
402
-#if !defined(simde_memcpy) || !defined(simde_memset)
403
-#if !defined(SIMDE_NO_STRING_H) && defined(__has_include)
404
-#if __has_include(<string.h>)
405
-#include <string.h>
406
 #if !defined(simde_memcpy)
407
-#define simde_memcpy(dest, src, n) memcpy(dest, src, n)
408
+#if HEDLEY_HAS_BUILTIN(__builtin_memcpy)
409
+#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
410
+#endif
411
 #endif
412
 #if !defined(simde_memset)
413
-#define simde_memset(s, c, n) memset(s, c, n)
414
+#if HEDLEY_HAS_BUILTIN(__builtin_memset)
415
+#define simde_memset(s, c, n) __builtin_memset(s, c, n)
416
 #endif
417
-#else
418
+#endif
419
+#if !defined(simde_memcmp)
420
+#if HEDLEY_HAS_BUILTIN(__builtin_memcmp)
421
+#define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n)
422
+#endif
423
+#endif
424
+
425
+#if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp)
426
+#if !defined(SIMDE_NO_STRING_H)
427
+#if defined(__has_include)
428
+#if !__has_include(<string.h>)
429
 #define SIMDE_NO_STRING_H
430
 #endif
431
+#elif (SIMDE_STDC_HOSTED == 0)
432
+#define SIMDE_NO_STRING_H
433
 #endif
434
 #endif
435
-#if !defined(simde_memcpy) || !defined(simde_memset)
436
-#if !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1)
437
+
438
+#if !defined(SIMDE_NO_STRING_H)
439
 #include <string.h>
440
 #if !defined(simde_memcpy)
441
 #define simde_memcpy(dest, src, n) memcpy(dest, src, n)
442
@@ -596,14 +610,8 @@
443
 #if !defined(simde_memset)
444
 #define simde_memset(s, c, n) memset(s, c, n)
445
 #endif
446
-#elif (HEDLEY_HAS_BUILTIN(__builtin_memcpy) &&  \
447
-       HEDLEY_HAS_BUILTIN(__builtin_memset)) || \
448
-   HEDLEY_GCC_VERSION_CHECK(4, 2, 0)
449
-#if !defined(simde_memcpy)
450
-#define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n)
451
-#endif
452
-#if !defined(simde_memset)
453
-#define simde_memset(s, c, n) __builtin_memset(s, c, n)
454
+#if !defined(simde_memcmp)
455
+#define simde_memcmp(s1, s2, n) memcmp(s1, s2, n)
456
 #endif
457
 #else
458
 /* These are meant to be portable, not fast.  If you're hitting them you
459
@@ -637,10 +645,24 @@
460
 }
461
 #define simde_memset(s, c, n) simde_memset_(s, c, n)
462
 #endif
463
-#endif /* !defined(SIMDE_NO_STRING_H) && (SIMDE_STDC_HOSTED == 1) */
464
-#endif /* !defined(simde_memcpy) || !defined(simde_memset) */
465
 
466
-#include "simde-math.h"
467
+#if !defined(simde_memcmp)
468
+SIMDE_FUNCTION_ATTRIBUTES
469
+int simde_memcmp_(const void *s1, const void *s2, size_t n)
470
+{
471
+   const unsigned char *s1_ = HEDLEY_STATIC_CAST(const unsigned char *, s1);
472
+   const unsigned char *s2_ = HEDLEY_STATIC_CAST(const unsigned char *, s2);
473
+   for (size_t i = 0; i < n; i++) {
474
+       if (s1_[i] != s2_[i]) {
475
+           return (int)(s1_[i] - s2_[i]);
476
+       }
477
+   }
478
+   return 0;
479
+}
480
+#define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n)
481
+#endif
482
+#endif
483
+#endif
484
 
485
 #if defined(FE_ALL_EXCEPT)
486
 #define SIMDE_HAVE_FENV_H
487
@@ -682,6 +704,105 @@
488
 
489
 #include "check.h"
490
 
491
+/* GCC/clang have a bunch of functionality in builtins which we would
492
+ * like to access, but the suffixes indicate whether they operate on
493
+ * int, long, or long long, not fixed-width types (e.g., int32_t).
494
+ * We use these macros to attempt to map from fixed-width types to the
495
+ * names GCC uses.  Note that you should still cast the input(s) and
496
+ * return values (to/from SIMDE_BUILTIN_TYPE_*_) since often even if
497
+ * types are the same size they may not be compatible according to the
498
+ * compiler.  For example, on x86 long and long long are generally
499
+ * both 64 bits, but platforms vary on whether an int64_t is mapped
500
+ * to a long or long long. */
501
+
502
+#include <limits.h>
503
+
504
+HEDLEY_DIAGNOSTIC_PUSH
505
+SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
506
+
507
+#if (INT8_MAX == INT_MAX) && (INT8_MIN == INT_MIN)
508
+#define SIMDE_BUILTIN_SUFFIX_8_
509
+#define SIMDE_BUILTIN_TYPE_8_ int
510
+#elif (INT8_MAX == LONG_MAX) && (INT8_MIN == LONG_MIN)
511
+#define SIMDE_BUILTIN_SUFFIX_8_ l
512
+#define SIMDE_BUILTIN_TYPE_8_ long
513
+#elif (INT8_MAX == LLONG_MAX) && (INT8_MIN == LLONG_MIN)
514
+#define SIMDE_BUILTIN_SUFFIX_8_ ll
515
+#define SIMDE_BUILTIN_TYPE_8_ long long
516
+#endif
517
+
518
+#if (INT16_MAX == INT_MAX) && (INT16_MIN == INT_MIN)
519
+#define SIMDE_BUILTIN_SUFFIX_16_
520
+#define SIMDE_BUILTIN_TYPE_16_ int
521
+#elif (INT16_MAX == LONG_MAX) && (INT16_MIN == LONG_MIN)
522
+#define SIMDE_BUILTIN_SUFFIX_16_ l
523
+#define SIMDE_BUILTIN_TYPE_16_ long
524
+#elif (INT16_MAX == LLONG_MAX) && (INT16_MIN == LLONG_MIN)
525
+#define SIMDE_BUILTIN_SUFFIX_16_ ll
526
+#define SIMDE_BUILTIN_TYPE_16_ long long
527
+#endif
528
+
529
+#if (INT32_MAX == INT_MAX) && (INT32_MIN == INT_MIN)
530
+#define SIMDE_BUILTIN_SUFFIX_32_
531
+#define SIMDE_BUILTIN_TYPE_32_ int
532
+#elif (INT32_MAX == LONG_MAX) && (INT32_MIN == LONG_MIN)
533
+#define SIMDE_BUILTIN_SUFFIX_32_ l
534
+#define SIMDE_BUILTIN_TYPE_32_ long
535
+#elif (INT32_MAX == LLONG_MAX) && (INT32_MIN == LLONG_MIN)
536
+#define SIMDE_BUILTIN_SUFFIX_32_ ll
537
+#define SIMDE_BUILTIN_TYPE_32_ long long
538
+#endif
539
+
540
+#if (INT64_MAX == INT_MAX) && (INT64_MIN == INT_MIN)
541
+#define SIMDE_BUILTIN_SUFFIX_64_
542
+#define SIMDE_BUILTIN_TYPE_64_ int
543
+#elif (INT64_MAX == LONG_MAX) && (INT64_MIN == LONG_MIN)
544
+#define SIMDE_BUILTIN_SUFFIX_64_ l
545
+#define SIMDE_BUILTIN_TYPE_64_ long
546
+#elif (INT64_MAX == LLONG_MAX) && (INT64_MIN == LLONG_MIN)
547
+#define SIMDE_BUILTIN_SUFFIX_64_ ll
548
+#define SIMDE_BUILTIN_TYPE_64_ long long
549
+#endif
550
+
551
+#if defined(SIMDE_BUILTIN_SUFFIX_8_)
552
+#define SIMDE_BUILTIN_8_(name) \
553
+   HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)
554
+#define SIMDE_BUILTIN_HAS_8_(name) \
555
+   HEDLEY_HAS_BUILTIN(        \
556
+       HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_))
557
+#else
558
+#define SIMDE_BUILTIN_HAS_8_(name) 0
559
+#endif
560
+#if defined(SIMDE_BUILTIN_SUFFIX_16_)
561
+#define SIMDE_BUILTIN_16_(name) \
562
+   HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_)
563
+#define SIMDE_BUILTIN_HAS_16_(name) \
564
+   HEDLEY_HAS_BUILTIN(         \
565
+       HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_))
566
+#else
567
+#define SIMDE_BUILTIN_HAS_16_(name) 0
568
+#endif
569
+#if defined(SIMDE_BUILTIN_SUFFIX_32_)
570
+#define SIMDE_BUILTIN_32_(name) \
571
+   HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_)
572
+#define SIMDE_BUILTIN_HAS_32_(name) \
573
+   HEDLEY_HAS_BUILTIN(         \
574
+       HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_))
575
+#else
576
+#define SIMDE_BUILTIN_HAS_32_(name) 0
577
+#endif
578
+#if defined(SIMDE_BUILTIN_SUFFIX_64_)
579
+#define SIMDE_BUILTIN_64_(name) \
580
+   HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_)
581
+#define SIMDE_BUILTIN_HAS_64_(name) \
582
+   HEDLEY_HAS_BUILTIN(         \
583
+       HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_))
584
+#else
585
+#define SIMDE_BUILTIN_HAS_64_(name) 0
586
+#endif
587
+
588
+HEDLEY_DIAGNOSTIC_POP
589
+
590
 /* Sometimes we run into problems with specific versions of compilers
591
    which make the native versions unusable for us.  Often this is due
592
    to missing functions, sometimes buggy implementations, etc.  These
593
@@ -712,29 +833,75 @@
594
 #if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)
595
 #define SIMDE_BUG_GCC_94482
596
 #endif
597
+#if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || \
598
+   defined(SIMDE_ARCH_SYSTEMZ)
599
+#define SIMDE_BUG_GCC_53784
600
+#endif
601
+#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
602
+#if HEDLEY_GCC_VERSION_CHECK(4, 3, 0) /* -Wsign-conversion */
603
+#define SIMDE_BUG_GCC_95144
604
+#endif
605
+#endif
606
 #if !HEDLEY_GCC_VERSION_CHECK(9, 4, 0) && defined(SIMDE_ARCH_AARCH64)
607
 #define SIMDE_BUG_GCC_94488
608
 #endif
609
-#if defined(SIMDE_ARCH_POWER)
610
+#if defined(SIMDE_ARCH_ARM)
611
+#define SIMDE_BUG_GCC_95399
612
+#define SIMDE_BUG_GCC_95471
613
+#elif defined(SIMDE_ARCH_POWER)
614
 #define SIMDE_BUG_GCC_95227
615
+#define SIMDE_BUG_GCC_95782
616
+#elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
617
+#if !HEDLEY_GCC_VERSION_CHECK(10, 2, 0) && !defined(__OPTIMIZE__)
618
+#define SIMDE_BUG_GCC_96174
619
+#endif
620
 #endif
621
 #define SIMDE_BUG_GCC_95399
622
 #elif defined(__clang__)
623
 #if defined(SIMDE_ARCH_AARCH64)
624
 #define SIMDE_BUG_CLANG_45541
625
+#define SIMDE_BUG_CLANG_46844
626
+#define SIMDE_BUG_CLANG_48257
627
+#if SIMDE_DETECT_CLANG_VERSION_CHECK(10, 0, 0) && \
628
+   SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
629
+#define SIMDE_BUG_CLANG_BAD_VI64_OPS
630
+#endif
631
+#endif
632
+#if defined(SIMDE_ARCH_POWER)
633
+#define SIMDE_BUG_CLANG_46770
634
+#endif
635
+#if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0) && \
636
+   !defined(__OPTIMIZE__)
637
+#define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT
638
+#endif
639
+#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)
640
+#if HEDLEY_HAS_WARNING("-Wsign-conversion") && \
641
+   SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
642
+#define SIMDE_BUG_CLANG_45931
643
+#endif
644
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
645
+   SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)
646
+#define SIMDE_BUG_CLANG_44589
647
+#endif
648
 #endif
649
+#define SIMDE_BUG_CLANG_45959
650
+#elif defined(HEDLEY_MSVC_VERSION)
651
+#if defined(SIMDE_ARCH_X86)
652
+#define SIMDE_BUG_MSVC_ROUND_EXTRACT
653
 #endif
654
-#if defined(HEDLEY_EMSCRIPTEN_VERSION)
655
-#define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */
656
-#define SIMDE_BUG_EMSCRIPTEN_5242
657
+#elif defined(HEDLEY_INTEL_VERSION)
658
+#define SIMDE_BUG_INTEL_857088
659
 #endif
660
 #endif
661
 
662
 /* GCC and Clang both have the same issue:
663
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144
664
  * https://bugs.llvm.org/show_bug.cgi?id=45931
665
+ * This is just an easy way to work around it.
666
  */
667
-#if HEDLEY_HAS_WARNING("-Wsign-conversion") || HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
668
+#if (HEDLEY_HAS_WARNING("-Wsign-conversion") &&   \
669
+     SIMDE_DETECT_CLANG_VERSION_NOT(11, 0, 0)) || \
670
+   HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
671
 #define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr)                                      \
672
    (__extension__({                                                            \
673
        HEDLEY_DIAGNOSTIC_PUSH                                              \
674
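
Two additions in simde-common.h are worth calling out: the opt-in SIMDE_FAST_* macros (set -DSIMDE_FAST_MATH explicitly, or let -ffast-math / -ffinite-math-only imply them) that trade strict API fidelity for speed, and the SIMDE_BUILTIN_*_ helpers that map fixed-width integer widths onto the int/long/long long suffixes GCC-style builtins use. A small sketch of how the 64-bit mapping is typically consumed (example_popcount64 is an invented name, not a SIMDe function):

    /* build with e.g.:  cc -O2 -DSIMDE_FAST_MATH -c example.c */
    #include "simde-common.h"

    static int example_popcount64(uint64_t v)
    {
    #if SIMDE_BUILTIN_HAS_64_(popcount)
        /* Resolves to __builtin_popcount / __builtin_popcountl / __builtin_popcountll,
         * whichever suffix matches the platform's 64-bit integer type. */
        return SIMDE_BUILTIN_64_(popcount)(v);
    #else
        int count = 0;
        while (v) {        /* portable fallback: clear the lowest set bit until zero */
            v &= v - 1;
            count++;
        }
        return count;
    #endif
    }
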
obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-constify.h Added
927
 
1
@@ -0,0 +1,925 @@
2
+/* SPDX-License-Identifier: MIT
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person
5
+ * obtaining a copy of this software and associated documentation
6
+ * files (the "Software"), to deal in the Software without
7
+ * restriction, including without limitation the rights to use, copy,
8
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ * of the Software, and to permit persons to whom the Software is
10
+ * furnished to do so, subject to the following conditions:
11
+ *
12
+ * The above copyright notice and this permission notice shall be
13
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ * SOFTWARE.
23
+ *
24
+ * Copyright:
25
+ *   2020      Evan Nemerson <evan@nemerson.com>
26
+ */
27
+
28
+/* Constify macros.  For internal use only.
29
+ *
30
+ * These are used to make it possible to call a function which takes
31
+ * an Integer Constant Expression (ICE) using a compile time constant.
32
+ * Technically it would also be possible to use a value not trivially
33
+ * known by the compiler, but there would be a significant performance
34
+ * hit (a switch statement is used).
35
+ *
36
+ * The basic idea is pretty simple; we just emit a do while loop which
37
+ * contains a switch with a case for every possible value of the
38
+ * constant.
39
+ *
40
+ * As long as the value you pass to the function is constant, pretty
41
+ * much any compiler shouldn't have a problem generating exactly the
42
+ * same code as if you had used an ICE.
43
+ *
44
+ * This is intended to be used in the SIMDe implementations of
45
+ * functions whose arguments compilers require to be an ICE, but the other benefit
46
+ * is that if we also disable the warnings from
47
+ * SIMDE_REQUIRE_CONSTANT_RANGE we can actually just allow the tests
48
+ * to use non-ICE parameters
49
+ */
50
+
51
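/* Illustrative sketch, not part of this header: given a hypothetical
 * function simde_example_shift(vec, imm) whose last argument must be an
 * integer constant expression, SIMDE_CONSTIFY_4_ lets a wrapper accept a
 * runtime value in the range 0..3:
 *
 *     int result;
 *     SIMDE_CONSTIFY_4_(simde_example_shift, result, 0, imm, vec);
 *
 * which expands to a switch over imm with one
 * `result = simde_example_shift(vec, N); break;` case per possible value,
 * so the real function always receives a literal constant. */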
+#if !defined(SIMDE_CONSTIFY_H)
52
+#define SIMDE_CONSTIFY_H
53
+
54
+#include "simde-diagnostic.h"
55
+
56
+HEDLEY_DIAGNOSTIC_PUSH
57
+SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
58
+SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
59
+
60
+#define SIMDE_CONSTIFY_2_(func_name, result, default_case, imm, ...) \
61
+   do {                                                         \
62
+       switch (imm) {                                       \
63
+       case 0:                                              \
64
+           result = func_name(__VA_ARGS__, 0);          \
65
+           break;                                       \
66
+       case 1:                                              \
67
+           result = func_name(__VA_ARGS__, 1);          \
68
+           break;                                       \
69
+       default:                                             \
70
+           result = default_case;                       \
71
+           break;                                       \
72
+       }                                                    \
73
+   } while (0)
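
A minimal usage sketch (editorial note, not part of the diff) for the two-case helper defined above; my_shl and shl_runtime are hypothetical names, and simde-constify.h is assumed to be on the include path:

    #include "simde-constify.h"

    /* my_shl() stands in for an intrinsic-style function whose last argument
     * would normally have to be an integer constant expression (ICE). */
    static inline int my_shl(int value, int shift) { return value << shift; }

    static inline int shl_runtime(int value, int imm) {
        int result;
        /* Expands to a switch on imm whose cases call my_shl(value, 0) and
         * my_shl(value, 1) with literal constants, so the last argument is
         * always an ICE even though imm is an ordinary variable here;
         * out-of-range values fall through to the default_case argument (0). */
        SIMDE_CONSTIFY_2_(my_shl, result, 0, imm, value);
        return result;
    }
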
74
+
75
+#define SIMDE_CONSTIFY_4_(func_name, result, default_case, imm, ...) \
76
+   do {                                                         \
77
+       switch (imm) {                                       \
78
+       case 0:                                              \
79
+           result = func_name(__VA_ARGS__, 0);          \
80
+           break;                                       \
81
+       case 1:                                              \
82
+           result = func_name(__VA_ARGS__, 1);          \
83
+           break;                                       \
84
+       case 2:                                              \
85
+           result = func_name(__VA_ARGS__, 2);          \
86
+           break;                                       \
87
+       case 3:                                              \
88
+           result = func_name(__VA_ARGS__, 3);          \
89
+           break;                                       \
90
+       default:                                             \
91
+           result = default_case;                       \
92
+           break;                                       \
93
+       }                                                    \
94
+   } while (0)
95
+
96
+#define SIMDE_CONSTIFY_8_(func_name, result, default_case, imm, ...) \
97
+   do {                                                         \
98
+       switch (imm) {                                       \
99
+       case 0:                                              \
100
+           result = func_name(__VA_ARGS__, 0);          \
101
+           break;                                       \
102
+       case 1:                                              \
103
+           result = func_name(__VA_ARGS__, 1);          \
104
+           break;                                       \
105
+       case 2:                                              \
106
+           result = func_name(__VA_ARGS__, 2);          \
107
+           break;                                       \
108
+       case 3:                                              \
109
+           result = func_name(__VA_ARGS__, 3);          \
110
+           break;                                       \
111
+       case 4:                                              \
112
+           result = func_name(__VA_ARGS__, 4);          \
113
+           break;                                       \
114
+       case 5:                                              \
115
+           result = func_name(__VA_ARGS__, 5);          \
116
+           break;                                       \
117
+       case 6:                                              \
118
+           result = func_name(__VA_ARGS__, 6);          \
119
+           break;                                       \
120
+       case 7:                                              \
121
+           result = func_name(__VA_ARGS__, 7);          \
122
+           break;                                       \
123
+       default:                                             \
124
+           result = default_case;                       \
125
+           break;                                       \
126
+       }                                                    \
127
+   } while (0)
128
+
129
+#define SIMDE_CONSTIFY_16_(func_name, result, default_case, imm, ...) \
130
+   do {                                                          \
131
+       switch (imm) {                                        \
132
+       case 0:                                               \
133
+           result = func_name(__VA_ARGS__, 0);           \
134
+           break;                                        \
135
+       case 1:                                               \
136
+           result = func_name(__VA_ARGS__, 1);           \
137
+           break;                                        \
138
+       case 2:                                               \
139
+           result = func_name(__VA_ARGS__, 2);           \
140
+           break;                                        \
141
+       case 3:                                               \
142
+           result = func_name(__VA_ARGS__, 3);           \
143
+           break;                                        \
144
+       case 4:                                               \
145
+           result = func_name(__VA_ARGS__, 4);           \
146
+           break;                                        \
147
+       case 5:                                               \
148
+           result = func_name(__VA_ARGS__, 5);           \
149
+           break;                                        \
150
+       case 6:                                               \
151
+           result = func_name(__VA_ARGS__, 6);           \
152
+           break;                                        \
153
+       case 7:                                               \
154
+           result = func_name(__VA_ARGS__, 7);           \
155
+           break;                                        \
156
+       case 8:                                               \
157
+           result = func_name(__VA_ARGS__, 8);           \
158
+           break;                                        \
159
+       case 9:                                               \
160
+           result = func_name(__VA_ARGS__, 9);           \
161
+           break;                                        \
162
+       case 10:                                              \
163
+           result = func_name(__VA_ARGS__, 10);          \
164
+           break;                                        \
165
+       case 11:                                              \
166
+           result = func_name(__VA_ARGS__, 11);          \
167
+           break;                                        \
168
+       case 12:                                              \
169
+           result = func_name(__VA_ARGS__, 12);          \
170
+           break;                                        \
171
+       case 13:                                              \
172
+           result = func_name(__VA_ARGS__, 13);          \
173
+           break;                                        \
174
+       case 14:                                              \
175
+           result = func_name(__VA_ARGS__, 14);          \
176
+           break;                                        \
177
+       case 15:                                              \
178
+           result = func_name(__VA_ARGS__, 15);          \
179
+           break;                                        \
180
+       default:                                              \
181
+           result = default_case;                        \
182
+           break;                                        \
183
+       }                                                     \
184
+   } while (0)
185
+
186
+#define SIMDE_CONSTIFY_32_(func_name, result, default_case, imm, ...) \
187
+   do {                                                          \
188
+       switch (imm) {                                        \
189
+       case 0:                                               \
190
+           result = func_name(__VA_ARGS__, 0);           \
191
+           break;                                        \
192
+       case 1:                                               \
193
+           result = func_name(__VA_ARGS__, 1);           \
194
+           break;                                        \
195
+       case 2:                                               \
196
+           result = func_name(__VA_ARGS__, 2);           \
197
+           break;                                        \
198
+       case 3:                                               \
199
+           result = func_name(__VA_ARGS__, 3);           \
200
+           break;                                        \
201
+       case 4:                                               \
202
+           result = func_name(__VA_ARGS__, 4);           \
203
+           break;                                        \
204
+       case 5:                                               \
205
+           result = func_name(__VA_ARGS__, 5);           \
206
+           break;                                        \
207
+       case 6:                                               \
208
+           result = func_name(__VA_ARGS__, 6);           \
209
+           break;                                        \
210
+       case 7:                                               \
211
+           result = func_name(__VA_ARGS__, 7);           \
212
+           break;                                        \
213
+       case 8:                                               \
214
+           result = func_name(__VA_ARGS__, 8);           \
215
+           break;                                        \
216
+       case 9:                                               \
217
+           result = func_name(__VA_ARGS__, 9);           \
218
+           break;                                        \
219
+       case 10:                                              \
220
+           result = func_name(__VA_ARGS__, 10);          \
221
+           break;                                        \
222
+       case 11:                                              \
223
+           result = func_name(__VA_ARGS__, 11);          \
224
+           break;                                        \
225
+       case 12:                                              \
226
+           result = func_name(__VA_ARGS__, 12);          \
227
+           break;                                        \
228
+       case 13:                                              \
229
+           result = func_name(__VA_ARGS__, 13);          \
230
+           break;                                        \
231
+       case 14:                                              \
232
+           result = func_name(__VA_ARGS__, 14);          \
233
+           break;                                        \
234
+       case 15:                                              \
235
+           result = func_name(__VA_ARGS__, 15);          \
236
+           break;                                        \
237
+       case 16:                                              \
238
+           result = func_name(__VA_ARGS__, 16);          \
239
+           break;                                        \
240
+       case 17:                                              \
241
+           result = func_name(__VA_ARGS__, 17);          \
242
+           break;                                        \
243
+       case 18:                                              \
244
+           result = func_name(__VA_ARGS__, 18);          \
245
+           break;                                        \
246
+       case 19:                                              \
247
+           result = func_name(__VA_ARGS__, 19);          \
248
+           break;                                        \
249
+       case 20:                                              \
250
+           result = func_name(__VA_ARGS__, 20);          \
251
+           break;                                        \
252
+       case 21:                                              \
253
+           result = func_name(__VA_ARGS__, 21);          \
254
+           break;                                        \
255
+       case 22:                                              \
256
+           result = func_name(__VA_ARGS__, 22);          \
257
+           break;                                        \
258
+       case 23:                                              \
259
+           result = func_name(__VA_ARGS__, 23);          \
260
+           break;                                        \
261
+       case 24:                                              \
262
+           result = func_name(__VA_ARGS__, 24);          \
263
+           break;                                        \
264
+       case 25:                                              \
265
+           result = func_name(__VA_ARGS__, 25);          \
266
+           break;                                        \
267
+       case 26:                                              \
268
+           result = func_name(__VA_ARGS__, 26);          \
269
+           break;                                        \
270
+       case 27:                                              \
271
+           result = func_name(__VA_ARGS__, 27);          \
272
+           break;                                        \
273
+       case 28:                                              \
274
+           result = func_name(__VA_ARGS__, 28);          \
275
+           break;                                        \
276
+       case 29:                                              \
277
+           result = func_name(__VA_ARGS__, 29);          \
278
+           break;                                        \
279
+       case 30:                                              \
280
+           result = func_name(__VA_ARGS__, 30);          \
281
+           break;                                        \
282
+       case 31:                                              \
283
+           result = func_name(__VA_ARGS__, 31);          \
284
+           break;                                        \
285
+       default:                                              \
286
+           result = default_case;                        \
287
+           break;                                        \
288
+       }                                                     \
289
+   } while (0)
290
+
291
+#define SIMDE_CONSTIFY_64_(func_name, result, default_case, imm, ...) \
292
+   do {                                                          \
293
+       switch (imm) {                                        \
294
+       case 0:                                               \
295
+           result = func_name(__VA_ARGS__, 0);           \
296
+           break;                                        \
297
+       case 1:                                               \
298
+           result = func_name(__VA_ARGS__, 1);           \
299
+           break;                                        \
300
+       case 2:                                               \
301
+           result = func_name(__VA_ARGS__, 2);           \
302
+           break;                                        \
303
+       case 3:                                               \
304
+           result = func_name(__VA_ARGS__, 3);           \
305
+           break;                                        \
306
+       case 4:                                               \
307
+           result = func_name(__VA_ARGS__, 4);           \
308
+           break;                                        \
309
+       case 5:                                               \
310
+           result = func_name(__VA_ARGS__, 5);           \
311
+           break;                                        \
312
+       case 6:                                               \
313
+           result = func_name(__VA_ARGS__, 6);           \
314
+           break;                                        \
315
+       case 7:                                               \
316
+           result = func_name(__VA_ARGS__, 7);           \
317
+           break;                                        \
318
+       case 8:                                               \
319
+           result = func_name(__VA_ARGS__, 8);           \
320
+           break;                                        \
321
+       case 9:                                               \
322
+           result = func_name(__VA_ARGS__, 9);           \
323
+           break;                                        \
324
+       case 10:                                              \
325
+           result = func_name(__VA_ARGS__, 10);          \
326
+           break;                                        \
327
+       case 11:                                              \
328
+           result = func_name(__VA_ARGS__, 11);          \
329
+           break;                                        \
330
+       case 12:                                              \
331
+           result = func_name(__VA_ARGS__, 12);          \
332
+           break;                                        \
333
+       case 13:                                              \
334
+           result = func_name(__VA_ARGS__, 13);          \
335
+           break;                                        \
336
+       case 14:                                              \
337
+           result = func_name(__VA_ARGS__, 14);          \
338
+           break;                                        \
339
+       case 15:                                              \
340
+           result = func_name(__VA_ARGS__, 15);          \
341
+           break;                                        \
342
+       case 16:                                              \
343
+           result = func_name(__VA_ARGS__, 16);          \
344
+           break;                                        \
345
+       case 17:                                              \
346
+           result = func_name(__VA_ARGS__, 17);          \
347
+           break;                                        \
348
+       case 18:                                              \
349
+           result = func_name(__VA_ARGS__, 18);          \
350
+           break;                                        \
351
+       case 19:                                              \
352
+           result = func_name(__VA_ARGS__, 19);          \
353
+           break;                                        \
354
+       case 20:                                              \
355
+           result = func_name(__VA_ARGS__, 20);          \
356
+           break;                                        \
357
+       case 21:                                              \
358
+           result = func_name(__VA_ARGS__, 21);          \
359
+           break;                                        \
360
+       case 22:                                              \
361
+           result = func_name(__VA_ARGS__, 22);          \
362
+           break;                                        \
363
+       case 23:                                              \
364
+           result = func_name(__VA_ARGS__, 23);          \
365
+           break;                                        \
366
+       case 24:                                              \
367
+           result = func_name(__VA_ARGS__, 24);          \
368
+           break;                                        \
369
+       case 25:                                              \
370
+           result = func_name(__VA_ARGS__, 25);          \
371
+           break;                                        \
372
+       case 26:                                              \
373
+           result = func_name(__VA_ARGS__, 26);          \
374
+           break;                                        \
375
+       case 27:                                              \
376
+           result = func_name(__VA_ARGS__, 27);          \
377
+           break;                                        \
378
+       case 28:                                              \
379
+           result = func_name(__VA_ARGS__, 28);          \
380
+           break;                                        \
381
+       case 29:                                              \
382
+           result = func_name(__VA_ARGS__, 29);          \
383
+           break;                                        \
384
+       case 30:                                              \
385
+           result = func_name(__VA_ARGS__, 30);          \
386
+           break;                                        \
387
+       case 31:                                              \
388
+           result = func_name(__VA_ARGS__, 31);          \
389
+           break;                                        \
390
+       case 32:                                              \
391
+           result = func_name(__VA_ARGS__, 32);          \
392
+           break;                                        \
393
+       case 33:                                              \
394
+           result = func_name(__VA_ARGS__, 33);          \
395
+           break;                                        \
396
+       case 34:                                              \
397
+           result = func_name(__VA_ARGS__, 34);          \
398
+           break;                                        \
399
+       case 35:                                              \
400
+           result = func_name(__VA_ARGS__, 35);          \
401
+           break;                                        \
402
+       case 36:                                              \
403
+           result = func_name(__VA_ARGS__, 36);          \
404
+           break;                                        \
405
+       case 37:                                              \
406
+           result = func_name(__VA_ARGS__, 37);          \
407
+           break;                                        \
408
+       case 38:                                              \
409
+           result = func_name(__VA_ARGS__, 38);          \
410
+           break;                                        \
411
+       case 39:                                              \
412
+           result = func_name(__VA_ARGS__, 39);          \
413
+           break;                                        \
414
+       case 40:                                              \
415
+           result = func_name(__VA_ARGS__, 40);          \
416
+           break;                                        \
417
+       case 41:                                              \
418
+           result = func_name(__VA_ARGS__, 41);          \
419
+           break;                                        \
420
+       case 42:                                              \
421
+           result = func_name(__VA_ARGS__, 42);          \
422
+           break;                                        \
423
+       case 43:                                              \
424
+           result = func_name(__VA_ARGS__, 43);          \
425
+           break;                                        \
426
+       case 44:                                              \
427
+           result = func_name(__VA_ARGS__, 44);          \
428
+           break;                                        \
429
+       case 45:                                              \
430
+           result = func_name(__VA_ARGS__, 45);          \
431
+           break;                                        \
432
+       case 46:                                              \
433
+           result = func_name(__VA_ARGS__, 46);          \
434
+           break;                                        \
435
+       case 47:                                              \
436
+           result = func_name(__VA_ARGS__, 47);          \
437
+           break;                                        \
438
+       case 48:                                              \
439
+           result = func_name(__VA_ARGS__, 48);          \
440
+           break;                                        \
441
+       case 49:                                              \
442
+           result = func_name(__VA_ARGS__, 49);          \
443
+           break;                                        \
444
+       case 50:                                              \
445
+           result = func_name(__VA_ARGS__, 50);          \
446
+           break;                                        \
447
+       case 51:                                              \
448
+           result = func_name(__VA_ARGS__, 51);          \
449
+           break;                                        \
450
+       case 52:                                              \
451
+           result = func_name(__VA_ARGS__, 52);          \
452
+           break;                                        \
453
+       case 53:                                              \
454
+           result = func_name(__VA_ARGS__, 53);          \
455
+           break;                                        \
456
+       case 54:                                              \
457
+           result = func_name(__VA_ARGS__, 54);          \
458
+           break;                                        \
459
+       case 55:                                              \
460
+           result = func_name(__VA_ARGS__, 55);          \
461
+           break;                                        \
462
+       case 56:                                              \
463
+           result = func_name(__VA_ARGS__, 56);          \
464
+           break;                                        \
465
+       case 57:                                              \
466
+           result = func_name(__VA_ARGS__, 57);          \
467
+           break;                                        \
468
+       case 58:                                              \
469
+           result = func_name(__VA_ARGS__, 58);          \
470
+           break;                                        \
471
+       case 59:                                              \
472
+           result = func_name(__VA_ARGS__, 59);          \
473
+           break;                                        \
474
+       case 60:                                              \
475
+           result = func_name(__VA_ARGS__, 60);          \
476
+           break;                                        \
477
+       case 61:                                              \
478
+           result = func_name(__VA_ARGS__, 61);          \
479
+           break;                                        \
480
+       case 62:                                              \
481
+           result = func_name(__VA_ARGS__, 62);          \
482
+           break;                                        \
483
+       case 63:                                              \
484
+           result = func_name(__VA_ARGS__, 63);          \
485
+           break;                                        \
486
+       default:                                              \
487
+           result = default_case;                        \
488
+           break;                                        \
489
+       }                                                     \
490
+   } while (0)
491
+
492
+#define SIMDE_CONSTIFY_2_NO_RESULT_(func_name, default_case, imm, ...) \
493
+   do {                                                           \
494
+       switch (imm) {                                         \
495
+       case 0:                                                \
496
+           func_name(__VA_ARGS__, 0);                     \
497
+           break;                                         \
498
+       case 1:                                                \
499
+           func_name(__VA_ARGS__, 1);                     \
500
+           break;                                         \
501
+       default:                                               \
502
+           default_case;                                  \
503
+           break;                                         \
504
+       }                                                      \
505
+   } while (0)
506
+
507
+#define SIMDE_CONSTIFY_4_NO_RESULT_(func_name, default_case, imm, ...) \
508
+   do {                                                           \
509
+       switch (imm) {                                         \
510
+       case 0:                                                \
511
+           func_name(__VA_ARGS__, 0);                     \
512
+           break;                                         \
513
+       case 1:                                                \
514
+           func_name(__VA_ARGS__, 1);                     \
515
+           break;                                         \
516
+       case 2:                                                \
517
+           func_name(__VA_ARGS__, 2);                     \
518
+           break;                                         \
519
+       case 3:                                                \
520
+           func_name(__VA_ARGS__, 3);                     \
521
+           break;                                         \
522
+       default:                                               \
523
+           default_case;                                  \
524
+           break;                                         \
525
+       }                                                      \
526
+   } while (0)
527
+
528
+#define SIMDE_CONSTIFY_8_NO_RESULT_(func_name, default_case, imm, ...) \
529
+   do {                                                           \
530
+       switch (imm) {                                         \
531
+       case 0:                                                \
532
+           func_name(__VA_ARGS__, 0);                     \
533
+           break;                                         \
534
+       case 1:                                                \
535
+           func_name(__VA_ARGS__, 1);                     \
536
+           break;                                         \
537
+       case 2:                                                \
538
+           func_name(__VA_ARGS__, 2);                     \
539
+           break;                                         \
540
+       case 3:                                                \
541
+           func_name(__VA_ARGS__, 3);                     \
542
+           break;                                         \
543
+       case 4:                                                \
544
+           func_name(__VA_ARGS__, 4);                     \
545
+           break;                                         \
546
+       case 5:                                                \
547
+           func_name(__VA_ARGS__, 5);                     \
548
+           break;                                         \
549
+       case 6:                                                \
550
+           func_name(__VA_ARGS__, 6);                     \
551
+           break;                                         \
552
+       case 7:                                                \
553
+           func_name(__VA_ARGS__, 7);                     \
554
+           break;                                         \
555
+       default:                                               \
556
+           default_case;                                  \
557
+           break;                                         \
558
+       }                                                      \
559
+   } while (0)
560
+
561
+#define SIMDE_CONSTIFY_16_NO_RESULT_(func_name, default_case, imm, ...) \
562
+   do {                                                            \
563
+       switch (imm) {                                          \
564
+       case 0:                                                 \
565
+           func_name(__VA_ARGS__, 0);                      \
566
+           break;                                          \
567
+       case 1:                                                 \
568
+           func_name(__VA_ARGS__, 1);                      \
569
+           break;                                          \
570
+       case 2:                                                 \
571
+           func_name(__VA_ARGS__, 2);                      \
572
+           break;                                          \
573
+       case 3:                                                 \
574
+           func_name(__VA_ARGS__, 3);                      \
575
+           break;                                          \
576
+       case 4:                                                 \
577
+           func_name(__VA_ARGS__, 4);                      \
578
+           break;                                          \
579
+       case 5:                                                 \
580
+           func_name(__VA_ARGS__, 5);                      \
581
+           break;                                          \
582
+       case 6:                                                 \
583
+           func_name(__VA_ARGS__, 6);                      \
584
+           break;                                          \
585
+       case 7:                                                 \
586
+           func_name(__VA_ARGS__, 7);                      \
587
+           break;                                          \
588
+       case 8:                                                 \
589
+           func_name(__VA_ARGS__, 8);                      \
590
+           break;                                          \
591
+       case 9:                                                 \
592
+           func_name(__VA_ARGS__, 9);                      \
593
+           break;                                          \
594
+       case 10:                                                \
595
+           func_name(__VA_ARGS__, 10);                     \
596
+           break;                                          \
597
+       case 11:                                                \
598
+           func_name(__VA_ARGS__, 11);                     \
599
+           break;                                          \
600
+       case 12:                                                \
601
+           func_name(__VA_ARGS__, 12);                     \
602
+           break;                                          \
603
+       case 13:                                                \
604
+           func_name(__VA_ARGS__, 13);                     \
605
+           break;                                          \
606
+       case 14:                                                \
607
+           func_name(__VA_ARGS__, 14);                     \
608
+           break;                                          \
609
+       case 15:                                                \
610
+           func_name(__VA_ARGS__, 15);                     \
611
+           break;                                          \
612
+       default:                                                \
613
+           default_case;                                   \
614
+           break;                                          \
615
+       }                                                       \
616
+   } while (0)
617
+
618
+#define SIMDE_CONSTIFY_32_NO_RESULT_(func_name, default_case, imm, ...) \
619
+   do {                                                            \
620
+       switch (imm) {                                          \
621
+       case 0:                                                 \
622
+           func_name(__VA_ARGS__, 0);                      \
623
+           break;                                          \
624
+       case 1:                                                 \
625
+           func_name(__VA_ARGS__, 1);                      \
626
+           break;                                          \
627
+       case 2:                                                 \
628
+           func_name(__VA_ARGS__, 2);                      \
629
+           break;                                          \
630
+       case 3:                                                 \
631
+           func_name(__VA_ARGS__, 3);                      \
632
+           break;                                          \
633
+       case 4:                                                 \
634
+           func_name(__VA_ARGS__, 4);                      \
635
+           break;                                          \
636
+       case 5:                                                 \
637
+           func_name(__VA_ARGS__, 5);                      \
638
+           break;                                          \
639
+       case 6:                                                 \
640
+           func_name(__VA_ARGS__, 6);                      \
641
+           break;                                          \
642
+       case 7:                                                 \
643
+           func_name(__VA_ARGS__, 7);                      \
644
+           break;                                          \
645
+       case 8:                                                 \
646
+           func_name(__VA_ARGS__, 8);                      \
647
+           break;                                          \
648
+       case 9:                                                 \
649
+           func_name(__VA_ARGS__, 9);                      \
650
+           break;                                          \
651
+       case 10:                                                \
652
+           func_name(__VA_ARGS__, 10);                     \
653
+           break;                                          \
654
+       case 11:                                                \
655
+           func_name(__VA_ARGS__, 11);                     \
656
+           break;                                          \
657
+       case 12:                                                \
658
+           func_name(__VA_ARGS__, 12);                     \
659
+           break;                                          \
660
+       case 13:                                                \
661
+           func_name(__VA_ARGS__, 13);                     \
662
+           break;                                          \
663
+       case 14:                                                \
664
+           func_name(__VA_ARGS__, 14);                     \
665
+           break;                                          \
666
+       case 15:                                                \
667
+           func_name(__VA_ARGS__, 15);                     \
668
+           break;                                          \
669
+       case 16:                                                \
670
+           func_name(__VA_ARGS__, 16);                     \
671
+           break;                                          \
672
+       case 17:                                                \
673
+           func_name(__VA_ARGS__, 17);                     \
674
+           break;                                          \
675
+       case 18:                                                \
676
+           func_name(__VA_ARGS__, 18);                     \
677
+           break;                                          \
678
+       case 19:                                                \
679
+           func_name(__VA_ARGS__, 19);                     \
680
+           break;                                          \
681
+       case 20:                                                \
682
+           func_name(__VA_ARGS__, 20);                     \
683
+           break;                                          \
684
+       case 21:                                                \
685
+           func_name(__VA_ARGS__, 21);                     \
686
+           break;                                          \
687
+       case 22:                                                \
688
+           func_name(__VA_ARGS__, 22);                     \
689
+           break;                                          \
690
+       case 23:                                                \
691
+           func_name(__VA_ARGS__, 23);                     \
692
+           break;                                          \
693
+       case 24:                                                \
694
+           func_name(__VA_ARGS__, 24);                     \
695
+           break;                                          \
696
+       case 25:                                                \
697
+           func_name(__VA_ARGS__, 25);                     \
698
+           break;                                          \
699
+       case 26:                                                \
700
+           func_name(__VA_ARGS__, 26);                     \
701
+           break;                                          \
702
+       case 27:                                                \
703
+           func_name(__VA_ARGS__, 27);                     \
704
+           break;                                          \
705
+       case 28:                                                \
706
+           func_name(__VA_ARGS__, 28);                     \
707
+           break;                                          \
708
+       case 29:                                                \
709
+           func_name(__VA_ARGS__, 29);                     \
710
+           break;                                          \
711
+       case 30:                                                \
712
+           func_name(__VA_ARGS__, 30);                     \
713
+           break;                                          \
714
+       case 31:                                                \
715
+           func_name(__VA_ARGS__, 31);                     \
716
+           break;                                          \
717
+       default:                                                \
718
+           default_case;                                   \
719
+           break;                                          \
720
+       }                                                       \
721
+   } while (0)
722
+
723
+#define SIMDE_CONSTIFY_64_NO_RESULT_(func_name, default_case, imm, ...) \
724
+   do {                                                            \
725
+       switch (imm) {                                          \
726
+       case 0:                                                 \
727
+           func_name(__VA_ARGS__, 0);                      \
728
+           break;                                          \
729
+       case 1:                                                 \
730
+           func_name(__VA_ARGS__, 1);                      \
731
+           break;                                          \
732
+       case 2:                                                 \
733
+           func_name(__VA_ARGS__, 2);                      \
734
+           break;                                          \
735
+       case 3:                                                 \
736
+           func_name(__VA_ARGS__, 3);                      \
737
+           break;                                          \
738
+       case 4:                                                 \
739
+           func_name(__VA_ARGS__, 4);                      \
740
+           break;                                          \
741
+       case 5:                                                 \
742
+           func_name(__VA_ARGS__, 5);                      \
743
+           break;                                          \
744
+       case 6:                                                 \
745
+           func_name(__VA_ARGS__, 6);                      \
746
+           break;                                          \
747
+       case 7:                                                 \
748
+           func_name(__VA_ARGS__, 7);                      \
749
+           break;                                          \
750
+       case 8:                                                 \
751
+           func_name(__VA_ARGS__, 8);                      \
752
+           break;                                          \
753
+       case 9:                                                 \
754
+           func_name(__VA_ARGS__, 9);                      \
755
+           break;                                          \
756
+       case 10:                                                \
757
+           func_name(__VA_ARGS__, 10);                     \
758
+           break;                                          \
759
+       case 11:                                                \
760
+           func_name(__VA_ARGS__, 11);                     \
761
+           break;                                          \
762
+       case 12:                                                \
763
+           func_name(__VA_ARGS__, 12);                     \
764
+           break;                                          \
765
+       case 13:                                                \
766
+           func_name(__VA_ARGS__, 13);                     \
767
+           break;                                          \
768
+       case 14:                                                \
769
+           func_name(__VA_ARGS__, 14);                     \
770
+           break;                                          \
771
+       case 15:                                                \
772
+           func_name(__VA_ARGS__, 15);                     \
773
+           break;                                          \
774
+       case 16:                                                \
775
+           func_name(__VA_ARGS__, 16);                     \
776
+           break;                                          \
777
+       case 17:                                                \
778
+           func_name(__VA_ARGS__, 17);                     \
779
+           break;                                          \
780
+       case 18:                                                \
781
+           func_name(__VA_ARGS__, 18);                     \
782
+           break;                                          \
783
+       case 19:                                                \
784
+           func_name(__VA_ARGS__, 19);                     \
785
+           break;                                          \
786
+       case 20:                                                \
787
+           func_name(__VA_ARGS__, 20);                     \
788
+           break;                                          \
789
+       case 21:                                                \
790
+           func_name(__VA_ARGS__, 21);                     \
791
+           break;                                          \
792
+       case 22:                                                \
793
+           func_name(__VA_ARGS__, 22);                     \
794
+           break;                                          \
795
+       case 23:                                                \
796
+           func_name(__VA_ARGS__, 23);                     \
797
+           break;                                          \
798
+       case 24:                                                \
799
+           func_name(__VA_ARGS__, 24);                     \
800
+           break;                                          \
801
+       case 25:                                                \
802
+           func_name(__VA_ARGS__, 25);                     \
803
+           break;                                          \
804
+       case 26:                                                \
805
+           func_name(__VA_ARGS__, 26);                     \
806
+           break;                                          \
807
+       case 27:                                                \
808
+           func_name(__VA_ARGS__, 27);                     \
809
+           break;                                          \
810
+       case 28:                                                \
811
+           func_name(__VA_ARGS__, 28);                     \
812
+           break;                                          \
813
+       case 29:                                                \
814
+           func_name(__VA_ARGS__, 29);                     \
815
+           break;                                          \
816
+       case 30:                                                \
817
+           func_name(__VA_ARGS__, 30);                     \
818
+           break;                                          \
819
+       case 31:                                                \
820
+           func_name(__VA_ARGS__, 31);                     \
821
+           break;                                          \
822
+       case 32:                                                \
823
+           func_name(__VA_ARGS__, 32);                     \
824
+           break;                                          \
825
+       case 33:                                                \
826
+           func_name(__VA_ARGS__, 33);                     \
827
+           break;                                          \
828
+       case 34:                                                \
829
+           func_name(__VA_ARGS__, 34);                     \
830
+           break;                                          \
831
+       case 35:                                                \
832
+           func_name(__VA_ARGS__, 35);                     \
833
+           break;                                          \
834
+       case 36:                                                \
835
+           func_name(__VA_ARGS__, 36);                     \
836
+           break;                                          \
837
+       case 37:                                                \
838
+           func_name(__VA_ARGS__, 37);                     \
839
+           break;                                          \
840
+       case 38:                                                \
841
+           func_name(__VA_ARGS__, 38);                     \
842
+           break;                                          \
843
+       case 39:                                                \
844
+           func_name(__VA_ARGS__, 39);                     \
845
+           break;                                          \
846
+       case 40:                                                \
847
+           func_name(__VA_ARGS__, 40);                     \
848
+           break;                                          \
849
+       case 41:                                                \
850
+           func_name(__VA_ARGS__, 41);                     \
851
+           break;                                          \
852
+       case 42:                                                \
853
+           func_name(__VA_ARGS__, 42);                     \
854
+           break;                                          \
855
+       case 43:                                                \
856
+           func_name(__VA_ARGS__, 43);                     \
857
+           break;                                          \
858
+       case 44:                                                \
859
+           func_name(__VA_ARGS__, 44);                     \
860
+           break;                                          \
861
+       case 45:                                                \
862
+           func_name(__VA_ARGS__, 45);                     \
863
+           break;                                          \
864
+       case 46:                                                \
865
+           func_name(__VA_ARGS__, 46);                     \
866
+           break;                                          \
867
+       case 47:                                                \
868
+           func_name(__VA_ARGS__, 47);                     \
869
+           break;                                          \
870
+       case 48:                                                \
871
+           func_name(__VA_ARGS__, 48);                     \
872
+           break;                                          \
873
+       case 49:                                                \
874
+           func_name(__VA_ARGS__, 49);                     \
875
+           break;                                          \
876
+       case 50:                                                \
877
+           func_name(__VA_ARGS__, 50);                     \
878
+           break;                                          \
879
+       case 51:                                                \
880
+           func_name(__VA_ARGS__, 51);                     \
881
+           break;                                          \
882
+       case 52:                                                \
883
+           func_name(__VA_ARGS__, 52);                     \
884
+           break;                                          \
885
+       case 53:                                                \
886
+           func_name(__VA_ARGS__, 53);                     \
887
+           break;                                          \
888
+       case 54:                                                \
889
+           func_name(__VA_ARGS__, 54);                     \
890
+           break;                                          \
891
+       case 55:                                                \
892
+           func_name(__VA_ARGS__, 55);                     \
893
+           break;                                          \
894
+       case 56:                                                \
895
+           func_name(__VA_ARGS__, 56);                     \
896
+           break;                                          \
897
+       case 57:                                                \
898
+           func_name(__VA_ARGS__, 57);                     \
899
+           break;                                          \
900
+       case 58:                                                \
901
+           func_name(__VA_ARGS__, 58);                     \
902
+           break;                                          \
903
+       case 59:                                                \
904
+           func_name(__VA_ARGS__, 59);                     \
905
+           break;                                          \
906
+       case 60:                                                \
907
+           func_name(__VA_ARGS__, 60);                     \
908
+           break;                                          \
909
+       case 61:                                                \
910
+           func_name(__VA_ARGS__, 61);                     \
911
+           break;                                          \
912
+       case 62:                                                \
913
+           func_name(__VA_ARGS__, 62);                     \
914
+           break;                                          \
915
+       case 63:                                                \
916
+           func_name(__VA_ARGS__, 63);                     \
917
+           break;                                          \
918
+       default:                                                \
919
+           default_case;                                   \
920
+           break;                                          \
921
+       }                                                       \
922
+   } while (0)
923
+
924
+HEDLEY_DIAGNOSTIC_POP
925
+
926
+#endif
927
obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-detect-clang.h Added
116
 
1
@@ -0,0 +1,114 @@
2
+/* Detect Clang Version
3
+ * Created by Evan Nemerson <evan@nemerson.com>
4
+ *
5
+ * To the extent possible under law, the author(s) have dedicated all
6
+ * copyright and related and neighboring rights to this software to
7
+ * the public domain worldwide. This software is distributed without
8
+ * any warranty.
9
+ *
10
+ * For details, see <http://creativecommons.org/publicdomain/zero/1.0/>.
11
+ * SPDX-License-Identifier: CC0-1.0
12
+ */
13
+
14
+/* This file was originally part of SIMDe
15
+ * (<https://github.com/simd-everywhere/simde>).  You're free to do with it as
16
+ * you please, but I do have a few small requests:
17
+ *
18
+ *  * If you make improvements, please submit them back to SIMDe
19
+ *    (at <https://github.com/simd-everywhere/simde/issues>) so others can
20
+ *    benefit from them.
21
+ *  * Please keep a link to SIMDe intact so people know where to submit
22
+ *    improvements.
23
+ *  * If you expose it publicly, please change the SIMDE_ prefix to
24
+ *    something specific to your project.
25
+ *
26
+ * The version numbers clang exposes (in the __clang_major__,
27
+ * __clang_minor__, and __clang_patchlevel__ macros) are unreliable.
28
+ * Vendors such as Apple will define these values to their version
29
+ * numbers; for example, "Apple Clang 4.0" is really clang 3.1, but
30
+ * __clang_major__ and __clang_minor__ are defined to 4 and 0
31
+ * respectively, instead of 3 and 1.
32
+ *
33
+ * The solution is *usually* to use clang's feature detection macros
34
+ * (<https://clang.llvm.org/docs/LanguageExtensions.html#feature-checking-macros>)
35
+ * to determine if the feature you're interested in is available.  This
36
+ * generally works well, and it should probably be the first thing you
37
+ * try.  Unfortunately, it's not possible to check for everything.  In
38
+ * particular, compiler bugs.
39
+ *
40
+ * This file just uses the feature checking macros to detect features
41
+ * added in specific versions of clang to identify which version of
42
+ * clang the compiler is based on.
43
+ *
44
+ * Right now it only goes back to 3.6, but I'm happy to accept patches
45
+ * to go back further.  And, of course, newer versions are welcome if
46
+ * they're not already present, and if you find a way to detect a point
47
+ * release that would be great, too!
48
+ */
49
+
50
+#if !defined(SIMDE_DETECT_CLANG_H)
51
+#define SIMDE_DETECT_CLANG_H 1
52
+
53
+/* Attempt to detect the upstream clang version number.  I usually only
54
+ * worry about major version numbers (at least for 4.0+), but if you
55
+ * need more resolution I'm happy to accept patches that are able to
56
+ * detect minor versions as well.  That said, you'll probably have a
57
+ * hard time with detection since AFAIK most minor releases don't add
58
+ * anything we can detect. */
59
+
60
+#if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION)
61
+#if __has_warning("-Wformat-insufficient-args")
62
+#define SIMDE_DETECT_CLANG_VERSION 120000
63
+#elif __has_warning("-Wimplicit-const-int-float-conversion")
64
+#define SIMDE_DETECT_CLANG_VERSION 110000
65
+#elif __has_warning("-Wmisleading-indentation")
66
+#define SIMDE_DETECT_CLANG_VERSION 100000
67
+#elif defined(__FILE_NAME__)
68
+#define SIMDE_DETECT_CLANG_VERSION 90000
69
+#elif __has_warning("-Wextra-semi-stmt") || \
70
+   __has_builtin(__builtin_rotateleft32)
71
+#define SIMDE_DETECT_CLANG_VERSION 80000
72
+#elif __has_warning("-Wc++98-compat-extra-semi")
73
+#define SIMDE_DETECT_CLANG_VERSION 70000
74
+#elif __has_warning("-Wpragma-pack")
75
+#define SIMDE_DETECT_CLANG_VERSION 60000
76
+#elif __has_warning("-Wbitfield-enum-conversion")
77
+#define SIMDE_DETECT_CLANG_VERSION 50000
78
+#elif __has_attribute(diagnose_if)
79
+#define SIMDE_DETECT_CLANG_VERSION 40000
80
+#elif __has_warning("-Wcast-calling-convention")
81
+#define SIMDE_DETECT_CLANG_VERSION 30900
82
+#elif __has_warning("-WCL4")
83
+#define SIMDE_DETECT_CLANG_VERSION 30800
84
+#elif __has_warning("-WIndependentClass-attribute")
85
+#define SIMDE_DETECT_CLANG_VERSION 30700
86
+#elif __has_warning("-Wambiguous-ellipsis")
87
+#define SIMDE_DETECT_CLANG_VERSION 30600
88
+#else
89
+#define SIMDE_DETECT_CLANG_VERSION 1
90
+#endif
91
+#endif /* defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) */
92
+
93
+/* The SIMDE_DETECT_CLANG_VERSION_CHECK macro is pretty
94
+ * straightforward; it returns true if the compiler is a derivative
95
+ * of clang >= the specified version.
96
+ *
97
+ * Since this file is often (primarily?) useful for working around bugs
98
+ * it is also helpful to have a macro which returns true only if the
99
+ * compiler is a version of clang *older* than the specified version to
100
+ * make it a bit easier to ifdef regions to add code for older versions,
101
+ * such as pragmas to disable a specific warning. */
102
+
103
+#if defined(SIMDE_DETECT_CLANG_VERSION)
104
+#define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) \
105
+   (SIMDE_DETECT_CLANG_VERSION >=                           \
106
+    ((major * 10000) + (minor * 100) + (revision)))
107
+#define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) \
108
+   (SIMDE_DETECT_CLANG_VERSION <                          \
109
+    ((major * 10000) + (minor * 100) + (revision)))
110
+#else
111
+#define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0)
112
+#define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (1)
113
+#endif
114
+
115
+#endif /* !defined(SIMDE_DETECT_CLANG_H) */
116
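To illustrate how the two check macros above are meant to be consumed, here is a minimal, hypothetical sketch (not part of the OBS or SIMDe sources; it only assumes simde-detect-clang.h is on the include path):

#include <stdio.h>
#include "simde-detect-clang.h"

int main(void)
{
	/* True on clang 9+ derivatives (including vendor forks such as
	 * Apple clang), false on older clang and on non-clang compilers. */
#if SIMDE_DETECT_CLANG_VERSION_CHECK(9, 0, 0)
	printf("clang >= 9 derivative\n");
#elif defined(__clang__) && SIMDE_DETECT_CLANG_VERSION_NOT(9, 0, 0)
	printf("older clang derivative\n");
#else
	printf("not clang\n");
#endif
	return 0;
}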
obs-studio-26.1.0.tar.xz/libobs/util/simde/simde-diagnostic.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-diagnostic.h Changed
262
 
1
@@ -45,8 +45,10 @@
2
  */
3
 
4
 #if !defined(SIMDE_DIAGNOSTIC_H)
5
+#define SIMDE_DIAGNOSTIC_H
6
 
7
 #include "hedley.h"
8
+#include "simde-detect-clang.h"
9
 
10
 /* This is only to help us implement functions like _mm_undefined_ps. */
11
 #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
12
@@ -119,6 +121,9 @@
13
 #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_
14
 #endif
15
 
16
+/* MSVC emits a diagnostic when we call a function (like
17
+ * simde_mm_set_epi32) while initializing a struct.  We currently do
18
+ * this a *lot* in the tests. */
19
 #if defined(HEDLEY_MSVC_VERSION)
20
 #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \
21
    __pragma(warning(disable : 4204))
22
@@ -183,6 +188,32 @@
23
 #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
24
 #endif
25
 
26
+/* emscripten requires us to use a __wasm_unimplemented_simd128__ macro
27
+ * before we can access certain SIMD intrinsics, but this diagnostic
28
+ * warns about it being a reserved name.  It is a reserved name, but
29
+ * it's reserved for the compiler and we are using it to convey
30
+ * information to the compiler.
31
+ *
32
+ * This is also used when enabling native aliases since we don't get to
33
+ * choose the macro names. */
34
+#if HEDLEY_HAS_WARNING("-Wdouble-promotion")
35
+#define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ \
36
+   _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
37
+#else
38
+#define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
39
+#endif
40
+
41
+/* clang 3.8 warns about the packed attribute being unnecessary when
42
+ * used in the _mm_loadu_* functions.  That *may* be true for version
43
+ * 3.8, but for later versions it is crucial in order to make unaligned
44
+ * access safe. */
45
+#if HEDLEY_HAS_WARNING("-Wpacked")
46
+#define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ \
47
+   _Pragma("clang diagnostic ignored \"-Wpacked\"")
48
+#else
49
+#define SIMDE_DIAGNOSTIC_DISABLE_PACKED_
50
+#endif
51
+
52
 /* Triggered when assigning a float to a double implicitly.  We use
53
  * explicit casts in SIMDe, this is only used in the test suite. */
54
 #if HEDLEY_HAS_WARNING("-Wdouble-promotion")
55
@@ -194,7 +225,7 @@
56
 
57
 /* Several compilers treat conformant array parameters as VLAs.  We
58
  * test to make sure we're in C mode (C++ doesn't support CAPs), and
59
- * that the version of the standard supports CAPs.  We also blacklist
60
+ * that the version of the standard supports CAPs.  We also reject
61
  * some buggy compilers like MSVC (the logic is in Hedley if you want
62
  * to take a look), but with certain warnings enabled some compilers
63
  * still like to emit a diagnostic. */
64
@@ -221,6 +252,9 @@
65
 #elif HEDLEY_GCC_VERSION_CHECK(3, 4, 0)
66
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
67
    _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
68
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) /* Likely goes back further */
69
+#define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_ \
70
+   __pragma(warning(disable : 4505))
71
 #else
72
 #define SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_
73
 #endif
74
@@ -232,13 +266,63 @@
75
 #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_
76
 #endif
77
 
78
-/* https://github.com/nemequ/simde/issues/277 */
79
+#if HEDLEY_HAS_WARNING("-Wpadded")
80
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ \
81
+   _Pragma("clang diagnostic ignored \"-Wpadded\"")
82
+#elif HEDLEY_MSVC_VERSION_CHECK(19, 0, 0) /* Likely goes back further */
83
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ __pragma(warning(disable : 4324))
84
+#else
85
+#define SIMDE_DIAGNOSTIC_DISABLE_PADDED_
86
+#endif
87
+
88
+#if HEDLEY_HAS_WARNING("-Wzero-as-null-pointer-constant")
89
+#define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ \
90
+   _Pragma("clang diagnostic ignored \"-Wzero-as-null-pointer-constant\"")
91
+#else
92
+#define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_
93
+#endif
94
+
95
+#if HEDLEY_HAS_WARNING("-Wold-style-cast")
96
+#define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ \
97
+   _Pragma("clang diagnostic ignored \"-Wold-style-cast\"")
98
+#else
99
+#define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_
100
+#endif
101
+
102
+#if HEDLEY_HAS_WARNING("-Wcast-function-type") || \
103
+   HEDLEY_GCC_VERSION_CHECK(8, 0, 0)
104
+#define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ \
105
+   _Pragma("GCC diagnostic ignored \"-Wcast-function-type\"")
106
+#else
107
+#define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_
108
+#endif
109
+
110
+/* clang will emit this warning when we use C99 extensions when not in
111
+ * C99 mode, even though it does support this.  In such cases we check
112
+ * the compiler and version first, so we know it's not a problem. */
113
+#if HEDLEY_HAS_WARNING("-Wc99-extensions")
114
+#define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ \
115
+   _Pragma("clang diagnostic ignored \"-Wc99-extensions\"")
116
+#else
117
+#define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_
118
+#endif
119
+
120
+/* https://github.com/simd-everywhere/simde/issues/277 */
121
 #if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4, 6, 0) && \
122
-   !HEDLEY_GCC_VERSION_CHECK(6, 0, 0) && defined(__cplusplus)
123
-#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE \
124
+   !HEDLEY_GCC_VERSION_CHECK(6, 4, 0) && defined(__cplusplus)
125
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ \
126
    _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
127
 #else
128
-#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
129
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_
130
+#endif
131
+
132
+/* This is the warning that you normally define _CRT_SECURE_NO_WARNINGS
133
+ * to silence, but you have to do that before including anything and
134
+ * that would require reordering includes. */
135
+#if defined(_MSC_VER)
136
+#define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ __pragma(warning(disable : 4996))
137
+#else
138
+#define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_
139
 #endif
140
 
141
 /* Some compilers, such as clang, may use `long long` for 64-bit
142
@@ -246,13 +330,104 @@
143
  * -Wc++98-compat-pedantic which says 'long long' is incompatible with
144
  * C++98. */
145
 #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic")
146
-#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC \
147
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \
148
    _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"")
149
 #else
150
-#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC
151
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_
152
+#endif
153
+
154
+/* Same problem as above */
155
+#if HEDLEY_HAS_WARNING("-Wc++11-long-long")
156
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ \
157
+   _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"")
158
+#else
159
+#define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_
160
+#endif
161
+
162
+/* emscripten emits this whenever stdin/stdout/stderr is used in a
163
+ * macro. */
164
+#if HEDLEY_HAS_WARNING("-Wdisabled-macro-expansion")
165
+#define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ \
166
+   _Pragma("clang diagnostic ignored \"-Wdisabled-macro-expansion\"")
167
+#else
168
+#define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_
169
+#endif
170
+
171
+/* Clang uses C11 generic selections to implement some AltiVec
172
+ * functions, which triggers this diagnostic when not compiling
173
+ * in C11 mode */
174
+#if HEDLEY_HAS_WARNING("-Wc11-extensions")
175
+#define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ \
176
+   _Pragma("clang diagnostic ignored \"-Wc11-extensions\"")
177
+#else
178
+#define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
179
+#endif
180
+
181
+/* Clang sometimes triggers this warning in macros in the AltiVec and
182
+ * NEON headers, or due to missing functions. */
183
+#if HEDLEY_HAS_WARNING("-Wvector-conversion")
184
+#define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ \
185
+   _Pragma("clang diagnostic ignored \"-Wvector-conversion\"")
186
+/* For NEON, the situation with -Wvector-conversion in clang < 10 is
187
+   * bad enough that we just disable the warning altogether. */
188
+#if defined(SIMDE_ARCH_ARM) && SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
189
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ \
190
+   SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
191
+#endif
192
+#else
193
+#define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
194
+#endif
195
+#if !defined(SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_)
196
+#define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
197
+#endif
198
+
199
+/* SLEEF triggers this a *lot* in their headers */
200
+#if HEDLEY_HAS_WARNING("-Wignored-qualifiers")
201
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ \
202
+   _Pragma("clang diagnostic ignored \"-Wignored-qualifiers\"")
203
+#elif HEDLEY_GCC_VERSION_CHECK(4, 3, 0)
204
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ \
205
+   _Pragma("GCC diagnostic ignored \"-Wignored-qualifiers\"")
206
+#else
207
+#define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
208
+#endif
209
+
210
+/* GCC emits this under some circumstances when using __int128 */
211
+#if HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
212
+#define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ \
213
+   _Pragma("GCC diagnostic ignored \"-Wpedantic\"")
214
+#else
215
+#define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_
216
+#endif
217
+
218
+/* MSVC doesn't like (__assume(0), code) and will warn about code being
219
+ * unreachable, but we want it there because not all compilers
220
+ * understand the unreachable macro and will complain if it is missing.
221
+ * I'm planning on adding a new macro to Hedley to handle this a bit
222
+ * more elegantly, but until then... */
223
+#if defined(HEDLEY_MSVC_VERSION)
224
+#define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable : 4702))
225
+#else
226
+#define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_
227
+#endif
228
+
229
+/* This is a false positive from GCC in a few places. */
230
+#if HEDLEY_GCC_VERSION_CHECK(4, 7, 0)
231
+#define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ \
232
+   _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"")
233
+#else
234
+#define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
235
+#endif
236
+
237
+#if defined(SIMDE_ENABLE_NATIVE_ALIASES)
238
+#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \
239
+   SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
240
+#else
241
+#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_
242
 #endif
243
 
244
 #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS                           \
245
+   SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_           \
246
    SIMDE_DIAGNOSTIC_DISABLE_PSABI_                              \
247
    SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_                \
248
    SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_             \
249
@@ -264,7 +439,9 @@
250
    SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_             \
251
    SIMDE_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION_                    \
252
    SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_                        \
253
-   SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC               \
254
-   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE
255
+   SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_              \
256
+   SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_                    \
257
+   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_      \
258
+   SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_
259
 
260
-#endif
261
+#endif /* !defined(SIMDE_DIAGNOSTIC_H) */
262
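The macros above are designed to be expanded between a Hedley push/pop pair so the suppressions stay scoped to SIMDe's own headers. A minimal sketch of that pattern, assuming hedley.h and simde-diagnostic.h are on the include path (the helper function is purely illustrative):

#include "hedley.h"
#include "simde-diagnostic.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS

/* Code that would otherwise trip pedantic warnings (e.g. an unused
 * static helper) compiles quietly inside the push/pop region. */
static int simde_example_unused_helper(int x)
{
	return x + 1;
}

HEDLEY_DIAGNOSTIC_POP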
obs-studio-26.1.0.tar.xz/libobs/util/simde/simde-features.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-features.h Changed
292
 
1
@@ -32,6 +32,7 @@
2
 #define SIMDE_FEATURES_H
3
 
4
 #include "simde-arch.h"
5
+#include "simde-diagnostic.h"
6
 
7
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
8
    !defined(SIMDE_NO_NATIVE)
9
@@ -43,6 +44,28 @@
10
 #define SIMDE_X86_AVX512F_NATIVE
11
 #endif
12
 
13
+#if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) &&        \
14
+   !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && \
15
+   !defined(SIMDE_NO_NATIVE)
16
+#if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT)
17
+#define SIMDE_X86_AVX512VP2INTERSECT_NATIVE
18
+#endif
19
+#endif
20
+#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && \
21
+   !defined(SIMDE_X86_AVX512F_NATIVE)
22
+#define SIMDE_X86_AVX512F_NATIVE
23
+#endif
24
+
25
+#if !defined(SIMDE_X86_AVX512VBMI_NATIVE) && \
26
+   !defined(SIMDE_X86_AVX512VBMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
27
+#if defined(SIMDE_ARCH_X86_AVX512VBMI)
28
+#define SIMDE_X86_AVX512VBMI_NATIVE
29
+#endif
30
+#endif
31
+#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)
32
+#define SIMDE_X86_AVX512F_NATIVE
33
+#endif
34
+
35
 #if !defined(SIMDE_X86_AVX512CD_NATIVE) && \
36
    !defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
37
 #if defined(SIMDE_ARCH_X86_AVX512CD)
38
@@ -194,6 +217,20 @@
39
 #endif
40
 #endif
41
 
42
+#if !defined(SIMDE_X86_PCLMUL_NATIVE) && \
43
+   !defined(SIMDE_X86_PCLMUL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
44
+#if defined(SIMDE_ARCH_X86_PCLMUL)
45
+#define SIMDE_X86_PCLMUL_NATIVE
46
+#endif
47
+#endif
48
+
49
+#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && \
50
+   !defined(SIMDE_X86_VPCLMULQDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
51
+#if defined(SIMDE_ARCH_X86_VPCLMULQDQ)
52
+#define SIMDE_X86_VPCLMULQDQ_NATIVE
53
+#endif
54
+#endif
55
+
56
 #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && \
57
    !defined(SIMDE_NO_NATIVE)
58
 #if defined(__INTEL_COMPILER)
59
@@ -206,8 +243,7 @@
60
 #pragma warning(disable : 4799)
61
 #endif
62
 
63
-#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || \
64
-   defined(SIMDE_X86_SVML_NATIVE)
65
+#if defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE)
66
 #include <immintrin.h>
67
 #elif defined(SIMDE_X86_SSE4_2_NATIVE)
68
 #include <nmmintrin.h>
69
@@ -243,7 +279,8 @@
70
 
71
 #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
72
    !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
73
-#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80)
74
+#if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(80) && \
75
+   (__ARM_NEON_FP & 0x02)
76
 #define SIMDE_ARM_NEON_A32V8_NATIVE
77
 #endif
78
 #endif
79
@@ -262,6 +299,14 @@
80
 #include <arm_neon.h>
81
 #endif
82
 
83
+#if !defined(SIMDE_ARM_SVE_NATIVE) && !defined(SIMDE_ARM_SVE_NO_NATIVE) && \
84
+   !defined(SIMDE_NO_NATIVE)
85
+#if defined(SIMDE_ARCH_ARM_SVE)
86
+#define SIMDE_ARM_SVE_NATIVE
87
+#include <arm_sve.h>
88
+#endif
89
+#endif
90
+
91
 #if !defined(SIMDE_WASM_SIMD128_NATIVE) && \
92
    !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE)
93
 #if defined(SIMDE_ARCH_WASM_SIMD128)
94
@@ -270,7 +315,10 @@
95
 #endif
96
 #if defined(SIMDE_WASM_SIMD128_NATIVE)
97
 #if !defined(__wasm_unimplemented_simd128__)
98
+HEDLEY_DIAGNOSTIC_PUSH
99
+SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_
100
 #define __wasm_unimplemented_simd128__
101
+HEDLEY_DIAGNOSTIC_POP
102
 #endif
103
 #include <wasm_simd128.h>
104
 #endif
105
@@ -326,15 +374,28 @@
106
 #define SIMDE_POWER_ALTIVEC_P5_NATIVE
107
 #endif
108
 #endif
109
-#if defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
110
-/* stdbool.h conflicts with the bool in altivec.h */
111
-#if defined(bool) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF_BOOL_)
112
+
113
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
114
+/* AltiVec conflicts with lots of stuff.  The bool keyword conflicts
115
+   * with the bool keyword in C++ and the bool macro in C99+ (defined
116
+   * in stdbool.h).  The vector keyword conflicts with std::vector in
117
+   * C++ if you are `using namespace std;`.
118
+   *
119
+   * Luckily AltiVec allows you to use `__vector`/`__bool`/`__pixel`
120
+   * instead, but altivec.h will unconditionally define
121
+   * `vector`/`bool`/`pixel` so we need to work around that.
122
+   *
123
+   * Unfortunately this means that if your code uses AltiVec directly
124
+   * it may break.  If this is the case you'll want to define
125
+   * `SIMDE_POWER_ALTIVEC_NO_UNDEF` before including SIMDe.  Or, even
126
+   * better, port your code to use the double-underscore versions. */
127
+#if defined(bool)
128
 #undef bool
129
 #endif
130
+
131
 #include <altivec.h>
132
-/* GCC allows you to undefine these macros to prevent conflicts with
133
-   * standard types as they become context-sensitive keywords. */
134
-#if defined(__cplusplus)
135
+
136
+#if !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
137
 #if defined(vector)
138
 #undef vector
139
 #endif
140
@@ -344,14 +405,146 @@
141
 #if defined(bool)
142
 #undef bool
143
 #endif
144
-#define SIMDE_POWER_ALTIVEC_VECTOR(T) vector T
145
-#define SIMDE_POWER_ALTIVEC_PIXEL pixel
146
-#define SIMDE_POWER_ALTIVEC_BOOL bool
147
-#else
148
+#endif /* !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) */
149
+
150
+/* Use these instead of vector/pixel/bool in SIMDe. */
151
 #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T
152
 #define SIMDE_POWER_ALTIVEC_PIXEL __pixel
153
 #define SIMDE_POWER_ALTIVEC_BOOL __bool
154
-#endif /* defined(__cplusplus) */
155
+
156
+/* Re-define bool if we're using stdbool.h */
157
+#if !defined(__cplusplus) && defined(__bool_true_false_are_defined) && \
158
+   !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF)
159
+#define bool _Bool
160
+#endif
161
+#endif
162
+
163
+#if !defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) &&        \
164
+   !defined(SIMDE_MIPS_LOONGSON_MMI_NO_NATIVE) && \
165
+   !defined(SIMDE_NO_NATIVE)
166
+#if defined(SIMDE_ARCH_MIPS_LOONGSON_MMI)
167
+#define SIMDE_MIPS_LOONGSON_MMI_NATIVE 1
168
+#endif
169
+#endif
170
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
171
+#include <loongson-mmiintrin.h>
172
+#endif
173
+
174
+/* This is used to determine whether or not to fall back on a vector
175
+ * function from an earlier ISA extension, as well as whether
176
+ * we expect any attempts at vectorization to be fruitful or if we
177
+ * expect to always be running serial code. */
178
+
179
+#if !defined(SIMDE_NATURAL_VECTOR_SIZE)
180
+#if defined(SIMDE_X86_AVX512F_NATIVE)
181
+#define SIMDE_NATURAL_VECTOR_SIZE (512)
182
+#elif defined(SIMDE_X86_AVX_NATIVE)
183
+#define SIMDE_NATURAL_VECTOR_SIZE (256)
184
+#elif defined(SIMDE_X86_SSE_NATIVE) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \
185
+   defined(SIMDE_WASM_SIMD128_NATIVE) ||                                  \
186
+   defined(SIMDE_POWER_ALTIVEC_P5_NATIVE)
187
+#define SIMDE_NATURAL_VECTOR_SIZE (128)
188
+#endif
189
+
190
+#if !defined(SIMDE_NATURAL_VECTOR_SIZE)
191
+#define SIMDE_NATURAL_VECTOR_SIZE (0)
192
+#endif
193
+#endif
194
+
195
+#define SIMDE_NATURAL_VECTOR_SIZE_LE(x) \
196
+   ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x)))
197
+#define SIMDE_NATURAL_VECTOR_SIZE_GE(x) \
198
+   ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x)))
199
+
200
+/* Native aliases */
201
+#if defined(SIMDE_ENABLE_NATIVE_ALIASES)
202
+#if !defined(SIMDE_X86_MMX_NATIVE)
203
+#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
204
+#endif
205
+#if !defined(SIMDE_X86_SSE_NATIVE)
206
+#define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES
207
+#endif
208
+#if !defined(SIMDE_X86_SSE2_NATIVE)
209
+#define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES
210
+#endif
211
+#if !defined(SIMDE_X86_SSE3_NATIVE)
212
+#define SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES
213
+#endif
214
+#if !defined(SIMDE_X86_SSSE3_NATIVE)
215
+#define SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES
216
+#endif
217
+#if !defined(SIMDE_X86_SSE4_1_NATIVE)
218
+#define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES
219
+#endif
220
+#if !defined(SIMDE_X86_SSE4_2_NATIVE)
221
+#define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES
222
+#endif
223
+#if !defined(SIMDE_X86_AVX_NATIVE)
224
+#define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES
225
+#endif
226
+#if !defined(SIMDE_X86_AVX2_NATIVE)
227
+#define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES
228
+#endif
229
+#if !defined(SIMDE_X86_FMA_NATIVE)
230
+#define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES
231
+#endif
232
+#if !defined(SIMDE_X86_AVX512F_NATIVE)
233
+#define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES
234
+#endif
235
+#if !defined(SIMDE_X86_AVX512VL_NATIVE)
236
+#define SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES
237
+#endif
238
+#if !defined(SIMDE_X86_AVX512BW_NATIVE)
239
+#define SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES
240
+#endif
241
+#if !defined(SIMDE_X86_AVX512DQ_NATIVE)
242
+#define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES
243
+#endif
244
+#if !defined(SIMDE_X86_AVX512CD_NATIVE)
245
+#define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES
246
+#endif
247
+#if !defined(SIMDE_X86_GFNI_NATIVE)
248
+#define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES
249
+#endif
250
+#if !defined(SIMDE_X86_PCLMUL_NATIVE)
251
+#define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
252
+#endif
253
+#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE)
254
+#define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES
255
+#endif
256
+
257
+#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE)
258
+#define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES
259
+#endif
260
+#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE)
261
+#define SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES
262
+#endif
263
+#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE)
264
+#define SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES
265
+#endif
266
+#endif
267
+
268
+/* Are floating point values stored using IEEE 754?  Knowing
269
+ * this during preprocessing is a bit tricky, mostly because what
270
+ * we're curious about is how values are stored and not whether the
271
+ * implementation is fully conformant in terms of rounding, NaN
272
+ * handling, etc.
273
+ *
274
+ * For example, if you use -ffast-math or -Ofast on
275
+ * GCC or clang, IEEE 754 isn't strictly followed, therefore IEEE 754
276
+ * support is not advertised (by defining __STDC_IEC_559__).
277
+ *
278
+ * However, what we care about is whether it is safe to assume that
279
+ * floating point values are stored in IEEE 754 format, in which case
280
+ * we can provide faster implementations of some functions.
281
+ *
282
+ * Luckily every vaguely modern architecture I'm aware of uses IEEE 754,
283
+ * so we just assume IEEE 754 for now.  There is a test which verifies
284
+ * this; if that test fails somewhere please let us know and we'll add
285
+ * an exception for that platform.  Meanwhile, you can define
286
+ * SIMDE_NO_IEEE754_STORAGE. */
287
+#if !defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_NO_IEE754_STORAGE)
288
+#define SIMDE_IEEE754_STORAGE
289
 #endif
290
 
291
 #endif /* !defined(SIMDE_FEATURES_H) */
292
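As a rough sketch of how the feature probes above are consumed downstream (the function name and the chosen widths are hypothetical, not taken from SIMDe or OBS):

#include <stddef.h>
#include "simde-features.h"

/* Pick a processing granularity from the detected natural vector size;
 * a size of 0 means no usable SIMD extension was detected at all. */
static size_t example_block_elems(void)
{
#if SIMDE_NATURAL_VECTOR_SIZE_GE(256)
	return 8; /* e.g. 8 floats per 256-bit register */
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
	return 4; /* e.g. 4 floats per 128-bit register */
#else
	return 1; /* scalar fallback */
#endif
}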
obs-studio-26.1.0.tar.xz/libobs/util/simde/simde-math.h -> obs-studio-26.1.1.tar.xz/libobs/util/simde/simde-math.h Changed
674
 
1
@@ -34,6 +34,58 @@
2
 #include "hedley.h"
3
 #include "simde-features.h"
4
 
5
+#include <stdint.h>
6
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7
+#include <arm_neon.h>
8
+#endif
9
+
10
+HEDLEY_DIAGNOSTIC_PUSH
11
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
12
+
13
+/* SLEEF support
14
+ * https://sleef.org/
15
+ *
16
+ * If you include <sleef.h> prior to including SIMDe, SIMDe will use
17
+ * SLEEF.  You can also define SIMDE_MATH_SLEEF_ENABLE prior to
18
+ * including SIMDe to force the issue.
19
+ *
20
+ * Note that SLEEF does require linking to libsleef.
21
+ *
22
+ * By default, SIMDe will use the 1 ULP functions, but if you use
23
+ * SIMDE_ACCURACY_PREFERENCE of 0 we will use up to 4 ULP.  This is
24
+ * only the case for the simde_math_* functions; for code in other
25
+ * SIMDe headers which calls SLEEF directly we may use functions with
26
+ * greater error if the API we're implementing is less precise (for
27
+ * example, SVML guarantees 4 ULP, so we will generally use the 3.5
28
+ * ULP functions from SLEEF). */
29
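A hedged sketch of the opt-in described above (it assumes libsleef and its header are installed and that the SIMDe headers are on the include path; link roughly with -lsleef -lm):

/* Either include <sleef.h> first or define SIMDE_MATH_SLEEF_ENABLE;
 * either one is sufficient to switch the hook on. */
#define SIMDE_MATH_SLEEF_ENABLE
#include <sleef.h>
#include <stdio.h>
#include "simde-math.h"

int main(void)
{
	/* With SLEEF enabled, simde_math_cosf resolves to one of the
	 * Sleef_cosf_* entry points instead of the libm/builtin fallback. */
	printf("cos(1) = %f\n", simde_math_cosf(1.0f));
	return 0;
}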
+#if !defined(SIMDE_MATH_SLEEF_DISABLE)
30
+#if defined(__SLEEF_H__)
31
+#define SIMDE_MATH_SLEEF_ENABLE
32
+#endif
33
+#endif
34
+
35
+#if defined(SIMDE_MATH_SLEEF_ENABLE) && !defined(__SLEEF_H__)
36
+HEDLEY_DIAGNOSTIC_PUSH
37
+SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_
38
+#include <sleef.h>
39
+HEDLEY_DIAGNOSTIC_POP
40
+#endif
41
+
42
+#if defined(SIMDE_MATH_SLEEF_ENABLE) && defined(__SLEEF_H__)
43
+#if defined(SLEEF_VERSION_MAJOR)
44
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch)              \
45
+   (HEDLEY_VERSION_ENCODE(SLEEF_VERSION_MAJOR, SLEEF_VERSION_MINOR, \
46
+                  SLEEF_VERSION_PATCHLEVEL) >=              \
47
+    HEDLEY_VERSION_ENCODE(major, minor, patch))
48
+#else
49
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) \
50
+   (HEDLEY_VERSION_ENCODE(3, 0, 0) >=                  \
51
+    HEDLEY_VERSION_ENCODE(major, minor, patch))
52
+#endif
53
+#else
54
+#define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (0)
55
+#endif
56
+
57
 #if defined(__has_builtin)
58
 #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func)
59
 #elif HEDLEY_INTEL_VERSION_CHECK(13, 0, 0) || \
60
@@ -82,11 +134,35 @@
61
 #endif
62
 #endif
63
 
64
-#if !defined(__cplusplus)
65
-/* If this is a problem we *might* be able to avoid including
66
-   * <complex.h> on some compilers (gcc, clang, and others which
67
-   * implement builtins like __builtin_cexpf).  If you don't have
68
-   * a <complex.h> please file an issue and we'll take a look. */
69
+/* Try to avoid including <complex> since it pulls in a *lot* of code. */
70
+#if HEDLEY_HAS_BUILTIN(__builtin_creal) ||   \
71
+   HEDLEY_GCC_VERSION_CHECK(4, 7, 0) || \
72
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
73
+HEDLEY_DIAGNOSTIC_PUSH
74
+SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_
75
+typedef __complex__ float simde_cfloat32;
76
+typedef __complex__ double simde_cfloat64;
77
+HEDLEY_DIAGNOSTIC_POP
78
+#define SIMDE_MATH_CMPLX(x, y)           \
79
+   (HEDLEY_STATIC_CAST(double, x) + \
80
+    HEDLEY_STATIC_CAST(double, y) * (__extension__ 1.0j))
81
+#define SIMDE_MATH_CMPLXF(x, y)         \
82
+   (HEDLEY_STATIC_CAST(float, x) + \
83
+    HEDLEY_STATIC_CAST(float, y) * (__extension__ 1.0fj))
84
+
85
+#if !defined(simde_math_crealf)
86
+#define simde_math_crealf(z) __builtin_crealf(z)
87
+#endif
88
+#if !defined(simde_math_creal)
89
+#define simde_math_creal(z) __builtin_creal(z)
90
+#endif
91
+#if !defined(simde_math_cimagf)
92
+#define simde_math_cimagf(z) __builtin_cimagf(z)
93
+#endif
94
+#if !defined(simde_math_cimag)
95
+#define simde_math_cimag(z) __builtin_cimag(z)
96
+#endif
97
+#elif !defined(__cplusplus)
98
 #include <complex.h>
99
 
100
 #if !defined(HEDLEY_MSVC_VERSION)
101
@@ -96,20 +172,14 @@
102
 typedef _Fcomplex simde_cfloat32;
103
 typedef _Dcomplex simde_cfloat64;
104
 #endif
105
-#if HEDLEY_HAS_BUILTIN(__builtin_complex) || \
106
-   HEDLEY_GCC_VERSION_CHECK(4, 7, 0) || \
107
-   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
108
-#define SIMDE_MATH_CMPLX(x, y) __builtin_complex((double)(x), (double)(y))
109
-#define SIMDE_MATH_CMPLXF(x, y) __builtin_complex((float)(x), (float)(y))
110
-#elif defined(HEDLEY_MSVC_VERSION)
111
+
112
+#if defined(HEDLEY_MSVC_VERSION)
113
 #define SIMDE_MATH_CMPLX(x, y) ((simde_cfloat64){(x), (y)})
114
 #define SIMDE_MATH_CMPLXF(x, y) ((simde_cfloat32){(x), (y)})
115
 #elif defined(CMPLX) && defined(CMPLXF)
116
 #define SIMDE_MATH_CMPLX(x, y) CMPLX(x, y)
117
 #define SIMDE_MATH_CMPLXF(x, y) CMPLXF(x, y)
118
 #else
119
-/* CMPLX / CMPLXF are in C99, but these seem to be necessary in
120
-     * some compilers that aren't even MSVC. */
121
 #define SIMDE_MATH_CMPLX(x, y) \
122
    (HEDLEY_STATIC_CAST(double, x) + HEDLEY_STATIC_CAST(double, y) * I)
123
 #define SIMDE_MATH_CMPLXF(x, y) \
124
@@ -117,38 +187,18 @@
125
 #endif
126
 
127
 #if !defined(simde_math_creal)
128
-#if SIMDE_MATH_BUILTIN_LIBM(creal)
129
-#define simde_math_creal(z) __builtin_creal(z)
130
-#else
131
 #define simde_math_creal(z) creal(z)
132
 #endif
133
-#endif
134
-
135
 #if !defined(simde_math_crealf)
136
-#if SIMDE_MATH_BUILTIN_LIBM(crealf)
137
-#define simde_math_crealf(z) __builtin_crealf(z)
138
-#else
139
 #define simde_math_crealf(z) crealf(z)
140
 #endif
141
-#endif
142
-
143
 #if !defined(simde_math_cimag)
144
-#if SIMDE_MATH_BUILTIN_LIBM(cimag)
145
-#define simde_math_cimag(z) __builtin_cimag(z)
146
-#else
147
 #define simde_math_cimag(z) cimag(z)
148
 #endif
149
-#endif
150
-
151
 #if !defined(simde_math_cimagf)
152
-#if SIMDE_MATH_BUILTIN_LIBM(cimagf)
153
-#define simde_math_cimagf(z) __builtin_cimagf(z)
154
-#else
155
 #define simde_math_cimagf(z) cimagf(z)
156
 #endif
157
-#endif
158
 #else
159
-
160
 HEDLEY_DIAGNOSTIC_PUSH
161
 #if defined(HEDLEY_MSVC_VERSION)
162
 #pragma warning(disable : 4530)
163
@@ -240,6 +290,26 @@
164
 #endif
165
 #endif
166
 
167
+#if !defined(SIMDE_MATH_PI_OVER_180)
168
+#define SIMDE_MATH_PI_OVER_180 \
169
+   0.0174532925199432957692369076848861271344287188854172545609719144
170
+#endif
171
+
172
+#if !defined(SIMDE_MATH_PI_OVER_180F)
173
+#define SIMDE_MATH_PI_OVER_180F \
174
+   0.0174532925199432957692369076848861271344287188854172545609719144f
175
+#endif
176
+
177
+#if !defined(SIMDE_MATH_180_OVER_PI)
178
+#define SIMDE_MATH_180_OVER_PI \
179
+   57.295779513082320876798154814105170332405472466564321549160243861
180
+#endif
181
+
182
+#if !defined(SIMDE_MATH_180_OVER_PIF)
183
+#define SIMDE_MATH_180_OVER_PIF \
184
+   57.295779513082320876798154814105170332405472466564321549160243861f
185
+#endif
186
+
187
 #if !defined(SIMDE_MATH_FLT_MIN)
188
 #if defined(FLT_MIN)
189
 #define SIMDE_MATH_FLT_MIN FLT_MIN
190
@@ -341,6 +411,36 @@
191
 #endif
192
 #endif
193
 
194
+/*** Manipulation functions ***/
195
+
196
+#if !defined(simde_math_nextafter)
197
+#if (HEDLEY_HAS_BUILTIN(__builtin_nextafter) && \
198
+     !defined(HEDLEY_IBM_VERSION)) ||           \
199
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||    \
200
+   HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||    \
201
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
202
+#define simde_math_nextafter(x, y) __builtin_nextafter(x, y)
203
+#elif defined(SIMDE_MATH_HAVE_CMATH)
204
+#define simde_math_nextafter(x, y) std::nextafter(x, y)
205
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
206
+#define simde_math_nextafter(x, y) nextafter(x, y)
207
+#endif
208
+#endif
209
+
210
+#if !defined(simde_math_nextafterf)
211
+#if (HEDLEY_HAS_BUILTIN(__builtin_nextafterf) && \
212
+     !defined(HEDLEY_IBM_VERSION)) ||            \
213
+   HEDLEY_ARM_VERSION_CHECK(4, 1, 0) ||     \
214
+   HEDLEY_GCC_VERSION_CHECK(3, 4, 0) ||     \
215
+   HEDLEY_INTEL_VERSION_CHECK(13, 0, 0)
216
+#define simde_math_nextafterf(x, y) __builtin_nextafterf(x, y)
217
+#elif defined(SIMDE_MATH_HAVE_CMATH)
218
+#define simde_math_nextafterf(x, y) std::nextafter(x, y)
219
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
220
+#define simde_math_nextafterf(x, y) nextafterf(x, y)
221
+#endif
222
+#endif
223
+
224
 /*** Functions from C99 ***/
225
 
226
 #if !defined(simde_math_abs)
227
@@ -353,13 +453,13 @@
228
 #endif
229
 #endif
230
 
231
-#if !defined(simde_math_absf)
232
-#if SIMDE_MATH_BUILTIN_LIBM(absf)
233
-#define simde_math_absf(v) __builtin_absf(v)
234
+#if !defined(simde_math_fabsf)
235
+#if SIMDE_MATH_BUILTIN_LIBM(fabsf)
236
+#define simde_math_fabsf(v) __builtin_fabsf(v)
237
 #elif defined(SIMDE_MATH_HAVE_CMATH)
238
-#define simde_math_absf(v) std::abs(v)
239
+#define simde_math_fabsf(v) std::abs(v)
240
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
241
-#define simde_math_absf(v) absf(v)
242
+#define simde_math_fabsf(v) fabsf(v)
243
 #endif
244
 #endif
245
 
246
@@ -574,7 +674,13 @@
247
 #endif
248
 
249
 #if !defined(simde_math_cosf)
250
-#if SIMDE_MATH_BUILTIN_LIBM(cosf)
251
+#if defined(SIMDE_MATH_SLEEF_ENABLE)
252
+#if SIMDE_ACCURACY_PREFERENCE < 1
253
+#define simde_math_cosf(v) Sleef_cosf_u35(v)
254
+#else
255
+#define simde_math_cosf(v) Sleef_cosf_u10(v)
256
+#endif
257
+#elif SIMDE_MATH_BUILTIN_LIBM(cosf)
258
 #define simde_math_cosf(v) __builtin_cosf(v)
259
 #elif defined(SIMDE_MATH_HAVE_CMATH)
260
 #define simde_math_cosf(v) std::cos(v)
261
@@ -755,6 +861,46 @@
262
 #endif
263
 #endif
264
 
265
+#if !defined(simde_math_fma)
266
+#if SIMDE_MATH_BUILTIN_LIBM(fma)
267
+#define simde_math_fma(x, y, z) __builtin_fma(x, y, z)
268
+#elif defined(SIMDE_MATH_HAVE_CMATH)
269
+#define simde_math_fma(x, y, z) std::fma(x, y, z)
270
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
271
+#define simde_math_fma(x, y, z) fma(x, y, z)
272
+#endif
273
+#endif
274
+
275
+#if !defined(simde_math_fmaf)
276
+#if SIMDE_MATH_BUILTIN_LIBM(fmaf)
277
+#define simde_math_fmaf(x, y, z) __builtin_fmaf(x, y, z)
278
+#elif defined(SIMDE_MATH_HAVE_CMATH)
279
+#define simde_math_fmaf(x, y, z) std::fma(x, y, z)
280
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
281
+#define simde_math_fmaf(x, y, z) fmaf(x, y, z)
282
+#endif
283
+#endif
284
+
285
+#if !defined(simde_math_fmax)
286
+#if SIMDE_MATH_BUILTIN_LIBM(fmax)
287
+#define simde_math_fmax(x, y) __builtin_fmax(x, y)
288
+#elif defined(SIMDE_MATH_HAVE_CMATH)
289
+#define simde_math_fmax(x, y) std::fmax(x, y)
290
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
291
+#define simde_math_fmax(x, y) fmax(x, y)
292
+#endif
293
+#endif
294
+
295
+#if !defined(simde_math_fmaxf)
296
+#if SIMDE_MATH_BUILTIN_LIBM(fmaxf)
297
+#define simde_math_fmaxf(x, y) __builtin_fmaxf(x, y)
298
+#elif defined(SIMDE_MATH_HAVE_CMATH)
299
+#define simde_math_fmaxf(x, y) std::fmax(x, y)
300
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
301
+#define simde_math_fmaxf(x, y) fmaxf(x, y)
302
+#endif
303
+#endif
304
+
305
 #if !defined(simde_math_hypot)
306
 #if SIMDE_MATH_BUILTIN_LIBM(hypot)
307
 #define simde_math_hypot(y, x) __builtin_hypot(y, x)
308
@@ -875,6 +1021,26 @@
309
 #endif
310
 #endif
311
 
312
+#if !defined(simde_math_modf)
313
+#if SIMDE_MATH_BUILTIN_LIBM(modf)
314
+#define simde_math_modf(x, iptr) __builtin_modf(x, iptr)
315
+#elif defined(SIMDE_MATH_HAVE_CMATH)
316
+#define simde_math_modf(x, iptr) std::modf(x, iptr)
317
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
318
+#define simde_math_modf(x, iptr) modf(x, iptr)
319
+#endif
320
+#endif
321
+
322
+#if !defined(simde_math_modff)
323
+#if SIMDE_MATH_BUILTIN_LIBM(modff)
324
+#define simde_math_modff(x, iptr) __builtin_modff(x, iptr)
325
+#elif defined(SIMDE_MATH_HAVE_CMATH)
326
+#define simde_math_modff(x, iptr) std::modf(x, iptr)
327
+#elif defined(SIMDE_MATH_HAVE_MATH_H)
328
+#define simde_math_modff(x, iptr) modff(x, iptr)
329
+#endif
330
+#endif
331
+
332
 #if !defined(simde_math_nearbyint)
333
 #if SIMDE_MATH_BUILTIN_LIBM(nearbyint)
334
 #define simde_math_nearbyint(v) __builtin_nearbyint(v)
335
@@ -955,6 +1121,44 @@
336
 #endif
337
 #endif
338
 
339
+#if !defined(simde_math_roundeven)
340
+#if HEDLEY_HAS_BUILTIN(__builtin_roundeven) || \
341
+   HEDLEY_GCC_VERSION_CHECK(10, 0, 0)
342
+#define simde_math_roundeven(v) __builtin_roundeven(v)
343
+#elif defined(simde_math_round) && defined(simde_math_fabs)
344
+static HEDLEY_INLINE double simde_math_roundeven(double v)
345
+{
346
+   double rounded = simde_math_round(v);
347
+   double diff = rounded - v;
348
+   if (HEDLEY_UNLIKELY(simde_math_fabs(diff) == 0.5) &&
349
+       (HEDLEY_STATIC_CAST(int64_t, rounded) & 1)) {
350
+       rounded = v - diff;
351
+   }
352
+   return rounded;
353
+}
354
+#define simde_math_roundeven simde_math_roundeven
355
+#endif
356
+#endif
357
+
358
+#if !defined(simde_math_roundevenf)
359
+#if HEDLEY_HAS_BUILTIN(__builtin_roundevenf) || \
360
+   HEDLEY_GCC_VERSION_CHECK(10, 0, 0)
361
+#define simde_math_roundevenf(v) __builtin_roundevenf(v)
362
+#elif defined(simde_math_roundf) && defined(simde_math_fabsf)
363
+static HEDLEY_INLINE float simde_math_roundevenf(float v)
364
+{
365
+   float rounded = simde_math_roundf(v);
366
+   float diff = rounded - v;
367
+   if (HEDLEY_UNLIKELY(simde_math_fabsf(diff) == 0.5f) &&
368
+       (HEDLEY_STATIC_CAST(int32_t, rounded) & 1)) {
369
+       rounded = v - diff;
370
+   }
371
+   return rounded;
372
+}
373
+#define simde_math_roundevenf simde_math_roundevenf
374
+#endif
375
+#endif
376
+
377
 #if !defined(simde_math_sin)
378
 #if SIMDE_MATH_BUILTIN_LIBM(sin)
379
 #define simde_math_sin(v) __builtin_sin(v)
380
@@ -1078,20 +1282,20 @@
381
 /***  Complex functions ***/
382
 
383
 #if !defined(simde_math_cexp)
384
-#if defined(__cplusplus)
385
-#define simde_math_cexp(v) std::cexp(v)
386
-#elif SIMDE_MATH_BUILTIN_LIBM(cexp)
387
+#if SIMDE_MATH_BUILTIN_LIBM(cexp)
388
 #define simde_math_cexp(v) __builtin_cexp(v)
389
+#elif defined(__cplusplus)
390
+#define simde_math_cexp(v) std::cexp(v)
391
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
392
 #define simde_math_cexp(v) cexp(v)
393
 #endif
394
 #endif
395
 
396
 #if !defined(simde_math_cexpf)
397
-#if defined(__cplusplus)
398
-#define simde_math_cexpf(v) std::exp(v)
399
-#elif SIMDE_MATH_BUILTIN_LIBM(cexpf)
400
+#if SIMDE_MATH_BUILTIN_LIBM(cexpf)
401
 #define simde_math_cexpf(v) __builtin_cexpf(v)
402
+#elif defined(__cplusplus)
403
+#define simde_math_cexpf(v) std::exp(v)
404
 #elif defined(SIMDE_MATH_HAVE_MATH_H)
405
 #define simde_math_cexpf(v) cexpf(v)
406
 #endif
407
@@ -1393,22 +1597,262 @@
408
 
409
 static HEDLEY_INLINE double simde_math_rad2deg(double radians)
410
 {
411
-   return radians * (180.0 / SIMDE_MATH_PI);
412
+   return radians * SIMDE_MATH_180_OVER_PI;
413
 }
414
 
415
 static HEDLEY_INLINE float simde_math_rad2degf(float radians)
416
 {
417
-   return radians * (180.0f / SIMDE_MATH_PIF);
418
+   return radians * SIMDE_MATH_180_OVER_PIF;
419
 }
420
 
421
 static HEDLEY_INLINE double simde_math_deg2rad(double degrees)
422
 {
423
-   return degrees * (SIMDE_MATH_PI / 180.0);
424
+   return degrees * SIMDE_MATH_PI_OVER_180;
425
 }
426
 
427
 static HEDLEY_INLINE float simde_math_deg2radf(float degrees)
428
 {
429
-   return degrees * (SIMDE_MATH_PIF / 180.0f);
430
+   return degrees * (SIMDE_MATH_PI_OVER_180F);
431
 }
432
 
433
+/***  Saturated arithmetic ***/
434
+
435
+static HEDLEY_INLINE int8_t simde_math_adds_i8(int8_t a, int8_t b)
436
+{
437
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
438
+   return vqaddb_s8(a, b);
439
+#else
440
+   uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
441
+   uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
442
+   uint8_t r_ = a_ + b_;
443
+
444
+   a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT8_MAX;
445
+   if (HEDLEY_STATIC_CAST(int8_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
446
+       r_ = a_;
447
+   }
448
+
449
+   return HEDLEY_STATIC_CAST(int8_t, r_);
450
+#endif
451
+}
452
+
453
+static HEDLEY_INLINE int16_t simde_math_adds_i16(int16_t a, int16_t b)
454
+{
455
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
456
+   return vqaddh_s16(a, b);
457
+#else
458
+   uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
459
+   uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
460
+   uint16_t r_ = a_ + b_;
461
+
462
+   a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT16_MAX;
463
+   if (HEDLEY_STATIC_CAST(int16_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
464
+       r_ = a_;
465
+   }
466
+
467
+   return HEDLEY_STATIC_CAST(int16_t, r_);
468
+#endif
469
+}
470
+
471
+static HEDLEY_INLINE int32_t simde_math_adds_i32(int32_t a, int32_t b)
472
+{
473
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
474
+   return vqadds_s32(a, b);
475
+#else
476
+   uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
477
+   uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
478
+   uint32_t r_ = a_ + b_;
479
+
480
+   a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT32_MAX;
481
+   if (HEDLEY_STATIC_CAST(int32_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
482
+       r_ = a_;
483
+   }
484
+
485
+   return HEDLEY_STATIC_CAST(int32_t, r_);
486
+#endif
487
+}
488
+
489
+static HEDLEY_INLINE int64_t simde_math_adds_i64(int64_t a, int64_t b)
490
+{
491
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
492
+   return vqaddd_s64(a, b);
493
+#else
494
+   uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
495
+   uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
496
+   uint64_t r_ = a_ + b_;
497
+
498
+   a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT64_MAX;
499
+   if (HEDLEY_STATIC_CAST(int64_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) {
500
+       r_ = a_;
501
+   }
502
+
503
+   return HEDLEY_STATIC_CAST(int64_t, r_);
504
+#endif
505
+}
506
+
507
+static HEDLEY_INLINE uint8_t simde_math_adds_u8(uint8_t a, uint8_t b)
508
+{
509
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
510
+   return vqaddb_u8(a, b);
511
+#else
512
+   uint8_t r = a + b;
513
+   r |= -(r < a);
514
+   return r;
515
+#endif
516
+}
517
+
518
+static HEDLEY_INLINE uint16_t simde_math_adds_u16(uint16_t a, uint16_t b)
519
+{
520
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
521
+   return vqaddh_u16(a, b);
522
+#else
523
+   uint16_t r = a + b;
524
+   r |= -(r < a);
525
+   return r;
526
+#endif
527
+}
528
+
529
+static HEDLEY_INLINE uint32_t simde_math_adds_u32(uint32_t a, uint32_t b)
530
+{
531
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
532
+   return vqadds_u32(a, b);
533
+#else
534
+   uint32_t r = a + b;
535
+   r |= -(r < a);
536
+   return r;
537
+#endif
538
+}
539
+
540
+static HEDLEY_INLINE uint64_t simde_math_adds_u64(uint64_t a, uint64_t b)
541
+{
542
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
543
+   return vqaddd_u64(a, b);
544
+#else
545
+   uint64_t r = a + b;
546
+   r |= -(r < a);
547
+   return r;
548
+#endif
549
+}
550
+
551
+static HEDLEY_INLINE int8_t simde_math_subs_i8(int8_t a, int8_t b)
552
+{
553
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
554
+   return vqsubb_s8(a, b);
555
+#else
556
+   uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a);
557
+   uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b);
558
+   uint8_t r_ = a_ - b_;
559
+
560
+   a_ = (a_ >> 7) + INT8_MAX;
561
+
562
+   if (HEDLEY_STATIC_CAST(int8_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
563
+       r_ = a_;
564
+   }
565
+
566
+   return HEDLEY_STATIC_CAST(int8_t, r_);
567
+#endif
568
+}
569
+
570
+static HEDLEY_INLINE int16_t simde_math_subs_i16(int16_t a, int16_t b)
571
+{
572
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
573
+   return vqsubh_s16(a, b);
574
+#else
575
+   uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a);
576
+   uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b);
577
+   uint16_t r_ = a_ - b_;
578
+
579
+   a_ = (a_ >> 15) + INT16_MAX;
580
+
581
+   if (HEDLEY_STATIC_CAST(int16_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
582
+       r_ = a_;
583
+   }
584
+
585
+   return HEDLEY_STATIC_CAST(int16_t, r_);
586
+#endif
587
+}
588
+
589
+static HEDLEY_INLINE int32_t simde_math_subs_i32(int32_t a, int32_t b)
590
+{
591
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
592
+   return vqsubs_s32(a, b);
593
+#else
594
+   uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a);
595
+   uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b);
596
+   uint32_t r_ = a_ - b_;
597
+
598
+   a_ = (a_ >> 31) + INT32_MAX;
599
+
600
+   if (HEDLEY_STATIC_CAST(int32_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
601
+       r_ = a_;
602
+   }
603
+
604
+   return HEDLEY_STATIC_CAST(int32_t, r_);
605
+#endif
606
+}
607
+
608
+static HEDLEY_INLINE int64_t simde_math_subs_i64(int64_t a, int64_t b)
609
+{
610
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
611
+   return vqsubd_s64(a, b);
612
+#else
613
+   uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a);
614
+   uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b);
615
+   uint64_t r_ = a_ - b_;
616
+
617
+   a_ = (a_ >> 63) + INT64_MAX;
618
+
619
+   if (HEDLEY_STATIC_CAST(int64_t, (a_ ^ b_) & (a_ ^ r_)) < 0) {
620
+       r_ = a_;
621
+   }
622
+
623
+   return HEDLEY_STATIC_CAST(int64_t, r_);
624
+#endif
625
+}
626
+
627
+static HEDLEY_INLINE uint8_t simde_math_subs_u8(uint8_t a, uint8_t b)
628
+{
629
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
630
+   return vqsubb_u8(a, b);
631
+#else
632
+   uint8_t res = a - b;
633
+   res &= -(res <= a);
634
+   return res;
635
+#endif
636
+}
637
+
638
+static HEDLEY_INLINE uint16_t simde_math_subs_u16(uint16_t a, uint16_t b)
639
+{
640
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
641
+   return vqsubh_u16(a, b);
642
+#else
643
+   uint16_t res = a - b;
644
+   res &= -(res <= a);
645
+   return res;
646
+#endif
647
+}
648
+
649
+static HEDLEY_INLINE uint32_t simde_math_subs_u32(uint32_t a, uint32_t b)
650
+{
651
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
652
+   return vqsubs_u32(a, b);
653
+#else
654
+   uint32_t res = a - b;
655
+   res &= -(res <= a);
656
+   return res;
657
+#endif
658
+}
659
+
660
+static HEDLEY_INLINE uint64_t simde_math_subs_u64(uint64_t a, uint64_t b)
661
+{
662
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
663
+   return vqsubd_u64(a, b);
664
+#else
665
+   uint64_t res = a - b;
666
+   res &= -(res <= a);
667
+   return res;
668
+#endif
669
+}
670
+
671
+HEDLEY_DIAGNOSTIC_POP
672
+
673
 #endif /* !defined(SIMDE_MATH_H) */
674
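The scalar saturating helpers added at the end of simde-math.h clamp to the type's limits instead of wrapping on overflow. A small self-check sketch (the input values are illustrative; it only assumes the SIMDe headers are on the include path):

#include <assert.h>
#include <stdint.h>
#include "simde-math.h"

int main(void)
{
	/* Signed saturation clamps at INT8_MAX / INT8_MIN... */
	assert(simde_math_adds_i8(120, 20) == INT8_MAX);
	assert(simde_math_subs_i8(-120, 20) == INT8_MIN);
	/* ...and unsigned saturation clamps at UINT8_MAX / 0. */
	assert(simde_math_adds_u8(250, 10) == UINT8_MAX);
	assert(simde_math_subs_u8(5, 10) == 0);
	return 0;
}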
obs-studio-26.1.1.tar.xz/libobs/util/simde/x86 Added
2
 
1
+(directory)
2
obs-studio-26.1.1.tar.xz/libobs/util/simde/x86/mmx.h Added
2458
 
1
@@ -0,0 +1,2456 @@
2
+/* SPDX-License-Identifier: MIT
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person
5
+ * obtaining a copy of this software and associated documentation
6
+ * files (the "Software"), to deal in the Software without
7
+ * restriction, including without limitation the rights to use, copy,
8
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ * of the Software, and to permit persons to whom the Software is
10
+ * furnished to do so, subject to the following conditions:
11
+ *
12
+ * The above copyright notice and this permission notice shall be
13
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ * SOFTWARE.
23
+ *
24
+ * Copyright:
25
+ *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
+ */
27
+
28
+#if !defined(SIMDE_X86_MMX_H)
29
+#define SIMDE_X86_MMX_H
30
+
31
+#include "../simde-common.h"
32
+
33
+HEDLEY_DIAGNOSTIC_PUSH
34
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
35
+
36
+#if defined(SIMDE_X86_MMX_NATIVE)
37
+#define SIMDE_X86_MMX_USE_NATIVE_TYPE
38
+#elif defined(SIMDE_X86_SSE_NATIVE)
39
+#define SIMDE_X86_MMX_USE_NATIVE_TYPE
40
+#endif
41
+
42
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
43
+#include <mmintrin.h>
44
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
45
+#include <arm_neon.h>
46
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
47
+#include <loongson-mmiintrin.h>
48
+#endif
49
+
50
+#include <stdint.h>
51
+#include <limits.h>
52
+
53
+SIMDE_BEGIN_DECLS_
54
+
55
+typedef union {
56
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
57
+   SIMDE_ALIGN_TO_8 int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
58
+   SIMDE_ALIGN_TO_8 int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
59
+   SIMDE_ALIGN_TO_8 int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
60
+   SIMDE_ALIGN_TO_8 int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
61
+   SIMDE_ALIGN_TO_8 uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
62
+   SIMDE_ALIGN_TO_8 uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
63
+   SIMDE_ALIGN_TO_8 uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
64
+   SIMDE_ALIGN_TO_8 uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
65
+   SIMDE_ALIGN_TO_8 simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
66
+   SIMDE_ALIGN_TO_8 int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
67
+   SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
68
+#else
69
+   SIMDE_ALIGN_TO_8 int8_t i8[8];
70
+   SIMDE_ALIGN_TO_8 int16_t i16[4];
71
+   SIMDE_ALIGN_TO_8 int32_t i32[2];
72
+   SIMDE_ALIGN_TO_8 int64_t i64[1];
73
+   SIMDE_ALIGN_TO_8 uint8_t u8[8];
74
+   SIMDE_ALIGN_TO_8 uint16_t u16[4];
75
+   SIMDE_ALIGN_TO_8 uint32_t u32[2];
76
+   SIMDE_ALIGN_TO_8 uint64_t u64[1];
77
+   SIMDE_ALIGN_TO_8 simde_float32 f32[2];
78
+   SIMDE_ALIGN_TO_8 int_fast32_t i32f[8 / sizeof(int_fast32_t)];
79
+   SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)];
80
+#endif
81
+
82
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
83
+   __m64 n;
84
+#endif
85
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
86
+   int8x8_t neon_i8;
87
+   int16x4_t neon_i16;
88
+   int32x2_t neon_i32;
89
+   int64x1_t neon_i64;
90
+   uint8x8_t neon_u8;
91
+   uint16x4_t neon_u16;
92
+   uint32x2_t neon_u32;
93
+   uint64x1_t neon_u64;
94
+   float32x2_t neon_f32;
95
+#endif
96
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
97
+   int8x8_t mmi_i8;
98
+   int16x4_t mmi_i16;
99
+   int32x2_t mmi_i32;
100
+   int64_t mmi_i64;
101
+   uint8x8_t mmi_u8;
102
+   uint16x4_t mmi_u16;
103
+   uint32x2_t mmi_u32;
104
+   uint64_t mmi_u64;
105
+#endif
106
+} simde__m64_private;
107
+
108
+#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE)
109
+typedef __m64 simde__m64;
110
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
111
+typedef int32x2_t simde__m64;
112
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
113
+typedef int32x2_t simde__m64;
114
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
115
+typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS;
116
+#else
117
+typedef simde__m64_private simde__m64;
118
+#endif
119
+
120
+#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && \
121
+   defined(SIMDE_ENABLE_NATIVE_ALIASES)
122
+#define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES
123
+typedef simde__m64 __m64;
124
+#endif
125
+
126
+HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect");
127
+HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect");
128
+#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
129
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8,
130
+            "simde__m64 is not 8-byte aligned");
131
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8,
132
+            "simde__m64_private is not 8-byte aligned");
133
+#endif
134
+
135
+SIMDE_FUNCTION_ATTRIBUTES
136
+simde__m64 simde__m64_from_private(simde__m64_private v)
137
+{
138
+   simde__m64 r;
139
+   simde_memcpy(&r, &v, sizeof(r));
140
+   return r;
141
+}
142
+
143
+SIMDE_FUNCTION_ATTRIBUTES
144
+simde__m64_private simde__m64_to_private(simde__m64 v)
145
+{
146
+   simde__m64_private r;
147
+   simde_memcpy(&r, &v, sizeof(r));
148
+   return r;
149
+}
150
+
151
+#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, \
152
+                          fragment)                      \
153
+   SIMDE_FUNCTION_ATTRIBUTES                                             \
154
+   simde__##simde_type simde__##simde_type##_from_##isax##_##fragment(   \
155
+       source_type value)                                            \
156
+   {                                                                     \
157
+       simde__##simde_type##_private r_;                             \
158
+       r_.isax##_##fragment = value;                                 \
159
+       return simde__##simde_type##_from_private(r_);                \
160
+   }                                                                     \
161
+                                                                              \
162
+   SIMDE_FUNCTION_ATTRIBUTES                                             \
163
+   source_type simde__##simde_type##_to_##isax##_##fragment(             \
164
+       simde__##simde_type value)                                    \
165
+   {                                                                     \
166
+       simde__##simde_type##_private r_ =                            \
167
+           simde__##simde_type##_to_private(value);              \
168
+       return r_.isax##_##fragment;                                  \
169
+   }
170
+
171
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
172
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8)
173
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16)
174
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32)
175
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64)
176
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8)
177
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16)
178
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32)
179
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64)
180
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32)
181
+#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
182
+
183
+#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
184
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8)
185
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16)
186
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32)
187
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64)
188
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8)
189
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16)
190
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32)
191
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64)
192
+#endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */
193
+
194
+SIMDE_FUNCTION_ATTRIBUTES
195
+simde__m64 simde_mm_add_pi8(simde__m64 a, simde__m64 b)
196
+{
197
+#if defined(SIMDE_X86_MMX_NATIVE)
198
+   return _mm_add_pi8(a, b);
199
+#else
200
+   simde__m64_private r_;
201
+   simde__m64_private a_ = simde__m64_to_private(a);
202
+   simde__m64_private b_ = simde__m64_to_private(b);
203
+
204
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
205
+   r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8);
206
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
207
+   r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8);
208
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
209
+   r_.i8 = a_.i8 + b_.i8;
210
+#else
211
+   SIMDE_VECTORIZE
212
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
213
+       r_.i8[i] = a_.i8[i] + b_.i8[i];
214
+   }
215
+#endif
216
+
217
+   return simde__m64_from_private(r_);
218
+#endif
219
+}
220
+#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b)
221
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
222
+#define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b)
223
+#define _m_paddb(a, b) simde_m_paddb(a, b)
224
+#endif
225
+
226
+SIMDE_FUNCTION_ATTRIBUTES
227
+simde__m64 simde_mm_add_pi16(simde__m64 a, simde__m64 b)
228
+{
229
+#if defined(SIMDE_X86_MMX_NATIVE)
230
+   return _mm_add_pi16(a, b);
231
+#else
232
+   simde__m64_private r_;
233
+   simde__m64_private a_ = simde__m64_to_private(a);
234
+   simde__m64_private b_ = simde__m64_to_private(b);
235
+
236
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
237
+   r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16);
238
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
239
+   r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16);
240
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
241
+   r_.i16 = a_.i16 + b_.i16;
242
+#else
243
+   SIMDE_VECTORIZE
244
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
245
+       r_.i16[i] = a_.i16[i] + b_.i16[i];
246
+   }
247
+#endif
248
+
249
+   return simde__m64_from_private(r_);
250
+#endif
251
+}
252
+#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b)
253
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
254
+#define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b)
255
+#define _m_paddw(a, b) simde_mm_add_pi16(a, b)
256
+#endif
257
+
258
+SIMDE_FUNCTION_ATTRIBUTES
259
+simde__m64 simde_mm_add_pi32(simde__m64 a, simde__m64 b)
260
+{
261
+#if defined(SIMDE_X86_MMX_NATIVE)
262
+   return _mm_add_pi32(a, b);
263
+#else
264
+   simde__m64_private r_;
265
+   simde__m64_private a_ = simde__m64_to_private(a);
266
+   simde__m64_private b_ = simde__m64_to_private(b);
267
+
268
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
269
+   r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32);
270
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
271
+   r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32);
272
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
273
+   r_.i32 = a_.i32 + b_.i32;
274
+#else
275
+   SIMDE_VECTORIZE
276
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
277
+       r_.i32[i] = a_.i32[i] + b_.i32[i];
278
+   }
279
+#endif
280
+
281
+   return simde__m64_from_private(r_);
282
+#endif
283
+}
284
+#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b)
285
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
286
+#define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b)
287
+#define _m_paddd(a, b) simde_mm_add_pi32(a, b)
288
+#endif
289
+
290
+SIMDE_FUNCTION_ATTRIBUTES
291
+simde__m64 simde_mm_adds_pi8(simde__m64 a, simde__m64 b)
292
+{
293
+#if defined(SIMDE_X86_MMX_NATIVE)
294
+   return _mm_adds_pi8(a, b);
295
+#else
296
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
297
+                  b_ = simde__m64_to_private(b);
298
+
299
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
300
+   r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8);
301
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
302
+   r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8);
303
+#else
304
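+   /* Portable fallback: clamp to INT8_MAX/INT8_MIN on signed overflow or underflow */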
+   SIMDE_VECTORIZE
305
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
306
+       if ((((b_.i8[i]) > 0) &&
307
+            ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) {
308
+           r_.i8[i] = INT8_MAX;
309
+       } else if ((((b_.i8[i]) < 0) &&
310
+               ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) {
311
+           r_.i8[i] = INT8_MIN;
312
+       } else {
313
+           r_.i8[i] = (a_.i8[i]) + (b_.i8[i]);
314
+       }
315
+   }
316
+#endif
317
+
318
+   return simde__m64_from_private(r_);
319
+#endif
320
+}
321
+#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b)
322
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
323
+#define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b)
324
+#define _m_paddsb(a, b) simde_mm_adds_pi8(a, b)
325
+#endif
326
+
327
+SIMDE_FUNCTION_ATTRIBUTES
328
+simde__m64 simde_mm_adds_pu8(simde__m64 a, simde__m64 b)
329
+{
330
+#if defined(SIMDE_X86_MMX_NATIVE)
331
+   return _mm_adds_pu8(a, b);
332
+#else
333
+   simde__m64_private r_;
334
+   simde__m64_private a_ = simde__m64_to_private(a);
335
+   simde__m64_private b_ = simde__m64_to_private(b);
336
+
337
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
338
+   r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8);
339
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
340
+   r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8);
341
+#else
342
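+   /* Portable fallback: widen to uint_fast16_t, then clamp the sum to UINT8_MAX */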
+   SIMDE_VECTORIZE
343
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
344
+       const uint_fast16_t x =
345
+           HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) +
346
+           HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]);
347
+       if (x > UINT8_MAX)
348
+           r_.u8[i] = UINT8_MAX;
349
+       else
350
+           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
351
+   }
352
+#endif
353
+
354
+   return simde__m64_from_private(r_);
355
+#endif
356
+}
357
+#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b)
358
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
359
+#define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b)
360
+#define _m_paddusb(a, b) simde_mm_adds_pu8(a, b)
361
+#endif
362
+
363
+SIMDE_FUNCTION_ATTRIBUTES
364
+simde__m64 simde_mm_adds_pi16(simde__m64 a, simde__m64 b)
365
+{
366
+#if defined(SIMDE_X86_MMX_NATIVE)
367
+   return _mm_adds_pi16(a, b);
368
+#else
369
+   simde__m64_private r_;
370
+   simde__m64_private a_ = simde__m64_to_private(a);
371
+   simde__m64_private b_ = simde__m64_to_private(b);
372
+
373
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
374
+   r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16);
375
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
376
+   r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16);
377
+#else
378
+   SIMDE_VECTORIZE
379
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
380
+       if ((((b_.i16[i]) > 0) &&
381
+            ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) {
382
+           r_.i16[i] = INT16_MAX;
383
+       } else if ((((b_.i16[i]) < 0) &&
384
+               ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) {
385
+           r_.i16[i] = SHRT_MIN;
386
+       } else {
387
+           r_.i16[i] = (a_.i16[i]) + (b_.i16[i]);
388
+       }
389
+   }
390
+#endif
391
+
392
+   return simde__m64_from_private(r_);
393
+#endif
394
+}
395
+#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b)
396
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
397
+#define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b)
398
+#define _m_paddsw(a, b) simde_mm_adds_pi16(a, b)
399
+#endif
400
+
401
+SIMDE_FUNCTION_ATTRIBUTES
402
+simde__m64 simde_mm_adds_pu16(simde__m64 a, simde__m64 b)
403
+{
404
+#if defined(SIMDE_X86_MMX_NATIVE)
405
+   return _mm_adds_pu16(a, b);
406
+#else
407
+   simde__m64_private r_;
408
+   simde__m64_private a_ = simde__m64_to_private(a);
409
+   simde__m64_private b_ = simde__m64_to_private(b);
410
+
411
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
412
+   r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16);
413
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
414
+   r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16);
415
+#else
416
+   SIMDE_VECTORIZE
417
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
418
+       const uint32_t x = a_.u16[i] + b_.u16[i];
419
+       if (x > UINT16_MAX)
420
+           r_.u16[i] = UINT16_MAX;
421
+       else
422
+           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
423
+   }
424
+#endif
425
+
426
+   return simde__m64_from_private(r_);
427
+#endif
428
+}
429
+#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b)
430
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
431
+#define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b)
432
+#define _m_paddusw(a, b) simde_mm_adds_pu16(a, b)
433
+#endif
434
+
435
+SIMDE_FUNCTION_ATTRIBUTES
436
+simde__m64 simde_mm_and_si64(simde__m64 a, simde__m64 b)
437
+{
438
+#if defined(SIMDE_X86_MMX_NATIVE)
439
+   return _mm_and_si64(a, b);
440
+#else
441
+   simde__m64_private r_;
442
+   simde__m64_private a_ = simde__m64_to_private(a);
443
+   simde__m64_private b_ = simde__m64_to_private(b);
444
+
445
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
446
+   r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32);
447
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
448
+   r_.i64 = a_.i64 & b_.i64;
449
+#else
450
+   r_.i64[0] = a_.i64[0] & b_.i64[0];
451
+#endif
452
+
453
+   return simde__m64_from_private(r_);
454
+#endif
455
+}
456
+#define simde_m_pand(a, b) simde_mm_and_si64(a, b)
457
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
458
+#define _mm_and_si64(a, b) simde_mm_and_si64(a, b)
459
+#define _m_pand(a, b) simde_mm_and_si64(a, b)
460
+#endif
461
+
462
+SIMDE_FUNCTION_ATTRIBUTES
463
+simde__m64 simde_mm_andnot_si64(simde__m64 a, simde__m64 b)
464
+{
465
+#if defined(SIMDE_X86_MMX_NATIVE)
466
+   return _mm_andnot_si64(a, b);
467
+#else
468
+   simde__m64_private r_;
469
+   simde__m64_private a_ = simde__m64_to_private(a);
470
+   simde__m64_private b_ = simde__m64_to_private(b);
471
+
472
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
473
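+   /* vbic computes (first & ~second), so the operands are swapped to yield ~a & b */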
+   r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32);
474
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
475
+   r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32);
476
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
477
+   r_.i32f = ~a_.i32f & b_.i32f;
478
+#else
479
+   r_.u64[0] = (~(a_.u64[0])) & (b_.u64[0]);
480
+#endif
481
+
482
+   return simde__m64_from_private(r_);
483
+#endif
484
+}
485
+#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b)
486
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
487
+#define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b)
488
+#define _m_pandn(a, b) simde_mm_andnot_si64(a, b)
489
+#endif
490
+
491
+SIMDE_FUNCTION_ATTRIBUTES
492
+simde__m64 simde_mm_cmpeq_pi8(simde__m64 a, simde__m64 b)
493
+{
494
+#if defined(SIMDE_X86_MMX_NATIVE)
495
+   return _mm_cmpeq_pi8(a, b);
496
+#else
497
+   simde__m64_private r_;
498
+   simde__m64_private a_ = simde__m64_to_private(a);
499
+   simde__m64_private b_ = simde__m64_to_private(b);
500
+
501
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
502
+   r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8);
503
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
504
+   r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8);
505
+#else
506
+   SIMDE_VECTORIZE
507
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
508
+       r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
509
+   }
510
+#endif
511
+
512
+   return simde__m64_from_private(r_);
513
+#endif
514
+}
515
+#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
516
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
517
+#define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b)
518
+#define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b)
519
+#endif
520
+
521
+SIMDE_FUNCTION_ATTRIBUTES
522
+simde__m64 simde_mm_cmpeq_pi16(simde__m64 a, simde__m64 b)
523
+{
524
+#if defined(SIMDE_X86_MMX_NATIVE)
525
+   return _mm_cmpeq_pi16(a, b);
526
+#else
527
+   simde__m64_private r_;
528
+   simde__m64_private a_ = simde__m64_to_private(a);
529
+   simde__m64_private b_ = simde__m64_to_private(b);
530
+
531
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
532
+   r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16);
533
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
534
+   r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16);
535
+#else
536
+   SIMDE_VECTORIZE
537
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
538
+       r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
539
+   }
540
+#endif
541
+
542
+   return simde__m64_from_private(r_);
543
+#endif
544
+}
545
+#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
546
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
547
+#define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b)
548
+#define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b)
549
+#endif
550
+
551
+SIMDE_FUNCTION_ATTRIBUTES
552
+simde__m64 simde_mm_cmpeq_pi32(simde__m64 a, simde__m64 b)
553
+{
554
+#if defined(SIMDE_X86_MMX_NATIVE)
555
+   return _mm_cmpeq_pi32(a, b);
556
+#else
557
+   simde__m64_private r_;
558
+   simde__m64_private a_ = simde__m64_to_private(a);
559
+   simde__m64_private b_ = simde__m64_to_private(b);
560
+
561
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
562
+   r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32);
563
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
564
+   r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32);
565
+#else
566
+   SIMDE_VECTORIZE
567
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
568
+       r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
569
+   }
570
+#endif
571
+
572
+   return simde__m64_from_private(r_);
573
+#endif
574
+}
575
+#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
576
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
577
+#define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b)
578
+#define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b)
579
+#endif
580
+
581
+SIMDE_FUNCTION_ATTRIBUTES
582
+simde__m64 simde_mm_cmpgt_pi8(simde__m64 a, simde__m64 b)
583
+{
584
+#if defined(SIMDE_X86_MMX_NATIVE)
585
+   return _mm_cmpgt_pi8(a, b);
586
+#else
587
+   simde__m64_private r_;
588
+   simde__m64_private a_ = simde__m64_to_private(a);
589
+   simde__m64_private b_ = simde__m64_to_private(b);
590
+
591
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
592
+   r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8);
593
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
594
+   r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8);
595
+#else
596
+   SIMDE_VECTORIZE
597
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
598
+       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
599
+   }
600
+#endif
601
+
602
+   return simde__m64_from_private(r_);
603
+#endif
604
+}
605
+#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
606
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
607
+#define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b)
608
+#define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b)
609
+#endif
610
+
611
+SIMDE_FUNCTION_ATTRIBUTES
612
+simde__m64 simde_mm_cmpgt_pi16(simde__m64 a, simde__m64 b)
613
+{
614
+#if defined(SIMDE_X86_MMX_NATIVE)
615
+   return _mm_cmpgt_pi16(a, b);
616
+#else
617
+   simde__m64_private r_;
618
+   simde__m64_private a_ = simde__m64_to_private(a);
619
+   simde__m64_private b_ = simde__m64_to_private(b);
620
+
621
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
622
+   r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16);
623
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
624
+   r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16);
625
+#else
626
+   SIMDE_VECTORIZE
627
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
628
+       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
629
+   }
630
+#endif
631
+
632
+   return simde__m64_from_private(r_);
633
+#endif
634
+}
635
+#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
636
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
637
+#define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b)
638
+#define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b)
639
+#endif
640
+
641
+SIMDE_FUNCTION_ATTRIBUTES
642
+simde__m64 simde_mm_cmpgt_pi32(simde__m64 a, simde__m64 b)
643
+{
644
+#if defined(SIMDE_X86_MMX_NATIVE)
645
+   return _mm_cmpgt_pi32(a, b);
646
+#else
647
+   simde__m64_private r_;
648
+   simde__m64_private a_ = simde__m64_to_private(a);
649
+   simde__m64_private b_ = simde__m64_to_private(b);
650
+
651
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
652
+   r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32);
653
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
654
+   r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32);
655
+#else
656
+   SIMDE_VECTORIZE
657
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
658
+       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
659
+   }
660
+#endif
661
+
662
+   return simde__m64_from_private(r_);
663
+#endif
664
+}
665
+#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
666
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
667
+#define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b)
668
+#define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b)
669
+#endif
670
+
671
+SIMDE_FUNCTION_ATTRIBUTES
672
+int64_t simde_mm_cvtm64_si64(simde__m64 a)
673
+{
674
+#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
675
+   !defined(__PGI)
676
+   return _mm_cvtm64_si64(a);
677
+#else
678
+   simde__m64_private a_ = simde__m64_to_private(a);
679
+
680
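+   /* Older clang versions warn about this NEON lane extraction, so the warning is silenced locally */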
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
681
+   HEDLEY_DIAGNOSTIC_PUSH
682
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
683
+   SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
684
+#pragma clang diagnostic ignored "-Wvector-conversion"
685
+#endif
686
+   return vget_lane_s64(a_.neon_i64, 0);
687
+   HEDLEY_DIAGNOSTIC_POP
688
+#else
689
+   return a_.i64[0];
690
+#endif
691
+#endif
692
+}
693
+#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a)
694
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
695
+#define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a)
696
+#define _m_to_int64(a) simde_mm_cvtm64_si64(a)
697
+#endif
698
+
699
+SIMDE_FUNCTION_ATTRIBUTES
700
+simde__m64 simde_mm_cvtsi32_si64(int32_t a)
701
+{
702
+#if defined(SIMDE_X86_MMX_NATIVE)
703
+   return _mm_cvtsi32_si64(a);
704
+#else
705
+   simde__m64_private r_;
706
+
707
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
708
+   const int32_t av[sizeof(r_.neon_i32) / sizeof(r_.neon_i32[0])] = {a, 0};
709
+   r_.neon_i32 = vld1_s32(av);
710
+#else
711
+   r_.i32[0] = a;
712
+   r_.i32[1] = 0;
713
+#endif
714
+
715
+   return simde__m64_from_private(r_);
716
+#endif
717
+}
718
+#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a)
719
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
720
+#define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a)
721
+#define _m_from_int(a) simde_mm_cvtsi32_si64(a)
722
+#endif
723
+
724
+SIMDE_FUNCTION_ATTRIBUTES
725
+simde__m64 simde_mm_cvtsi64_m64(int64_t a)
726
+{
727
+#if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
728
+   !defined(__PGI)
729
+   return _mm_cvtsi64_m64(a);
730
+#else
731
+   simde__m64_private r_;
732
+
733
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
734
+   r_.neon_i64 = vld1_s64(&a);
735
+#else
736
+   r_.i64[0] = a;
737
+#endif
738
+
739
+   return simde__m64_from_private(r_);
740
+#endif
741
+}
742
+#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a)
743
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
744
+#define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a)
745
+#define _m_from_int64(a) simde_mm_cvtsi64_m64(a)
746
+#endif
747
+
748
+SIMDE_FUNCTION_ATTRIBUTES
749
+int32_t simde_mm_cvtsi64_si32(simde__m64 a)
750
+{
751
+#if defined(SIMDE_X86_MMX_NATIVE)
752
+   return _mm_cvtsi64_si32(a);
753
+#else
754
+   simde__m64_private a_ = simde__m64_to_private(a);
755
+
756
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
757
+   HEDLEY_DIAGNOSTIC_PUSH
758
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
759
+   SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
760
+#pragma clang diagnostic ignored "-Wvector-conversion"
761
+#endif
762
+   return vget_lane_s32(a_.neon_i32, 0);
763
+   HEDLEY_DIAGNOSTIC_POP
764
+#else
765
+   return a_.i32[0];
766
+#endif
767
+#endif
768
+}
769
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
770
+#define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a)
771
+#endif
772
+
773
+SIMDE_FUNCTION_ATTRIBUTES
774
+void simde_mm_empty(void)
775
+{
776
+#if defined(SIMDE_X86_MMX_NATIVE)
777
+   _mm_empty();
778
+#else
779
+   /* noop */
780
+#endif
781
+}
782
+#define simde_m_empty() simde_mm_empty()
783
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
784
+#define _mm_empty() simde_mm_empty()
785
+#define _m_empty() simde_mm_empty()
786
+#endif
787
+
788
+SIMDE_FUNCTION_ATTRIBUTES
789
+simde__m64 simde_mm_madd_pi16(simde__m64 a, simde__m64 b)
790
+{
791
+#if defined(SIMDE_X86_MMX_NATIVE)
792
+   return _mm_madd_pi16(a, b);
793
+#else
794
+   simde__m64_private r_;
795
+   simde__m64_private a_ = simde__m64_to_private(a);
796
+   simde__m64_private b_ = simde__m64_to_private(b);
797
+
798
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
799
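+   /* Widening 16x16->32 multiply, then pairwise add of adjacent 32-bit products */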
+   int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16);
800
+   r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1));
801
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
802
+   r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16);
803
+#else
804
+   SIMDE_VECTORIZE
805
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i += 2) {
806
+       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
807
+               (a_.i16[i + 1] * b_.i16[i + 1]);
808
+   }
809
+#endif
810
+
811
+   return simde__m64_from_private(r_);
812
+#endif
813
+}
814
+#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
815
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
816
+#define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b)
817
+#define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b)
818
+#endif
819
+
820
+SIMDE_FUNCTION_ATTRIBUTES
821
+simde__m64 simde_mm_mulhi_pi16(simde__m64 a, simde__m64 b)
822
+{
823
+#if defined(SIMDE_X86_MMX_NATIVE)
824
+   return _mm_mulhi_pi16(a, b);
825
+#else
826
+   simde__m64_private r_;
827
+   simde__m64_private a_ = simde__m64_to_private(a);
828
+   simde__m64_private b_ = simde__m64_to_private(b);
829
+
830
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
831
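+   /* Widening multiply, then keep the high 16 bits of each 32-bit product */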
+   const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
832
+   const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16);
833
+   const uint16x4_t t3 = vmovn_u32(t2);
834
+   r_.neon_u16 = t3;
835
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
836
+   r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16);
837
+#else
838
+   SIMDE_VECTORIZE
839
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
840
+       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t,
841
+                          ((a_.i16[i] * b_.i16[i]) >> 16));
842
+   }
843
+#endif
844
+
845
+   return simde__m64_from_private(r_);
846
+#endif
847
+}
848
+#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
849
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
850
+#define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b)
851
+#define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b)
852
+#endif
853
+
854
+SIMDE_FUNCTION_ATTRIBUTES
855
+simde__m64 simde_mm_mullo_pi16(simde__m64 a, simde__m64 b)
856
+{
857
+#if defined(SIMDE_X86_MMX_NATIVE)
858
+   return _mm_mullo_pi16(a, b);
859
+#else
860
+   simde__m64_private r_;
861
+   simde__m64_private a_ = simde__m64_to_private(a);
862
+   simde__m64_private b_ = simde__m64_to_private(b);
863
+
864
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
865
+   const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16);
866
+   const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1));
867
+   r_.neon_u16 = t2;
868
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
869
+   r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16);
870
+#else
871
+   SIMDE_VECTORIZE
872
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
873
+       r_.i16[i] = HEDLEY_STATIC_CAST(
874
+           int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff));
875
+   }
876
+#endif
877
+
878
+   return simde__m64_from_private(r_);
879
+#endif
880
+}
881
+#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
882
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
883
+#define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b)
884
+#define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b)
885
+#endif
886
+
887
+SIMDE_FUNCTION_ATTRIBUTES
888
+simde__m64 simde_mm_or_si64(simde__m64 a, simde__m64 b)
889
+{
890
+#if defined(SIMDE_X86_MMX_NATIVE)
891
+   return _mm_or_si64(a, b);
892
+#else
893
+   simde__m64_private r_;
894
+   simde__m64_private a_ = simde__m64_to_private(a);
895
+   simde__m64_private b_ = simde__m64_to_private(b);
896
+
897
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
898
+   r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32);
899
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
900
+   r_.i64 = a_.i64 | b_.i64;
901
+#else
902
+   r_.i64[0] = a_.i64[0] | b_.i64[0];
903
+#endif
904
+
905
+   return simde__m64_from_private(r_);
906
+#endif
907
+}
908
+#define simde_m_por(a, b) simde_mm_or_si64(a, b)
909
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
910
+#define _mm_or_si64(a, b) simde_mm_or_si64(a, b)
911
+#define _m_por(a, b) simde_mm_or_si64(a, b)
912
+#endif
913
+
914
+SIMDE_FUNCTION_ATTRIBUTES
915
+simde__m64 simde_mm_packs_pi16(simde__m64 a, simde__m64 b)
916
+{
917
+#if defined(SIMDE_X86_MMX_NATIVE)
918
+   return _mm_packs_pi16(a, b);
919
+#else
920
+   simde__m64_private r_;
921
+   simde__m64_private a_ = simde__m64_to_private(a);
922
+   simde__m64_private b_ = simde__m64_to_private(b);
923
+
924
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
925
+   r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16));
926
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
927
+   r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16);
928
+#else
929
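+   /* Saturate each 16-bit lane to [INT8_MIN, INT8_MAX]; a fills lanes 0-3, b fills lanes 4-7 */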
+   SIMDE_VECTORIZE
930
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
931
+       if (a_.i16[i] < INT8_MIN) {
932
+           r_.i8[i] = INT8_MIN;
933
+       } else if (a_.i16[i] > INT8_MAX) {
934
+           r_.i8[i] = INT8_MAX;
935
+       } else {
936
+           r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]);
937
+       }
938
+   }
939
+
940
+   SIMDE_VECTORIZE
941
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
942
+       if (b_.i16[i] < INT8_MIN) {
943
+           r_.i8[i + 4] = INT8_MIN;
944
+       } else if (b_.i16[i] > INT8_MAX) {
945
+           r_.i8[i + 4] = INT8_MAX;
946
+       } else {
947
+           r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]);
948
+       }
949
+   }
950
+#endif
951
+
952
+   return simde__m64_from_private(r_);
953
+#endif
954
+}
955
+#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b)
956
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
957
+#define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b)
958
+#define _m_packsswb(a, b) simde_mm_packs_pi16(a, b)
959
+#endif
960
+
961
+SIMDE_FUNCTION_ATTRIBUTES
962
+simde__m64 simde_mm_packs_pi32(simde__m64 a, simde__m64 b)
963
+{
964
+#if defined(SIMDE_X86_MMX_NATIVE)
965
+   return _mm_packs_pi32(a, b);
966
+#else
967
+   simde__m64_private r_;
968
+   simde__m64_private a_ = simde__m64_to_private(a);
969
+   simde__m64_private b_ = simde__m64_to_private(b);
970
+
971
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
972
+   r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
973
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
974
+   r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32);
975
+#else
976
+   SIMDE_VECTORIZE
977
+   for (size_t i = 0; i < (8 / sizeof(a_.i32[0])); i++) {
978
+       if (a_.i32[i] < SHRT_MIN) {
979
+           r_.i16[i] = SHRT_MIN;
980
+       } else if (a_.i32[i] > INT16_MAX) {
981
+           r_.i16[i] = INT16_MAX;
982
+       } else {
983
+           r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]);
984
+       }
985
+   }
986
+
987
+   SIMDE_VECTORIZE
988
+   for (size_t i = 0; i < (8 / sizeof(b_.i32[0])); i++) {
989
+       if (b_.i32[i] < SHRT_MIN) {
990
+           r_.i16[i + 2] = SHRT_MIN;
991
+       } else if (b_.i32[i] > INT16_MAX) {
992
+           r_.i16[i + 2] = INT16_MAX;
993
+       } else {
994
+           r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]);
995
+       }
996
+   }
997
+#endif
998
+
999
+   return simde__m64_from_private(r_);
1000
+#endif
1001
+}
1002
+#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b)
1003
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1004
+#define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b)
1005
+#define _m_packssdw(a, b) simde_mm_packs_pi32(a, b)
1006
+#endif
1007
+
1008
+SIMDE_FUNCTION_ATTRIBUTES
1009
+simde__m64 simde_mm_packs_pu16(simde__m64 a, simde__m64 b)
1010
+{
1011
+#if defined(SIMDE_X86_MMX_NATIVE)
1012
+   return _mm_packs_pu16(a, b);
1013
+#else
1014
+   simde__m64_private r_;
1015
+   simde__m64_private a_ = simde__m64_to_private(a);
1016
+   simde__m64_private b_ = simde__m64_to_private(b);
1017
+
1018
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1019
+   const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16);
1020
+
1021
+   /* Set elements which are < 0 to 0 */
1022
+   const int16x8_t t2 =
1023
+       vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1)));
1024
+
1025
+   /* Vector with all s16 elements set to UINT8_MAX */
1026
+   const int16x8_t vmax =
1027
+       vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX));
1028
+
1029
+   /* Elements which are within the acceptable range */
1030
+   const int16x8_t le_max =
1031
+       vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax)));
1032
+   const int16x8_t gt_max =
1033
+       vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax)));
1034
+
1035
+   /* Final values as 16-bit integers */
1036
+   const int16x8_t values = vorrq_s16(le_max, gt_max);
1037
+
1038
+   r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values));
1039
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1040
+   r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16);
1041
+#else
1042
+   SIMDE_VECTORIZE
1043
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1044
+       if (a_.i16[i] > UINT8_MAX) {
1045
+           r_.u8[i] = UINT8_MAX;
1046
+       } else if (a_.i16[i] < 0) {
1047
+           r_.u8[i] = 0;
1048
+       } else {
1049
+           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]);
1050
+       }
1051
+   }
1052
+
1053
+   SIMDE_VECTORIZE
1054
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1055
+       if (b_.i16[i] > UINT8_MAX) {
1056
+           r_.u8[i + 4] = UINT8_MAX;
1057
+       } else if (b_.i16[i] < 0) {
1058
+           r_.u8[i + 4] = 0;
1059
+       } else {
1060
+           r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]);
1061
+       }
1062
+   }
1063
+#endif
1064
+
1065
+   return simde__m64_from_private(r_);
1066
+#endif
1067
+}
1068
+#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b)
1069
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1070
+#define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b)
1071
+#define _m_packuswb(a, b) simde_mm_packs_pu16(a, b)
1072
+#endif
1073
+
1074
+SIMDE_FUNCTION_ATTRIBUTES
1075
+simde__m64 simde_mm_set_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
1076
+               int8_t e3, int8_t e2, int8_t e1, int8_t e0)
1077
+{
1078
+#if defined(SIMDE_X86_MMX_NATIVE)
1079
+   return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
1080
+#else
1081
+   simde__m64_private r_;
1082
+
1083
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1084
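+   /* _mm_set_* arguments are given most-significant first, so the load array lists e0 first */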
+   const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = {e0, e1, e2, e3,
1085
+                               e4, e5, e6, e7};
1086
+   r_.neon_i8 = vld1_s8(v);
1087
+#else
1088
+   r_.i8[0] = e0;
1089
+   r_.i8[1] = e1;
1090
+   r_.i8[2] = e2;
1091
+   r_.i8[3] = e3;
1092
+   r_.i8[4] = e4;
1093
+   r_.i8[5] = e5;
1094
+   r_.i8[6] = e6;
1095
+   r_.i8[7] = e7;
1096
+#endif
1097
+
1098
+   return simde__m64_from_private(r_);
1099
+#endif
1100
+}
1101
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1102
+#define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
1103
+   simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
1104
+#endif
1105
+
1106
+SIMDE_FUNCTION_ATTRIBUTES
1107
+simde__m64 simde_x_mm_set_pu8(uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4,
1108
+                 uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
1109
+{
1110
+   simde__m64_private r_;
1111
+
1112
+#if defined(SIMDE_X86_MMX_NATIVE)
1113
+   r_.n = _mm_set_pi8(
1114
+       HEDLEY_STATIC_CAST(int8_t, e7), HEDLEY_STATIC_CAST(int8_t, e6),
1115
+       HEDLEY_STATIC_CAST(int8_t, e5), HEDLEY_STATIC_CAST(int8_t, e4),
1116
+       HEDLEY_STATIC_CAST(int8_t, e3), HEDLEY_STATIC_CAST(int8_t, e2),
1117
+       HEDLEY_STATIC_CAST(int8_t, e1), HEDLEY_STATIC_CAST(int8_t, e0));
1118
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1119
+   const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = {e0, e1, e2, e3,
1120
+                                e4, e5, e6, e7};
1121
+   r_.neon_u8 = vld1_u8(v);
1122
+#else
1123
+   r_.u8[0] = e0;
1124
+   r_.u8[1] = e1;
1125
+   r_.u8[2] = e2;
1126
+   r_.u8[3] = e3;
1127
+   r_.u8[4] = e4;
1128
+   r_.u8[5] = e5;
1129
+   r_.u8[6] = e6;
1130
+   r_.u8[7] = e7;
1131
+#endif
1132
+
1133
+   return simde__m64_from_private(r_);
1134
+}
1135
+
1136
+SIMDE_FUNCTION_ATTRIBUTES
1137
+simde__m64 simde_mm_set_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
1138
+{
1139
+#if defined(SIMDE_X86_MMX_NATIVE)
1140
+   return _mm_set_pi16(e3, e2, e1, e0);
1141
+#else
1142
+   simde__m64_private r_;
1143
+
1144
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1145
+   const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = {e0, e1, e2, e3};
1146
+   r_.neon_i16 = vld1_s16(v);
1147
+#else
1148
+   r_.i16[0] = e0;
1149
+   r_.i16[1] = e1;
1150
+   r_.i16[2] = e2;
1151
+   r_.i16[3] = e3;
1152
+#endif
1153
+
1154
+   return simde__m64_from_private(r_);
1155
+#endif
1156
+}
1157
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1158
+#define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0)
1159
+#endif
1160
+
1161
+SIMDE_FUNCTION_ATTRIBUTES
1162
+simde__m64 simde_x_mm_set_pu16(uint16_t e3, uint16_t e2, uint16_t e1,
1163
+                  uint16_t e0)
1164
+{
1165
+   simde__m64_private r_;
1166
+
1167
+#if defined(SIMDE_X86_MMX_NATIVE)
1168
+   r_.n = _mm_set_pi16(HEDLEY_STATIC_CAST(int16_t, e3),
1169
+               HEDLEY_STATIC_CAST(int16_t, e2),
1170
+               HEDLEY_STATIC_CAST(int16_t, e1),
1171
+               HEDLEY_STATIC_CAST(int16_t, e0));
1172
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1173
+   const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = {e0, e1, e2, e3};
1174
+   r_.neon_u16 = vld1_u16(v);
1175
+#else
1176
+   r_.u16[0] = e0;
1177
+   r_.u16[1] = e1;
1178
+   r_.u16[2] = e2;
1179
+   r_.u16[3] = e3;
1180
+#endif
1181
+
1182
+   return simde__m64_from_private(r_);
1183
+}
1184
+
1185
+SIMDE_FUNCTION_ATTRIBUTES
1186
+simde__m64 simde_x_mm_set_pu32(uint32_t e1, uint32_t e0)
1187
+{
1188
+   simde__m64_private r_;
1189
+
1190
+#if defined(SIMDE_X86_MMX_NATIVE)
1191
+   r_.n = _mm_set_pi32(HEDLEY_STATIC_CAST(int32_t, e1),
1192
+               HEDLEY_STATIC_CAST(int32_t, e0));
1193
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1194
+   const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = {e0, e1};
1195
+   r_.neon_u32 = vld1_u32(v);
1196
+#else
1197
+   r_.u32[0] = e0;
1198
+   r_.u32[1] = e1;
1199
+#endif
1200
+
1201
+   return simde__m64_from_private(r_);
1202
+}
1203
+
1204
+SIMDE_FUNCTION_ATTRIBUTES
1205
+simde__m64 simde_mm_set_pi32(int32_t e1, int32_t e0)
1206
+{
1207
+   simde__m64_private r_;
1208
+
1209
+#if defined(SIMDE_X86_MMX_NATIVE)
1210
+   r_.n = _mm_set_pi32(e1, e0);
1211
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1212
+   const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = {e0, e1};
1213
+   r_.neon_i32 = vld1_s32(v);
1214
+#else
1215
+   r_.i32[0] = e0;
1216
+   r_.i32[1] = e1;
1217
+#endif
1218
+
1219
+   return simde__m64_from_private(r_);
1220
+}
1221
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1222
+#define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0)
1223
+#endif
1224
+
1225
+SIMDE_FUNCTION_ATTRIBUTES
1226
+simde__m64 simde_x_mm_set_pi64(int64_t e0)
1227
+{
1228
+   simde__m64_private r_;
1229
+
1230
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1231
+   const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = {e0};
1232
+   r_.neon_i64 = vld1_s64(v);
1233
+#else
1234
+   r_.i64[0] = e0;
1235
+#endif
1236
+
1237
+   return simde__m64_from_private(r_);
1238
+}
1239
+
1240
+SIMDE_FUNCTION_ATTRIBUTES
1241
+simde__m64 simde_x_mm_set_f32x2(simde_float32 e1, simde_float32 e0)
1242
+{
1243
+   simde__m64_private r_;
1244
+
1245
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1246
+   const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = {e0, e1};
1247
+   r_.neon_f32 = vld1_f32(v);
1248
+#else
1249
+   r_.f32[0] = e0;
1250
+   r_.f32[1] = e1;
1251
+#endif
1252
+
1253
+   return simde__m64_from_private(r_);
1254
+}
1255
+
1256
+SIMDE_FUNCTION_ATTRIBUTES
1257
+simde__m64 simde_mm_set1_pi8(int8_t a)
1258
+{
1259
+#if defined(SIMDE_X86_MMX_NATIVE)
1260
+   return _mm_set1_pi8(a);
1261
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1262
+   simde__m64_private r_;
1263
+   r_.neon_i8 = vmov_n_s8(a);
1264
+   return simde__m64_from_private(r_);
1265
+#else
1266
+   return simde_mm_set_pi8(a, a, a, a, a, a, a, a);
1267
+#endif
1268
+}
1269
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1270
+#define _mm_set1_pi8(a) simde_mm_set1_pi8(a)
1271
+#endif
1272
+
1273
+SIMDE_FUNCTION_ATTRIBUTES
1274
+simde__m64 simde_mm_set1_pi16(int16_t a)
1275
+{
1276
+#if defined(SIMDE_X86_MMX_NATIVE)
1277
+   return _mm_set1_pi16(a);
1278
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1279
+   simde__m64_private r_;
1280
+   r_.neon_i16 = vmov_n_s16(a);
1281
+   return simde__m64_from_private(r_);
1282
+#else
1283
+   return simde_mm_set_pi16(a, a, a, a);
1284
+#endif
1285
+}
1286
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1287
+#define _mm_set1_pi16(a) simde_mm_set1_pi16(a)
1288
+#endif
1289
+
1290
+SIMDE_FUNCTION_ATTRIBUTES
1291
+simde__m64 simde_mm_set1_pi32(int32_t a)
1292
+{
1293
+#if defined(SIMDE_X86_MMX_NATIVE)
1294
+   return _mm_set1_pi32(a);
1295
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1296
+   simde__m64_private r_;
1297
+   r_.neon_i32 = vmov_n_s32(a);
1298
+   return simde__m64_from_private(r_);
1299
+#else
1300
+   return simde_mm_set_pi32(a, a);
1301
+#endif
1302
+}
1303
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1304
+#define _mm_set1_pi32(a) simde_mm_set1_pi32(a)
1305
+#endif
1306
+
1307
+SIMDE_FUNCTION_ATTRIBUTES
1308
+simde__m64 simde_mm_setr_pi8(int8_t e7, int8_t e6, int8_t e5, int8_t e4,
1309
+                int8_t e3, int8_t e2, int8_t e1, int8_t e0)
1310
+{
1311
+#if defined(SIMDE_X86_MMX_NATIVE)
1312
+   return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0);
1313
+#else
1314
+   return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7);
1315
+#endif
1316
+}
1317
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1318
+#define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) \
1319
+   simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0)
1320
+#endif
1321
+
1322
+SIMDE_FUNCTION_ATTRIBUTES
1323
+simde__m64 simde_mm_setr_pi16(int16_t e3, int16_t e2, int16_t e1, int16_t e0)
1324
+{
1325
+#if defined(SIMDE_X86_MMX_NATIVE)
1326
+   return _mm_setr_pi16(e3, e2, e1, e0);
1327
+#else
1328
+   return simde_mm_set_pi16(e0, e1, e2, e3);
1329
+#endif
1330
+}
1331
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1332
+#define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0)
1333
+#endif
1334
+
1335
+SIMDE_FUNCTION_ATTRIBUTES
1336
+simde__m64 simde_mm_setr_pi32(int32_t e1, int32_t e0)
1337
+{
1338
+#if defined(SIMDE_X86_MMX_NATIVE)
1339
+   return _mm_setr_pi32(e1, e0);
1340
+#else
1341
+   return simde_mm_set_pi32(e0, e1);
1342
+#endif
1343
+}
1344
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1345
+#define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0)
1346
+#endif
1347
+
1348
+SIMDE_FUNCTION_ATTRIBUTES
1349
+simde__m64 simde_mm_setzero_si64(void)
1350
+{
1351
+#if defined(SIMDE_X86_MMX_NATIVE)
1352
+   return _mm_setzero_si64();
1353
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1354
+   simde__m64_private r_;
1355
+   r_.neon_u32 = vmov_n_u32(0);
1356
+   return simde__m64_from_private(r_);
1357
+#else
1358
+   return simde_mm_set_pi32(0, 0);
1359
+#endif
1360
+}
1361
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1362
+#define _mm_setzero_si64() simde_mm_setzero_si64()
1363
+#endif
1364
+
1365
+SIMDE_FUNCTION_ATTRIBUTES
1366
+simde__m64 simde_x_mm_load_si64(const void *mem_addr)
1367
+{
1368
+   simde__m64 r;
1369
+   simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64),
1370
+            sizeof(r));
1371
+   return r;
1372
+}
1373
+
1374
+SIMDE_FUNCTION_ATTRIBUTES
1375
+simde__m64 simde_x_mm_loadu_si64(const void *mem_addr)
1376
+{
1377
+   simde__m64 r;
1378
+   simde_memcpy(&r, mem_addr, sizeof(r));
1379
+   return r;
1380
+}
1381
+
1382
+SIMDE_FUNCTION_ATTRIBUTES
1383
+void simde_x_mm_store_si64(void *mem_addr, simde__m64 value)
1384
+{
1385
+   simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value,
1386
+            sizeof(value));
1387
+}
1388
+
1389
+SIMDE_FUNCTION_ATTRIBUTES
1390
+void simde_x_mm_storeu_si64(void *mem_addr, simde__m64 value)
1391
+{
1392
+   simde_memcpy(mem_addr, &value, sizeof(value));
1393
+}
1394
+
1395
+SIMDE_FUNCTION_ATTRIBUTES
1396
+simde__m64 simde_x_mm_setone_si64(void)
1397
+{
1398
+   return simde_mm_set1_pi32(~INT32_C(0));
1399
+}
1400
+
1401
+SIMDE_FUNCTION_ATTRIBUTES
1402
+simde__m64 simde_mm_sll_pi16(simde__m64 a, simde__m64 count)
1403
+{
1404
+#if defined(SIMDE_X86_MMX_NATIVE)
1405
+   return _mm_sll_pi16(a, count);
1406
+#else
1407
+   simde__m64_private r_;
1408
+   simde__m64_private a_ = simde__m64_to_private(a);
1409
+   simde__m64_private count_ = simde__m64_to_private(count);
1410
+
1411
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1412
+   HEDLEY_DIAGNOSTIC_PUSH
1413
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
1414
+   SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
1415
+#pragma clang diagnostic ignored "-Wvector-conversion"
1416
+#endif
1417
+   r_.neon_i16 =
1418
+       vshl_s16(a_.neon_i16,
1419
+            vmov_n_s16(HEDLEY_STATIC_CAST(
1420
+                int16_t, vget_lane_u64(count_.neon_u64, 0))));
1421
+   HEDLEY_DIAGNOSTIC_POP
1422
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
1423
+   defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
1424
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
1425
+       return simde_mm_setzero_si64();
1426
+
1427
+   r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]);
1428
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1429
+   r_.i16 = a_.i16 << count_.u64[0];
1430
+#else
1431
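+   /* Shift counts above 15 clear the whole result, matching MMX PSLLW behaviour */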
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1432
+       simde_memset(&r_, 0, sizeof(r_));
1433
+       return simde__m64_from_private(r_);
1434
+   }
1435
+
1436
+   SIMDE_VECTORIZE
1437
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1438
+       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
1439
+                          a_.u16[i] << count_.u64[0]);
1440
+   }
1441
+#endif
1442
+
1443
+   return simde__m64_from_private(r_);
1444
+#endif
1445
+}
1446
+#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count)
1447
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1448
+#define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count)
1449
+#define _m_psllw(a, count) simde_mm_sll_pi16(a, count)
1450
+#endif
1451
+
1452
+SIMDE_FUNCTION_ATTRIBUTES
1453
+simde__m64 simde_mm_sll_pi32(simde__m64 a, simde__m64 count)
1454
+{
1455
+#if defined(SIMDE_X86_MMX_NATIVE)
1456
+   return _mm_sll_pi32(a, count);
1457
+#else
1458
+   simde__m64_private r_;
1459
+   simde__m64_private a_ = simde__m64_to_private(a);
1460
+   simde__m64_private count_ = simde__m64_to_private(count);
1461
+
1462
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1463
+   HEDLEY_DIAGNOSTIC_PUSH
1464
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
1465
+   SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
1466
+#pragma clang diagnostic ignored "-Wvector-conversion"
1467
+#endif
1468
+   r_.neon_i32 =
1469
+       vshl_s32(a_.neon_i32,
1470
+            vmov_n_s32(HEDLEY_STATIC_CAST(
1471
+                int32_t, vget_lane_u64(count_.neon_u64, 0))));
1472
+   HEDLEY_DIAGNOSTIC_POP
1473
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1474
+   r_.i32 = a_.i32 << count_.u64[0];
1475
+#else
1476
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1477
+       simde_memset(&r_, 0, sizeof(r_));
1478
+       return simde__m64_from_private(r_);
1479
+   }
1480
+
1481
+   SIMDE_VECTORIZE
1482
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1483
+       r_.u32[i] = a_.u32[i] << count_.u64[0];
1484
+   }
1485
+#endif
1486
+
1487
+   return simde__m64_from_private(r_);
1488
+#endif
1489
+}
1490
+#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count)
1491
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1492
+#define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count)
1493
+#define _m_pslld(a, count) simde_mm_sll_pi32(a, count)
1494
+#endif
1495
+
1496
+SIMDE_FUNCTION_ATTRIBUTES
1497
+simde__m64 simde_mm_slli_pi16(simde__m64 a, int count)
1498
+{
1499
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1500
+   return _mm_slli_pi16(a, count);
1501
+#else
1502
+   simde__m64_private r_;
1503
+   simde__m64_private a_ = simde__m64_to_private(a);
1504
+
1505
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
1506
+   defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
1507
+   if (HEDLEY_UNLIKELY(count > 15))
1508
+       return simde_mm_setzero_si64();
1509
+
1510
+   r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count);
1511
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1512
+   r_.i16 = a_.i16 << count;
1513
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1515
+   r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t)count));
1516
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1517
+   r_.mmi_i16 = psllh_s(a_.mmi_i16, b_.mmi_i16);
1518
+#else
1519
+   SIMDE_VECTORIZE
1520
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1521
+       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count);
1522
+   }
1523
+#endif
1524
+
1525
+   return simde__m64_from_private(r_);
1526
+#endif
1527
+}
1528
+#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1529
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1530
+#define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count)
1531
+#define _m_psllwi(a, count) simde_mm_slli_pi16(a, count)
1532
+#endif
1533
+
1534
+SIMDE_FUNCTION_ATTRIBUTES
1535
+simde__m64 simde_mm_slli_pi32(simde__m64 a, int count)
1536
+{
1537
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1538
+   return _mm_slli_pi32(a, count);
1539
+#else
1540
+   simde__m64_private r_;
1541
+   simde__m64_private a_ = simde__m64_to_private(a);
1542
+
1543
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1544
+   r_.i32 = a_.i32 << count;
1545
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1546
+   r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t)count));
1547
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1548
+   r_.mmi_i32 = psllw_s(a_.mmi_i32, b_.mmi_i32);
1549
+#else
1550
+   SIMDE_VECTORIZE
1551
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1552
+       r_.u32[i] = a_.u32[i] << count;
1553
+   }
1554
+#endif
1555
+
1556
+   return simde__m64_from_private(r_);
1557
+#endif
1558
+}
1559
+#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b)
1560
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1561
+#define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count)
1562
+#define _m_pslldi(a, count) simde_mm_slli_pi32(a, count)
1563
+#endif
1564
+
1565
+SIMDE_FUNCTION_ATTRIBUTES
1566
+simde__m64 simde_mm_slli_si64(simde__m64 a, int count)
1567
+{
1568
+#if defined(SIMDE_X86_MMX_NATIVE)
1569
+   return _mm_slli_si64(a, count);
1570
+#else
1571
+   simde__m64_private r_;
1572
+   simde__m64_private a_ = simde__m64_to_private(a);
1573
+
1574
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1575
+   r_.i64 = a_.i64 << count;
1576
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1577
+   r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t)count));
1578
+#else
1579
+   r_.u64[0] = a_.u64[0] << count;
1580
+#endif
1581
+
1582
+   return simde__m64_from_private(r_);
1583
+#endif
1584
+}
1585
+#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count)
1586
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1587
+#define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count)
1588
+#define _m_psllqi(a, count) simde_mm_slli_si64(a, count)
1589
+#endif
1590
+
1591
+SIMDE_FUNCTION_ATTRIBUTES
1592
+simde__m64 simde_mm_sll_si64(simde__m64 a, simde__m64 count)
1593
+{
1594
+#if defined(SIMDE_X86_MMX_NATIVE)
1595
+   return _mm_sll_si64(a, count);
1596
+#else
1597
+   simde__m64_private r_;
1598
+   simde__m64_private a_ = simde__m64_to_private(a);
1599
+   simde__m64_private count_ = simde__m64_to_private(count);
1600
+
1601
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1602
+   r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64);
1603
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1604
+   r_.i64 = a_.i64 << count_.i64;
1605
+#else
1606
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1607
+       simde_memset(&r_, 0, sizeof(r_));
1608
+       return simde__m64_from_private(r_);
1609
+   }
1610
+
1611
+   r_.u64[0] = a_.u64[0] << count_.u64[0];
1612
+#endif
1613
+
1614
+   return simde__m64_from_private(r_);
1615
+#endif
1616
+}
1617
+#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count)
1618
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1619
+#define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count)
1620
+#define _m_psllq(a, count) simde_mm_sll_si64(a, count)
1621
+#endif
1622
+
1623
+SIMDE_FUNCTION_ATTRIBUTES
1624
+simde__m64 simde_mm_srl_pi16(simde__m64 a, simde__m64 count)
1625
+{
1626
+#if defined(SIMDE_X86_MMX_NATIVE)
1627
+   return _mm_srl_pi16(a, count);
1628
+#else
1629
+   simde__m64_private r_;
1630
+   simde__m64_private a_ = simde__m64_to_private(a);
1631
+   simde__m64_private count_ = simde__m64_to_private(count);
1632
+
1633
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
1634
+   defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT)
1635
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 15))
1636
+       return simde_mm_setzero_si64();
1637
+
1638
+   r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]);
1639
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1640
+   r_.u16 = a_.u16 >> count_.u64[0];
1641
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1642
+   r_.neon_u16 = vshl_u16(
1643
+       a_.neon_u16,
1644
+       vmov_n_s16(-((int16_t)vget_lane_u64(count_.neon_u64, 0))));
1645
+#else
1646
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) {
1647
+       simde_memset(&r_, 0, sizeof(r_));
1648
+       return simde__m64_from_private(r_);
1649
+   }
1650
+
1651
+   SIMDE_VECTORIZE
1652
+   for (size_t i = 0; i < sizeof(r_.u16) / sizeof(r_.u16[0]); i++) {
1653
+       r_.u16[i] = a_.u16[i] >> count_.u64[0];
1654
+   }
1655
+#endif
1656
+
1657
+   return simde__m64_from_private(r_);
1658
+#endif
1659
+}
1660
+#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1661
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1662
+#define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count)
1663
+#define _m_psrlw(a, count) simde_mm_srl_pi16(a, count)
1664
+#endif
1665
+
1666
+SIMDE_FUNCTION_ATTRIBUTES
1667
+simde__m64 simde_mm_srl_pi32(simde__m64 a, simde__m64 count)
1668
+{
1669
+#if defined(SIMDE_X86_MMX_NATIVE)
1670
+   return _mm_srl_pi32(a, count);
1671
+#else
1672
+   simde__m64_private r_;
1673
+   simde__m64_private a_ = simde__m64_to_private(a);
1674
+   simde__m64_private count_ = simde__m64_to_private(count);
1675
+
1676
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1677
+   r_.u32 = a_.u32 >> count_.u64[0];
1678
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1679
+   r_.neon_u32 = vshl_u32(
1680
+       a_.neon_u32,
1681
+       vmov_n_s32(-((int32_t)vget_lane_u64(count_.neon_u64, 0))));
1682
+#else
1683
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) {
1684
+       simde_memset(&r_, 0, sizeof(r_));
1685
+       return simde__m64_from_private(r_);
1686
+   }
1687
+
1688
+   SIMDE_VECTORIZE
1689
+   for (size_t i = 0; i < sizeof(r_.u32) / sizeof(r_.u32[0]); i++) {
1690
+       r_.u32[i] = a_.u32[i] >> count_.u64[0];
1691
+   }
1692
+#endif
1693
+
1694
+   return simde__m64_from_private(r_);
1695
+#endif
1696
+}
1697
+#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count)
1698
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1699
+#define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count)
1700
+#define _m_psrld(a, count) simde_mm_srl_pi32(a, count)
1701
+#endif
1702
+
1703
+SIMDE_FUNCTION_ATTRIBUTES
1704
+simde__m64 simde_mm_srli_pi16(simde__m64 a, int count)
1705
+{
1706
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1707
+   return _mm_srli_pi16(a, count);
1708
+#else
1709
+   simde__m64_private r_;
1710
+   simde__m64_private a_ = simde__m64_to_private(a);
1711
+
1712
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1713
+   r_.u16 = a_.u16 >> count;
1714
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1715
+   r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t)count)));
1716
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1717
+   r_.mmi_i16 = psrlh_s(a_.mmi_i16, b_.mmi_i16);
1718
+#else
1719
+   SIMDE_VECTORIZE
1720
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1721
+       r_.u16[i] = a_.u16[i] >> count;
1722
+   }
1723
+#endif
1724
+
1725
+   return simde__m64_from_private(r_);
1726
+#endif
1727
+}
1728
+#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1729
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1730
+#define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count)
1731
+#define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count)
1732
+#endif
1733
+
1734
+SIMDE_FUNCTION_ATTRIBUTES
1735
+simde__m64 simde_mm_srli_pi32(simde__m64 a, int count)
1736
+{
1737
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1738
+   return _mm_srli_pi32(a, count);
1739
+#else
1740
+   simde__m64_private r_;
1741
+   simde__m64_private a_ = simde__m64_to_private(a);
1742
+
1743
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1744
+   r_.u32 = a_.u32 >> count;
1745
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1746
+   r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t)count)));
1747
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1748
+   r_.mmi_i32 = psrlw_s(a_.mmi_i32, b_.mmi_i32);
1749
+#else
1750
+   SIMDE_VECTORIZE
1751
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1752
+       r_.u32[i] = a_.u32[i] >> count;
1753
+   }
1754
+#endif
1755
+
1756
+   return simde__m64_from_private(r_);
1757
+#endif
1758
+}
1759
+#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1760
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1761
+#define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count)
1762
+#define _m_psrldi(a, count) simde_mm_srli_pi32(a, count)
1763
+#endif
1764
+
1765
+SIMDE_FUNCTION_ATTRIBUTES
1766
+simde__m64 simde_mm_srli_si64(simde__m64 a, int count)
1767
+{
1768
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1769
+   return _mm_srli_si64(a, count);
1770
+#else
1771
+   simde__m64_private r_;
1772
+   simde__m64_private a_ = simde__m64_to_private(a);
1773
+
1774
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1775
+   r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count));
1776
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1777
+   r_.u64 = a_.u64 >> count;
1778
+#else
1779
+   r_.u64[0] = a_.u64[0] >> count;
1780
+#endif
1781
+
1782
+   return simde__m64_from_private(r_);
1783
+#endif
1784
+}
1785
+#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1786
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1787
+#define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count)
1788
+#define _m_psrlqi(a, count) simde_mm_srli_si64(a, count)
1789
+#endif
1790
+
1791
+SIMDE_FUNCTION_ATTRIBUTES
1792
+simde__m64 simde_mm_srl_si64(simde__m64 a, simde__m64 count)
1793
+{
1794
+#if defined(SIMDE_X86_MMX_NATIVE)
1795
+   return _mm_srl_si64(a, count);
1796
+#else
1797
+   simde__m64_private r_;
1798
+   simde__m64_private a_ = simde__m64_to_private(a);
1799
+   simde__m64_private count_ = simde__m64_to_private(count);
1800
+
1801
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1802
+   r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64));
1803
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1804
+   r_.u64 = a_.u64 >> count_.u64;
1805
+#else
1806
+   if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) {
1807
+       simde_memset(&r_, 0, sizeof(r_));
1808
+       return simde__m64_from_private(r_);
1809
+   }
1810
+
1811
+   r_.u64[0] = a_.u64[0] >> count_.u64[0];
1812
+#endif
1813
+
1814
+   return simde__m64_from_private(r_);
1815
+#endif
1816
+}
1817
+#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count)
1818
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1819
+#define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count)
1820
+#define _m_psrlq(a, count) simde_mm_srl_si64(a, count)
1821
+#endif
1822
+
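Illustrative scalar sketch (editorial, not part of the vendored file): the guard above exists because a C right shift by 64 or more is undefined behaviour, whereas the MMX psrlq instruction simply yields zero for any count above 63. Assuming <stdint.h>:

static uint64_t simde_example_srl64(uint64_t value, uint64_t count)
{
	/* saturate explicitly instead of relying on an out-of-range shift */
	return (count > 63) ? 0 : (value >> count);
}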
1823
+SIMDE_FUNCTION_ATTRIBUTES
1824
+simde__m64 simde_mm_srai_pi16(simde__m64 a, int count)
1825
+{
1826
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1827
+   return _mm_srai_pi16(a, count);
1828
+#else
1829
+   simde__m64_private r_;
1830
+   simde__m64_private a_ = simde__m64_to_private(a);
1831
+
1832
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1833
+   r_.i16 = a_.i16 >> (count & 0xff);
1834
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1835
+   r_.neon_i16 = vshl_s16(a_.neon_i16,
1836
+                  vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count)));
1837
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1838
+   r_.mmi_i16 = psrah_s(a_.mmi_i16, count);
1839
+#else
1840
+   SIMDE_VECTORIZE
1841
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1842
+       r_.i16[i] = a_.i16[i] >> (count & 0xff);
1843
+   }
1844
+#endif
1845
+
1846
+   return simde__m64_from_private(r_);
1847
+#endif
1848
+}
1849
+#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1850
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1851
+#define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count)
1852
+#define _m_psrawi(a, count) simde_mm_srai_pi16(a, count)
1853
+#endif
1854
+
1855
+SIMDE_FUNCTION_ATTRIBUTES
1856
+simde__m64 simde_mm_srai_pi32(simde__m64 a, int count)
1857
+{
1858
+#if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI)
1859
+   return _mm_srai_pi32(a, count);
1860
+#else
1861
+   simde__m64_private r_;
1862
+   simde__m64_private a_ = simde__m64_to_private(a);
1863
+
1864
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1865
+   r_.i32 = a_.i32 >> (count & 0xff);
1866
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1867
+   r_.neon_i32 = vshl_s32(a_.neon_i32,
1868
+                  vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count)));
1869
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1870
+   r_.mmi_i32 = psraw_s(a_.mmi_i32, count);
1871
+#else
1872
+   SIMDE_VECTORIZE
1873
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1874
+       r_.i32[i] = a_.i32[i] >> (count & 0xff);
1875
+   }
1876
+#endif
1877
+
1878
+   return simde__m64_from_private(r_);
1879
+#endif
1880
+}
1881
+#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count)
1882
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1883
+#define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count)
1884
+#define _m_psradi(a, count) simde_mm_srai_pi32(a, count)
1885
+#endif
1886
+
1887
+SIMDE_FUNCTION_ATTRIBUTES
1888
+simde__m64 simde_mm_sra_pi16(simde__m64 a, simde__m64 count)
1889
+{
1890
+#if defined(SIMDE_X86_MMX_NATIVE)
1891
+   return _mm_sra_pi16(a, count);
1892
+#else
1893
+   simde__m64_private r_;
1894
+   simde__m64_private a_ = simde__m64_to_private(a);
1895
+   simde__m64_private count_ = simde__m64_to_private(count);
1896
+   const int cnt = HEDLEY_STATIC_CAST(
1897
+       int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
1898
+
1899
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1900
+   r_.i16 = a_.i16 >> cnt;
1901
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1902
+   r_.neon_i16 =
1903
+       vshl_s16(a_.neon_i16,
1904
+            vmov_n_s16(-HEDLEY_STATIC_CAST(
1905
+                int16_t, vget_lane_u64(count_.neon_u64, 0))));
1906
+#else
1907
+   SIMDE_VECTORIZE
1908
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1909
+       r_.i16[i] = a_.i16[i] >> cnt;
1910
+   }
1911
+#endif
1912
+
1913
+   return simde__m64_from_private(r_);
1914
+#endif
1915
+}
1916
+#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count)
1917
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1918
+#define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count)
1919
+#define _m_psraw(a, count) simde_mm_sra_pi16(a, count)
1920
+#endif
1921
+
1922
+SIMDE_FUNCTION_ATTRIBUTES
1923
+simde__m64 simde_mm_sra_pi32(simde__m64 a, simde__m64 count)
1924
+{
1925
+#if defined(SIMDE_X86_MMX_NATIVE)
1926
+   return _mm_sra_pi32(a, count);
1927
+#else
1928
+   simde__m64_private r_;
1929
+   simde__m64_private a_ = simde__m64_to_private(a);
1930
+   simde__m64_private count_ = simde__m64_to_private(count);
1931
+   const int32_t cnt =
1932
+       (count_.u64[0] > 31)
1933
+           ? 31
1934
+           : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]);
1935
+
1936
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
1937
+   r_.i32 = a_.i32 >> cnt;
1938
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1939
+   r_.neon_i32 =
1940
+       vshl_s32(a_.neon_i32,
1941
+            vmov_n_s32(-HEDLEY_STATIC_CAST(
1942
+                int32_t, vget_lane_u64(count_.neon_u64, 0))));
1943
+#else
1944
+   SIMDE_VECTORIZE
1945
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1946
+       r_.i32[i] = a_.i32[i] >> cnt;
1947
+   }
1948
+#endif
1949
+
1950
+   return simde__m64_from_private(r_);
1951
+#endif
1952
+}
1953
+#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b)
1954
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1955
+#define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count)
1956
+#define _m_psrad(a, count) simde_mm_sra_pi32(a, count)
1957
+#endif
1958
+
1959
+SIMDE_FUNCTION_ATTRIBUTES
1960
+simde__m64 simde_mm_sub_pi8(simde__m64 a, simde__m64 b)
1961
+{
1962
+#if defined(SIMDE_X86_MMX_NATIVE)
1963
+   return _mm_sub_pi8(a, b);
1964
+#else
1965
+   simde__m64_private r_;
1966
+   simde__m64_private a_ = simde__m64_to_private(a);
1967
+   simde__m64_private b_ = simde__m64_to_private(b);
1968
+
1969
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1970
+   r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8);
1971
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
1972
+   r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8);
1973
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1974
+   r_.i8 = a_.i8 - b_.i8;
1975
+#else
1976
+   SIMDE_VECTORIZE
1977
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1978
+       r_.i8[i] = a_.i8[i] - b_.i8[i];
1979
+   }
1980
+#endif
1981
+
1982
+   return simde__m64_from_private(r_);
1983
+#endif
1984
+}
1985
+#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b)
1986
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
1987
+#define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b)
1988
+#define _m_psubb(a, b) simde_mm_sub_pi8(a, b)
1989
+#endif
1990
+
1991
+SIMDE_FUNCTION_ATTRIBUTES
1992
+simde__m64 simde_mm_sub_pi16(simde__m64 a, simde__m64 b)
1993
+{
1994
+#if defined(SIMDE_X86_MMX_NATIVE)
1995
+   return _mm_sub_pi16(a, b);
1996
+#else
1997
+   simde__m64_private r_;
1998
+   simde__m64_private a_ = simde__m64_to_private(a);
1999
+   simde__m64_private b_ = simde__m64_to_private(b);
2000
+
2001
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2002
+   r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16);
2003
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2004
+   r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16);
2005
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2006
+   r_.i16 = a_.i16 - b_.i16;
2007
+#else
2008
+   SIMDE_VECTORIZE
2009
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2010
+       r_.i16[i] = a_.i16[i] - b_.i16[i];
2011
+   }
2012
+#endif
2013
+
2014
+   return simde__m64_from_private(r_);
2015
+#endif
2016
+}
2017
+#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b)
2018
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2019
+#define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b)
2020
+#define _m_psubw(a, b) simde_mm_sub_pi16(a, b)
2021
+#endif
2022
+
2023
+SIMDE_FUNCTION_ATTRIBUTES
2024
+simde__m64 simde_mm_sub_pi32(simde__m64 a, simde__m64 b)
2025
+{
2026
+#if defined(SIMDE_X86_MMX_NATIVE)
2027
+   return _mm_sub_pi32(a, b);
2028
+#else
2029
+   simde__m64_private r_;
2030
+   simde__m64_private a_ = simde__m64_to_private(a);
2031
+   simde__m64_private b_ = simde__m64_to_private(b);
2032
+
2033
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2034
+   r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32);
2035
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2036
+   r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32);
2037
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2038
+   r_.i32 = a_.i32 - b_.i32;
2039
+#else
2040
+   SIMDE_VECTORIZE
2041
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2042
+       r_.i32[i] = a_.i32[i] - b_.i32[i];
2043
+   }
2044
+#endif
2045
+
2046
+   return simde__m64_from_private(r_);
2047
+#endif
2048
+}
2049
+#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b)
2050
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2051
+#define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b)
2052
+#define _m_psubd(a, b) simde_mm_sub_pi32(a, b)
2053
+#endif
2054
+
2055
+SIMDE_FUNCTION_ATTRIBUTES
2056
+simde__m64 simde_mm_subs_pi8(simde__m64 a, simde__m64 b)
2057
+{
2058
+#if defined(SIMDE_X86_MMX_NATIVE)
2059
+   return _mm_subs_pi8(a, b);
2060
+#else
2061
+   simde__m64_private r_;
2062
+   simde__m64_private a_ = simde__m64_to_private(a);
2063
+   simde__m64_private b_ = simde__m64_to_private(b);
2064
+
2065
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2066
+   r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8);
2067
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2068
+   r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8);
2069
+#else
2070
+   SIMDE_VECTORIZE
2071
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
2072
+       if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
2073
+           r_.i8[i] = INT8_MIN;
2074
+       } else if ((b_.i8[i]) < 0 &&
2075
+              (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
2076
+           r_.i8[i] = INT8_MAX;
2077
+       } else {
2078
+           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
2079
+       }
2080
+   }
2081
+#endif
2082
+
2083
+   return simde__m64_from_private(r_);
2084
+#endif
2085
+}
2086
+#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b)
2087
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2088
+#define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b)
2089
+#define _m_psubsb(a, b) simde_mm_subs_pi8(a, b)
2090
+#endif
2091
+
2092
+SIMDE_FUNCTION_ATTRIBUTES
2093
+simde__m64 simde_mm_subs_pu8(simde__m64 a, simde__m64 b)
2094
+{
2095
+#if defined(SIMDE_X86_MMX_NATIVE)
2096
+   return _mm_subs_pu8(a, b);
2097
+#else
2098
+   simde__m64_private r_;
2099
+   simde__m64_private a_ = simde__m64_to_private(a);
2100
+   simde__m64_private b_ = simde__m64_to_private(b);
2101
+
2102
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2103
+   r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8);
2104
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2105
+   r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8);
2106
+#else
2107
+   SIMDE_VECTORIZE
2108
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2109
+       const int32_t x = a_.u8[i] - b_.u8[i];
2110
+       if (x < 0) {
2111
+           r_.u8[i] = 0;
2112
+       } else if (x > UINT8_MAX) {
2113
+           r_.u8[i] = UINT8_MAX;
2114
+       } else {
2115
+           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
2116
+       }
2117
+   }
2118
+#endif
2119
+
2120
+   return simde__m64_from_private(r_);
2121
+#endif
2122
+}
2123
+#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b)
2124
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2125
+#define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b)
2126
+#define _m_psubusb(a, b) simde_mm_subs_pu8(a, b)
2127
+#endif
2128
+
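Illustrative scalar sketch (editorial, not part of the vendored file): the portable loop above implements unsigned saturating subtraction, so results below zero clamp to zero instead of wrapping. Assuming <stdint.h>:

static uint8_t simde_example_subs_u8(uint8_t a, uint8_t b)
{
	return (a > b) ? (uint8_t)(a - b) : 0;
}
/* e.g. simde_example_subs_u8(10, 250) == 0, while plain (uint8_t)(10 - 250) wraps to 16 */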
2129
+SIMDE_FUNCTION_ATTRIBUTES
2130
+simde__m64 simde_mm_subs_pi16(simde__m64 a, simde__m64 b)
2131
+{
2132
+#if defined(SIMDE_X86_MMX_NATIVE)
2133
+   return _mm_subs_pi16(a, b);
2134
+#else
2135
+   simde__m64_private r_;
2136
+   simde__m64_private a_ = simde__m64_to_private(a);
2137
+   simde__m64_private b_ = simde__m64_to_private(b);
2138
+
2139
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2140
+   r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16);
2141
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2142
+   r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16);
2143
+#else
2144
+   SIMDE_VECTORIZE
2145
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2146
+       if (((b_.i16[i]) > 0 && (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
2147
+           r_.i16[i] = INT16_MIN;
2148
+       } else if ((b_.i16[i]) < 0 &&
2149
+              (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
2150
+           r_.i16[i] = INT16_MAX;
2151
+       } else {
2152
+           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
2153
+       }
2154
+   }
2155
+#endif
2156
+
2157
+   return simde__m64_from_private(r_);
2158
+#endif
2159
+}
2160
+#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b)
2161
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2162
+#define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b)
2163
+#define _m_psubsw(a, b) simde_mm_subs_pi16(a, b)
2164
+#endif
2165
+
2166
+SIMDE_FUNCTION_ATTRIBUTES
2167
+simde__m64 simde_mm_subs_pu16(simde__m64 a, simde__m64 b)
2168
+{
2169
+#if defined(SIMDE_X86_MMX_NATIVE)
2170
+   return _mm_subs_pu16(a, b);
2171
+#else
2172
+   simde__m64_private r_;
2173
+   simde__m64_private a_ = simde__m64_to_private(a);
2174
+   simde__m64_private b_ = simde__m64_to_private(b);
2175
+
2176
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2177
+   r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16);
2178
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2179
+   r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16);
2180
+#else
2181
+   SIMDE_VECTORIZE
2182
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
2183
+       const int x = a_.u16[i] - b_.u16[i];
2184
+       if (x < 0) {
2185
+           r_.u16[i] = 0;
2186
+       } else if (x > UINT16_MAX) {
2187
+           r_.u16[i] = UINT16_MAX;
2188
+       } else {
2189
+           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
2190
+       }
2191
+   }
2192
+#endif
2193
+
2194
+   return simde__m64_from_private(r_);
2195
+#endif
2196
+}
2197
+#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b)
2198
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2199
+#define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b)
2200
+#define _m_psubusw(a, b) simde_mm_subs_pu16(a, b)
2201
+#endif
2202
+
2203
+SIMDE_FUNCTION_ATTRIBUTES
2204
+simde__m64 simde_mm_unpackhi_pi8(simde__m64 a, simde__m64 b)
2205
+{
2206
+#if defined(SIMDE_X86_MMX_NATIVE)
2207
+   return _mm_unpackhi_pi8(a, b);
2208
+#else
2209
+   simde__m64_private r_;
2210
+   simde__m64_private a_ = simde__m64_to_private(a);
2211
+   simde__m64_private b_ = simde__m64_to_private(b);
2212
+
2213
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2214
+   r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8);
2215
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2216
+   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14,
2217
+                     7, 15);
2218
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2219
+   r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8);
2220
+#else
2221
+   r_.i8[0] = a_.i8[4];
2222
+   r_.i8[1] = b_.i8[4];
2223
+   r_.i8[2] = a_.i8[5];
2224
+   r_.i8[3] = b_.i8[5];
2225
+   r_.i8[4] = a_.i8[6];
2226
+   r_.i8[5] = b_.i8[6];
2227
+   r_.i8[6] = a_.i8[7];
2228
+   r_.i8[7] = b_.i8[7];
2229
+#endif
2230
+
2231
+   return simde__m64_from_private(r_);
2232
+#endif
2233
+}
2234
+#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2235
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2236
+#define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b)
2237
+#define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b)
2238
+#endif
2239
+
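/* Worked example (editorial, not from the upstream file): with byte lanes
 * a = {a0..a7} and b = {b0..b7}, the function above returns
 * {a4, b4, a5, b5, a6, b6, a7, b7}; simde_mm_unpacklo_pi8 further down
 * interleaves lanes 0..3 in the same way. */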
2240
+SIMDE_FUNCTION_ATTRIBUTES
2241
+simde__m64 simde_mm_unpackhi_pi16(simde__m64 a, simde__m64 b)
2242
+{
2243
+#if defined(SIMDE_X86_MMX_NATIVE)
2244
+   return _mm_unpackhi_pi16(a, b);
2245
+#else
2246
+   simde__m64_private r_;
2247
+   simde__m64_private a_ = simde__m64_to_private(a);
2248
+   simde__m64_private b_ = simde__m64_to_private(b);
2249
+
2250
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2251
+   r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16);
2252
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2253
+   r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16);
2254
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2255
+   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7);
2256
+#else
2257
+   r_.i16[0] = a_.i16[2];
2258
+   r_.i16[1] = b_.i16[2];
2259
+   r_.i16[2] = a_.i16[3];
2260
+   r_.i16[3] = b_.i16[3];
2261
+#endif
2262
+
2263
+   return simde__m64_from_private(r_);
2264
+#endif
2265
+}
2266
+#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2267
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2268
+#define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b)
2269
+#define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b)
2270
+#endif
2271
+
2272
+SIMDE_FUNCTION_ATTRIBUTES
2273
+simde__m64 simde_mm_unpackhi_pi32(simde__m64 a, simde__m64 b)
2274
+{
2275
+#if defined(SIMDE_X86_MMX_NATIVE)
2276
+   return _mm_unpackhi_pi32(a, b);
2277
+#else
2278
+   simde__m64_private r_;
2279
+   simde__m64_private a_ = simde__m64_to_private(a);
2280
+   simde__m64_private b_ = simde__m64_to_private(b);
2281
+
2282
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2283
+   r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32);
2284
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2285
+   r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32);
2286
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2287
+   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3);
2288
+#else
2289
+   r_.i32[0] = a_.i32[1];
2290
+   r_.i32[1] = b_.i32[1];
2291
+#endif
2292
+
2293
+   return simde__m64_from_private(r_);
2294
+#endif
2295
+}
2296
+#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2297
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2298
+#define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b)
2299
+#define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b)
2300
+#endif
2301
+
2302
+SIMDE_FUNCTION_ATTRIBUTES
2303
+simde__m64 simde_mm_unpacklo_pi8(simde__m64 a, simde__m64 b)
2304
+{
2305
+#if defined(SIMDE_X86_MMX_NATIVE)
2306
+   return _mm_unpacklo_pi8(a, b);
2307
+#else
2308
+   simde__m64_private r_;
2309
+   simde__m64_private a_ = simde__m64_to_private(a);
2310
+   simde__m64_private b_ = simde__m64_to_private(b);
2311
+
2312
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2313
+   r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8);
2314
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2315
+   r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8);
2316
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2317
+   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3,
2318
+                     11);
2319
+#else
2320
+   r_.i8[0] = a_.i8[0];
2321
+   r_.i8[1] = b_.i8[0];
2322
+   r_.i8[2] = a_.i8[1];
2323
+   r_.i8[3] = b_.i8[1];
2324
+   r_.i8[4] = a_.i8[2];
2325
+   r_.i8[5] = b_.i8[2];
2326
+   r_.i8[6] = a_.i8[3];
2327
+   r_.i8[7] = b_.i8[3];
2328
+#endif
2329
+
2330
+   return simde__m64_from_private(r_);
2331
+#endif
2332
+}
2333
+#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2334
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2335
+#define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b)
2336
+#define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b)
2337
+#endif
2338
+
2339
+SIMDE_FUNCTION_ATTRIBUTES
2340
+simde__m64 simde_mm_unpacklo_pi16(simde__m64 a, simde__m64 b)
2341
+{
2342
+#if defined(SIMDE_X86_MMX_NATIVE)
2343
+   return _mm_unpacklo_pi16(a, b);
2344
+#else
2345
+   simde__m64_private r_;
2346
+   simde__m64_private a_ = simde__m64_to_private(a);
2347
+   simde__m64_private b_ = simde__m64_to_private(b);
2348
+
2349
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2350
+   r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16);
2351
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2352
+   r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16);
2353
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2354
+   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5);
2355
+#else
2356
+   r_.i16[0] = a_.i16[0];
2357
+   r_.i16[1] = b_.i16[0];
2358
+   r_.i16[2] = a_.i16[1];
2359
+   r_.i16[3] = b_.i16[1];
2360
+#endif
2361
+
2362
+   return simde__m64_from_private(r_);
2363
+#endif
2364
+}
2365
+#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2366
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2367
+#define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b)
2368
+#define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b)
2369
+#endif
2370
+
2371
+SIMDE_FUNCTION_ATTRIBUTES
2372
+simde__m64 simde_mm_unpacklo_pi32(simde__m64 a, simde__m64 b)
2373
+{
2374
+#if defined(SIMDE_X86_MMX_NATIVE)
2375
+   return _mm_unpacklo_pi32(a, b);
2376
+#else
2377
+   simde__m64_private r_;
2378
+   simde__m64_private a_ = simde__m64_to_private(a);
2379
+   simde__m64_private b_ = simde__m64_to_private(b);
2380
+
2381
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2382
+   r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32);
2383
+#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE)
2384
+   r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32);
2385
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2386
+   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2);
2387
+#else
2388
+   r_.i32[0] = a_.i32[0];
2389
+   r_.i32[1] = b_.i32[0];
2390
+#endif
2391
+
2392
+   return simde__m64_from_private(r_);
2393
+#endif
2394
+}
2395
+#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2396
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2397
+#define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b)
2398
+#define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b)
2399
+#endif
2400
+
2401
+SIMDE_FUNCTION_ATTRIBUTES
2402
+simde__m64 simde_mm_xor_si64(simde__m64 a, simde__m64 b)
2403
+{
2404
+#if defined(SIMDE_X86_MMX_NATIVE)
2405
+   return _mm_xor_si64(a, b);
2406
+#else
2407
+   simde__m64_private r_;
2408
+   simde__m64_private a_ = simde__m64_to_private(a);
2409
+   simde__m64_private b_ = simde__m64_to_private(b);
2410
+
2411
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2412
+   r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32);
2413
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2414
+   r_.i32f = a_.i32f ^ b_.i32f;
2415
+#else
2416
+   r_.u64[0] = a_.u64[0] ^ b_.u64[0];
2417
+#endif
2418
+
2419
+   return simde__m64_from_private(r_);
2420
+#endif
2421
+}
2422
+#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b)
2423
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2424
+#define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b)
2425
+#define _m_pxor(a, b) simde_mm_xor_si64(a, b)
2426
+#endif
2427
+
2428
+SIMDE_FUNCTION_ATTRIBUTES
2429
+int32_t simde_m_to_int(simde__m64 a)
2430
+{
2431
+#if defined(SIMDE_X86_MMX_NATIVE)
2432
+   return _m_to_int(a);
2433
+#else
2434
+   simde__m64_private a_ = simde__m64_to_private(a);
2435
+
2436
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2437
+   HEDLEY_DIAGNOSTIC_PUSH
2438
+#if HEDLEY_HAS_WARNING("-Wvector-conversion") && \
2439
+   SIMDE_DETECT_CLANG_VERSION_NOT(10, 0, 0)
2440
+#pragma clang diagnostic ignored "-Wvector-conversion"
2441
+#endif
2442
+   return vget_lane_s32(a_.neon_i32, 0);
2443
+   HEDLEY_DIAGNOSTIC_POP
2444
+#else
2445
+   return a_.i32[0];
2446
+#endif
2447
+#endif
2448
+}
2449
+#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES)
2450
+#define _m_to_int(a) simde_m_to_int(a)
2451
+#endif
2452
+
2453
+SIMDE_END_DECLS_
2454
+
2455
+HEDLEY_DIAGNOSTIC_POP
2456
+
2457
+#endif /* !defined(SIMDE_X86_MMX_H) */
2458
obs-studio-26.1.1.tar.xz/libobs/util/simde/x86/sse.h Added
4481
 
1
@@ -0,0 +1,4479 @@
2
+/* SPDX-License-Identifier: MIT
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person
5
+ * obtaining a copy of this software and associated documentation
6
+ * files (the "Software"), to deal in the Software without
7
+ * restriction, including without limitation the rights to use, copy,
8
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ * of the Software, and to permit persons to whom the Software is
10
+ * furnished to do so, subject to the following conditions:
11
+ *
12
+ * The above copyright notice and this permission notice shall be
13
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ * SOFTWARE.
23
+ *
24
+ * Copyright:
25
+ *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
+ *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
27
+ *   2015      Brandon Rowlett <browlett@nvidia.com>
28
+ *   2015      Ken Fast <kfast@gdeb.com>
29
+ */
30
+
31
+#if !defined(SIMDE_X86_SSE_H)
32
+#define SIMDE_X86_SSE_H
33
+
34
+#include "mmx.h"
35
+
36
+#if defined(_WIN32)
37
+#include <windows.h>
38
+#endif
39
+
40
+HEDLEY_DIAGNOSTIC_PUSH
41
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
42
+SIMDE_BEGIN_DECLS_
43
+
44
+typedef union {
45
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
46
+   SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47
+   SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48
+   SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49
+   SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50
+   SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51
+   SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
52
+   SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
53
+   SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54
+#if defined(SIMDE_HAVE_INT128_)
55
+   SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
56
+   SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
+#endif
58
+   SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
59
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
61
+#else
62
+   SIMDE_ALIGN_TO_16 int8_t i8[16];
63
+   SIMDE_ALIGN_TO_16 int16_t i16[8];
64
+   SIMDE_ALIGN_TO_16 int32_t i32[4];
65
+   SIMDE_ALIGN_TO_16 int64_t i64[2];
66
+   SIMDE_ALIGN_TO_16 uint8_t u8[16];
67
+   SIMDE_ALIGN_TO_16 uint16_t u16[8];
68
+   SIMDE_ALIGN_TO_16 uint32_t u32[4];
69
+   SIMDE_ALIGN_TO_16 uint64_t u64[2];
70
+#if defined(SIMDE_HAVE_INT128_)
71
+   SIMDE_ALIGN_TO_16 simde_int128 i128[1];
72
+   SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
73
+#endif
74
+   SIMDE_ALIGN_TO_16 simde_float32 f32[4];
75
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
76
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
77
+#endif
78
+
79
+   SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
80
+   SIMDE_ALIGN_TO_16 simde__m64 m64[2];
81
+
82
+#if defined(SIMDE_X86_SSE_NATIVE)
83
+   SIMDE_ALIGN_TO_16 __m128 n;
84
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
85
+   SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
86
+   SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
87
+   SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
88
+   SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
89
+   SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
90
+   SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
91
+   SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
92
+   SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
93
+   SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
94
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
95
+   SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
96
+#endif
97
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
98
+   SIMDE_ALIGN_TO_16 v128_t wasm_v128;
99
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
100
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
101
+   SIMDE_ALIGN_TO_16
102
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
103
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
104
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
105
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
106
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
107
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
108
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
109
+   SIMDE_ALIGN_TO_16
110
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
111
+   SIMDE_ALIGN_TO_16
112
+   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
113
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
114
+#endif
115
+#endif
116
+} simde__m128_private;
117
+
118
+#if defined(SIMDE_X86_SSE_NATIVE)
119
+typedef __m128 simde__m128;
120
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
121
+typedef float32x4_t simde__m128;
122
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
123
+typedef v128_t simde__m128;
124
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
125
+typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128;
126
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
127
+typedef simde_float32
128
+   simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
129
+#else
130
+typedef simde__m128_private simde__m128;
131
+#endif
132
+
133
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
134
+typedef simde__m128 __m128;
135
+#endif
136
+
137
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
138
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private),
139
+            "simde__m128_private size incorrect");
140
+#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
141
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16,
142
+            "simde__m128 is not 16-byte aligned");
143
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16,
144
+            "simde__m128_private is not 16-byte aligned");
145
+#endif
146
+
147
+SIMDE_FUNCTION_ATTRIBUTES
148
+simde__m128 simde__m128_from_private(simde__m128_private v)
149
+{
150
+   simde__m128 r;
151
+   simde_memcpy(&r, &v, sizeof(r));
152
+   return r;
153
+}
154
+
155
+SIMDE_FUNCTION_ATTRIBUTES
156
+simde__m128_private simde__m128_to_private(simde__m128 v)
157
+{
158
+   simde__m128_private r;
159
+   simde_memcpy(&r, &v, sizeof(r));
160
+   return r;
161
+}
162
+
163
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
164
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8)
165
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16)
166
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32)
167
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64)
168
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8)
169
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16)
170
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, u32)
171
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64)
172
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32)
173
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
174
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64)
175
+#endif
176
+#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
177
+
178
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
179
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
180
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed char),
181
+                      altivec, i8)
182
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
183
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed short),
184
+                      altivec, i16)
185
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
186
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed int),
187
+                      altivec, i32)
188
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
189
+   m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
190
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
191
+   m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
192
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128,
193
+                      SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
194
+                      altivec, u32)
195
+
196
+#if defined(SIMDE_BUG_GCC_95782)
197
+SIMDE_FUNCTION_ATTRIBUTES
198
+SIMDE_POWER_ALTIVEC_VECTOR(float)
199
+simde__m128_to_altivec_f32(simde__m128 value)
200
+{
201
+   simde__m128_private r_ = simde__m128_to_private(value);
202
+   return r_.altivec_f32;
203
+}
204
+
205
+SIMDE_FUNCTION_ATTRIBUTES
206
+simde__m128 simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float)
207
+                        value)
208
+{
209
+   simde__m128_private r_;
210
+   r_.altivec_f32 = value;
211
+   return simde__m128_from_private(r_);
212
+}
213
+#else
214
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float),
215
+                      altivec, f32)
216
+#endif
217
+
218
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
219
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
220
+   m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
221
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
222
+   m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
223
+#endif
224
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
225
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128);
226
+#endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */
227
+
228
+enum {
229
+#if defined(SIMDE_X86_SSE_NATIVE)
230
+   SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
231
+   SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN,
232
+   SIMDE_MM_ROUND_UP = _MM_ROUND_UP,
233
+   SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
234
+#else
235
+   SIMDE_MM_ROUND_NEAREST = 0x0000,
236
+   SIMDE_MM_ROUND_DOWN = 0x2000,
237
+   SIMDE_MM_ROUND_UP = 0x4000,
238
+   SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000
239
+#endif
240
+};
241
+
242
+#if defined(_MM_FROUND_TO_NEAREST_INT)
243
+#define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT
244
+#define SIMDE_MM_FROUND_TO_NEG_INF _MM_FROUND_TO_NEG_INF
245
+#define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF
246
+#define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO
247
+#define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION
248
+
249
+#define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC
250
+#define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC
251
+#else
252
+#define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00
253
+#define SIMDE_MM_FROUND_TO_NEG_INF 0x01
254
+#define SIMDE_MM_FROUND_TO_POS_INF 0x02
255
+#define SIMDE_MM_FROUND_TO_ZERO 0x03
256
+#define SIMDE_MM_FROUND_CUR_DIRECTION 0x04
257
+
258
+#define SIMDE_MM_FROUND_RAISE_EXC 0x00
259
+#define SIMDE_MM_FROUND_NO_EXC 0x08
260
+#endif
261
+
262
+#define SIMDE_MM_FROUND_NINT \
263
+   (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC)
264
+#define SIMDE_MM_FROUND_FLOOR \
265
+   (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC)
266
+#define SIMDE_MM_FROUND_CEIL \
267
+   (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC)
268
+#define SIMDE_MM_FROUND_TRUNC \
269
+   (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC)
270
+#define SIMDE_MM_FROUND_RINT \
271
+   (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC)
272
+#define SIMDE_MM_FROUND_NEARBYINT \
273
+   (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC)
274
+
275
+#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && \
276
+   !defined(_MM_FROUND_TO_NEAREST_INT)
277
+#define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT
278
+#define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF
279
+#define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF
280
+#define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO
281
+#define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION
282
+#define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC
283
+#define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT
284
+#define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR
285
+#define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL
286
+#define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC
287
+#define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT
288
+#define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT
289
+#endif
290
+
291
+SIMDE_FUNCTION_ATTRIBUTES
292
+unsigned int SIMDE_MM_GET_ROUNDING_MODE(void)
293
+{
294
+#if defined(SIMDE_X86_SSE_NATIVE)
295
+   return _MM_GET_ROUNDING_MODE();
296
+#elif defined(SIMDE_HAVE_FENV_H)
297
+   unsigned int vfe_mode;
298
+
299
+   switch (fegetround()) {
300
+#if defined(FE_TONEAREST)
301
+   case FE_TONEAREST:
302
+       vfe_mode = SIMDE_MM_ROUND_NEAREST;
303
+       break;
304
+#endif
305
+
306
+#if defined(FE_TOWARDZERO)
307
+   case FE_TOWARDZERO:
308
+       /* FE_TOWARDZERO corresponds to round-toward-zero; the original code
+          assigned SIMDE_MM_ROUND_DOWN here, contradicting the mapping in
+          SIMDE_MM_SET_ROUNDING_MODE below */
+       vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO;
309
+       break;
310
+#endif
311
+
312
+#if defined(FE_UPWARD)
313
+   case FE_UPWARD:
314
+       vfe_mode = SIMDE_MM_ROUND_UP;
315
+       break;
316
+#endif
317
+
318
+#if defined(FE_DOWNWARD)
319
+   case FE_DOWNWARD:
320
+       /* likewise, FE_DOWNWARD corresponds to SIMDE_MM_ROUND_DOWN */
+       vfe_mode = SIMDE_MM_ROUND_DOWN;
321
+       break;
322
+#endif
323
+
324
+   default:
325
+       vfe_mode = SIMDE_MM_ROUND_NEAREST;
326
+       break;
327
+   }
328
+
329
+   return vfe_mode;
330
+#else
331
+   return SIMDE_MM_ROUND_NEAREST;
332
+#endif
333
+}
334
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
335
+#define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE()
336
+#endif
337
+
338
+SIMDE_FUNCTION_ATTRIBUTES
339
+void SIMDE_MM_SET_ROUNDING_MODE(unsigned int a)
340
+{
341
+#if defined(SIMDE_X86_SSE_NATIVE)
342
+   _MM_SET_ROUNDING_MODE(a);
343
+#elif defined(SIMDE_HAVE_FENV_H)
344
+   int fe_mode = FE_TONEAREST;
345
+
346
+   switch (a) {
347
+#if defined(FE_TONEAREST)
348
+   case SIMDE_MM_ROUND_NEAREST:
349
+       fe_mode = FE_TONEAREST;
350
+       break;
351
+#endif
352
+
353
+#if defined(FE_TOWARDZERO)
354
+   case SIMDE_MM_ROUND_TOWARD_ZERO:
355
+       fe_mode = FE_TOWARDZERO;
356
+       break;
357
+#endif
358
+
359
+#if defined(FE_DOWNWARD)
360
+   case SIMDE_MM_ROUND_DOWN:
361
+       fe_mode = FE_DOWNWARD;
362
+       break;
363
+#endif
364
+
365
+#if defined(FE_UPWARD)
366
+   case SIMDE_MM_ROUND_UP:
367
+       fe_mode = FE_UPWARD;
368
+       break;
369
+#endif
370
+
371
+   default:
372
+       return;
373
+   }
374
+
375
+   fesetround(fe_mode);
376
+#else
377
+   (void)a;
378
+#endif
379
+}
380
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
381
+#define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a)
382
+#endif
383
+
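Illustrative sketch (editorial, not part of the vendored file): on targets without SSE the two wrappers above fall back to <fenv.h>, so the usual save/modify/restore pattern still applies. The helper name is hypothetical:

static void simde_example_with_truncation(void)
{
	unsigned int saved = SIMDE_MM_GET_ROUNDING_MODE();
	SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_TOWARD_ZERO);
	/* ... float-to-int work that relies on truncation ... */
	SIMDE_MM_SET_ROUNDING_MODE(saved);
}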
384
+SIMDE_FUNCTION_ATTRIBUTES
385
+uint32_t simde_mm_getcsr(void)
386
+{
387
+#if defined(SIMDE_X86_SSE_NATIVE)
388
+   return _mm_getcsr();
389
+#else
390
+   return SIMDE_MM_GET_ROUNDING_MODE();
391
+#endif
392
+}
393
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
394
+#define _mm_getcsr() simde_mm_getcsr()
395
+#endif
396
+
397
+SIMDE_FUNCTION_ATTRIBUTES
398
+void simde_mm_setcsr(uint32_t a)
399
+{
400
+#if defined(SIMDE_X86_SSE_NATIVE)
401
+   _mm_setcsr(a);
402
+#else
403
+   SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a));
404
+#endif
405
+}
406
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
407
+#define _mm_setcsr(a) simde_mm_setcsr(a)
408
+#endif
409
+
410
+SIMDE_FUNCTION_ATTRIBUTES
411
+simde__m128 simde_x_mm_round_ps(simde__m128 a, int rounding, int lax_rounding)
412
+   SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15)
413
+       SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1)
414
+{
415
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
416
+
417
+   (void)lax_rounding;
418
+
419
+/* For architectures which lack a current direction SIMD instruction.
420
+   *
421
+   * Note that NEON actually has a current rounding mode instruction,
422
+   * but in ARMv8+ the rounding mode is ignored and nearest is always
423
+   * used, so we treat ARMv7 as having a rounding mode but ARMv8 as
424
+   * not. */
425
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ARM_NEON_A32V8)
426
+   if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION)
427
+       rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE())
428
+              << 13;
429
+#endif
430
+
431
+   switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) {
432
+   case SIMDE_MM_FROUND_CUR_DIRECTION:
433
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
434
+       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
435
+           SIMDE_POWER_ALTIVEC_VECTOR(float),
436
+           vec_round(a_.altivec_f32));
437
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
438
+       r_.neon_f32 = vrndiq_f32(a_.neon_f32);
439
+#elif defined(simde_math_nearbyintf)
440
+       SIMDE_VECTORIZE
441
+       for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
442
+            i++) {
443
+           r_.f32[i] = simde_math_nearbyintf(a_.f32[i]);
444
+       }
445
+#else
446
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
447
+#endif
448
+       break;
449
+
450
+   case SIMDE_MM_FROUND_TO_NEAREST_INT:
451
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
452
+       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
453
+           SIMDE_POWER_ALTIVEC_VECTOR(float),
454
+           vec_rint(a_.altivec_f32));
455
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
456
+       r_.neon_f32 = vrndnq_f32(a_.neon_f32);
457
+#elif defined(simde_math_roundevenf)
458
+       SIMDE_VECTORIZE
459
+       for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
460
+            i++) {
461
+           r_.f32[i] = simde_math_roundevenf(a_.f32[i]);
462
+       }
463
+#else
464
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
465
+#endif
466
+       break;
467
+
468
+   case SIMDE_MM_FROUND_TO_NEG_INF:
469
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
470
+       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
471
+           SIMDE_POWER_ALTIVEC_VECTOR(float),
472
+           vec_floor(a_.altivec_f32));
473
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
474
+       r_.neon_f32 = vrndmq_f32(a_.neon_f32);
475
+#elif defined(simde_math_floorf)
476
+       SIMDE_VECTORIZE
477
+       for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
478
+            i++) {
479
+           r_.f32[i] = simde_math_floorf(a_.f32[i]);
480
+       }
481
+#else
482
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
483
+#endif
484
+       break;
485
+
486
+   case SIMDE_MM_FROUND_TO_POS_INF:
487
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
488
+       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
489
+           SIMDE_POWER_ALTIVEC_VECTOR(float),
490
+           vec_ceil(a_.altivec_f32));
491
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
492
+       r_.neon_f32 = vrndpq_f32(a_.neon_f32);
493
+#elif defined(simde_math_ceilf)
494
+       SIMDE_VECTORIZE
495
+       for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
496
+            i++) {
497
+           r_.f32[i] = simde_math_ceilf(a_.f32[i]);
498
+       }
499
+#else
500
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
501
+#endif
502
+       break;
503
+
504
+   case SIMDE_MM_FROUND_TO_ZERO:
505
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
506
+       r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
507
+           SIMDE_POWER_ALTIVEC_VECTOR(float),
508
+           vec_trunc(a_.altivec_f32));
509
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE)
510
+       r_.neon_f32 = vrndq_f32(a_.neon_f32);
511
+#elif defined(simde_math_truncf)
512
+       SIMDE_VECTORIZE
513
+       for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0]));
514
+            i++) {
515
+           r_.f32[i] = simde_math_truncf(a_.f32[i]);
516
+       }
517
+#else
518
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
519
+#endif
520
+       break;
521
+
522
+   default:
523
+       HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd());
524
+   }
525
+
526
+   return simde__m128_from_private(r_);
527
+}
528
+#if defined(SIMDE_X86_SSE4_1_NATIVE)
529
+#define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding))
530
+#else
531
+#define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0)
532
+#endif
533
+#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES)
534
+#define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding))
535
+#endif
536
+
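Illustrative sketch (editorial, not part of the vendored file): simde_mm_round_ps dispatches on the direction bits only, since the switch above masks off SIMDE_MM_FROUND_NO_EXC. With ties-to-even rounding, 2.5f rounds to 2.0f and -1.5f to -2.0f. simde_mm_set_ps, used here, is defined just below:

static simde__m128 simde_example_round_nearest(void)
{
	simde__m128 v = simde_mm_set_ps(3.7f, -1.5f, 0.25f, 2.5f);
	/* lanes become 4.0f, -2.0f, 0.0f and 2.0f respectively */
	return simde_mm_round_ps(v, SIMDE_MM_FROUND_TO_NEAREST_INT);
}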
537
+SIMDE_FUNCTION_ATTRIBUTES
538
+simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
539
+               simde_float32 e1, simde_float32 e0)
540
+{
541
+#if defined(SIMDE_X86_SSE_NATIVE)
542
+   return _mm_set_ps(e3, e2, e1, e0);
543
+#else
544
+   simde__m128_private r_;
545
+
546
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
547
+   SIMDE_ALIGN_TO_16 simde_float32 data[4] = {e0, e1, e2, e3};
548
+   r_.neon_f32 = vld1q_f32(data);
549
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
550
+   r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3);
551
+#else
552
+   r_.f32[0] = e0;
553
+   r_.f32[1] = e1;
554
+   r_.f32[2] = e2;
555
+   r_.f32[3] = e3;
556
+#endif
557
+
558
+   return simde__m128_from_private(r_);
559
+#endif
560
+}
561
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
562
+#define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0)
563
+#endif
564
+
565
+SIMDE_FUNCTION_ATTRIBUTES
566
+simde__m128 simde_mm_set_ps1(simde_float32 a)
567
+{
568
+#if defined(SIMDE_X86_SSE_NATIVE)
569
+   return _mm_set_ps1(a);
570
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
571
+   return vdupq_n_f32(a);
572
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
573
+   (void)a;
574
+   return vec_splats(a);
575
+#else
576
+   return simde_mm_set_ps(a, a, a, a);
577
+#endif
578
+}
579
+#define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
580
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
581
+#define _mm_set_ps1(a) simde_mm_set_ps1(a)
582
+#define _mm_set1_ps(a) simde_mm_set1_ps(a)
583
+#endif
584
+
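Small editorial example (not from the upstream file): as the portable fallback above shows, the last argument e0 lands in the lowest lane (f32[0]) and e3 in the highest, matching Intel's _mm_set_ps convention:

static simde__m128 simde_example_lane_order(void)
{
	/* f32[0] == 1.0f, f32[1] == 2.0f, f32[2] == 3.0f, f32[3] == 4.0f */
	return simde_mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
}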
585
+SIMDE_FUNCTION_ATTRIBUTES
586
+simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
587
+{
588
+#if defined(SIMDE_X86_SSE_NATIVE)
589
+   return _mm_move_ss(a, b);
590
+#else
591
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
592
+               b_ = simde__m128_to_private(b);
593
+
594
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
595
+   r_.neon_f32 =
596
+       vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0);
597
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
598
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
599
+   m = {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
600
+   r_.altivec_f32 = vec_perm(a_.altivec_f32, b_.altivec_f32, m);
601
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
602
+   r_.wasm_v128 = wasm_v8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2,
603
+                     3, 20, 21, 22, 23, 24, 25, 26, 27, 28,
604
+                     29, 30, 31);
605
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
606
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3);
607
+#else
608
+   r_.f32[0] = b_.f32[0];
609
+   r_.f32[1] = a_.f32[1];
610
+   r_.f32[2] = a_.f32[2];
611
+   r_.f32[3] = a_.f32[3];
612
+#endif
613
+
614
+   return simde__m128_from_private(r_);
615
+#endif
616
+}
617
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
618
+#define _mm_move_ss(a, b) simde_mm_move_ss((a), (b))
619
+#endif
620
+
621
+SIMDE_FUNCTION_ATTRIBUTES
622
+simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
623
+{
624
+#if defined(SIMDE_X86_SSE_NATIVE)
625
+   return _mm_add_ps(a, b);
626
+#else
627
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
628
+               b_ = simde__m128_to_private(b);
629
+
630
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
631
+   r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32);
632
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
633
+   r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128);
634
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
635
+   r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32);
636
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
637
+   r_.f32 = a_.f32 + b_.f32;
638
+#else
639
+   SIMDE_VECTORIZE
640
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
641
+       r_.f32[i] = a_.f32[i] + b_.f32[i];
642
+   }
643
+#endif
644
+
645
+   return simde__m128_from_private(r_);
646
+#endif
647
+}
648
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
649
+#define _mm_add_ps(a, b) simde_mm_add_ps((a), (b))
650
+#endif
651
+
652
+SIMDE_FUNCTION_ATTRIBUTES
653
+simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
654
+{
655
+#if defined(SIMDE_X86_SSE_NATIVE)
656
+   return _mm_add_ss(a, b);
657
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
658
+   return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
659
+#else
660
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
661
+               b_ = simde__m128_to_private(b);
662
+
663
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
664
+   float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0);
665
+   float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
666
+   // the upper values in the result must be the remnants of <a>.
667
+   r_.neon_f32 = vaddq_f32(a_.neon_f32, value);
668
+#else
669
+   r_.f32[0] = a_.f32[0] + b_.f32[0];
670
+   r_.f32[1] = a_.f32[1];
671
+   r_.f32[2] = a_.f32[2];
672
+   r_.f32[3] = a_.f32[3];
673
+#endif
674
+
675
+   return simde__m128_from_private(r_);
676
+#endif
677
+}
678
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
679
+#define _mm_add_ss(a, b) simde_mm_add_ss((a), (b))
680
+#endif
681
+
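Illustrative sketch (editorial, not from the upstream file): the *_ss operations only touch lane 0 and carry the remaining lanes over from a, which is why the wide-vector path above is expressed through simde_mm_move_ss:

static simde__m128 simde_example_add_ss(simde__m128 a, simde__m128 b)
{
	/* equivalent to simde_mm_add_ss(a, b): lane 0 becomes a0 + b0,
	 * lanes 1-3 are taken unchanged from a */
	return simde_mm_move_ss(a, simde_mm_add_ps(a, b));
}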
682
+SIMDE_FUNCTION_ATTRIBUTES
683
+simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
684
+{
685
+#if defined(SIMDE_X86_SSE_NATIVE)
686
+   return _mm_and_ps(a, b);
687
+#else
688
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
689
+               b_ = simde__m128_to_private(b);
690
+
691
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
692
+   r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
693
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
694
+   r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
695
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
696
+   r_.i32 = a_.i32 & b_.i32;
697
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
698
+   r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32);
699
+#else
700
+   SIMDE_VECTORIZE
701
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
702
+       r_.i32[i] = a_.i32[i] & b_.i32[i];
703
+   }
704
+#endif
705
+
706
+   return simde__m128_from_private(r_);
707
+#endif
708
+}
709
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
710
+#define _mm_and_ps(a, b) simde_mm_and_ps((a), (b))
711
+#endif
712
+
713
+SIMDE_FUNCTION_ATTRIBUTES
714
+simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
715
+{
716
+#if defined(SIMDE_X86_SSE_NATIVE)
717
+   return _mm_andnot_ps(a, b);
718
+#else
719
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
720
+               b_ = simde__m128_to_private(b);
721
+
722
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
723
+   r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
724
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
725
+   r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
726
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
727
+   r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32);
728
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
729
+   r_.i32 = ~a_.i32 & b_.i32;
730
+#else
731
+   SIMDE_VECTORIZE
732
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
733
+       r_.i32[i] = ~(a_.i32[i]) & b_.i32[i];
734
+   }
735
+#endif
736
+
737
+   return simde__m128_from_private(r_);
738
+#endif
739
+}
740
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
741
+#define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b))
742
+#endif
743
+
744
+SIMDE_FUNCTION_ATTRIBUTES
745
+simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
746
+{
747
+#if defined(SIMDE_X86_SSE_NATIVE)
748
+   return _mm_xor_ps(a, b);
749
+#else
750
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
751
+               b_ = simde__m128_to_private(b);
752
+
753
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
754
+   r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
755
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
756
+   r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
757
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
758
+   r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
759
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
760
+   r_.i32f = a_.i32f ^ b_.i32f;
761
+#else
762
+   SIMDE_VECTORIZE
763
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
764
+       r_.u32[i] = a_.u32[i] ^ b_.u32[i];
765
+   }
766
+#endif
767
+
768
+   return simde__m128_from_private(r_);
769
+#endif
770
+}
771
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
772
+#define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b))
773
+#endif
774
+
775
+SIMDE_FUNCTION_ATTRIBUTES
776
+simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
777
+{
778
+#if defined(SIMDE_X86_SSE_NATIVE)
779
+   return _mm_or_ps(a, b);
780
+#else
781
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
782
+               b_ = simde__m128_to_private(b);
783
+
784
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
785
+   r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
786
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
787
+   r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
788
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
789
+   r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
790
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
791
+   r_.i32f = a_.i32f | b_.i32f;
792
+#else
793
+   SIMDE_VECTORIZE
794
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
795
+       r_.u32[i] = a_.u32[i] | b_.u32[i];
796
+   }
797
+#endif
798
+
799
+   return simde__m128_from_private(r_);
800
+#endif
801
+}
802
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
803
+#define _mm_or_ps(a, b) simde_mm_or_ps((a), (b))
804
+#endif
805
+
806
+SIMDE_FUNCTION_ATTRIBUTES
807
+simde__m128 simde_x_mm_not_ps(simde__m128 a)
808
+{
809
+#if defined(SIMDE_X86_AVX512VL_NATIVE)
810
+   __m128i ai = _mm_castps_si128(a);
811
+   return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55));
812
+#elif defined(SIMDE_X86_SSE2_NATIVE)
813
+   /* Note: we use ints instead of floats because we don't want cmpeq
814
+     * to return false for (NaN, NaN) */
815
+   __m128i ai = _mm_castps_si128(a);
816
+   return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai)));
817
+#else
818
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
819
+
820
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
821
+   r_.neon_i32 = vmvnq_s32(a_.neon_i32);
822
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
823
+   r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
824
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
825
+   r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
826
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
827
+   r_.i32 = ~a_.i32;
828
+#else
829
+   SIMDE_VECTORIZE
830
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
831
+       r_.i32[i] = ~(a_.i32[i]);
832
+   }
833
+#endif
834
+
835
+   return simde__m128_from_private(r_);
836
+#endif
837
+}
838
+
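Editorial scalar sketch (not from the upstream file): the SSE2 path above works in the integer domain because a floating-point compare of a NaN lane with itself is false, while _mm_cmpeq_epi32(ai, ai) always produces all-ones; andnot against all-ones is then a plain bitwise NOT of every lane. Assuming <stdint.h>:

static uint32_t simde_example_not_lane(uint32_t lane_bits)
{
	uint32_t all_ones = ~UINT32_C(0); /* what the integer self-compare yields */
	return ~lane_bits & all_ones;     /* andnot(lane, all_ones) == ~lane */
}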
839
+SIMDE_FUNCTION_ATTRIBUTES
840
+simde__m128 simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask)
841
+{
842
+/* This function is for when you want to blend two elements together
843
+   * according to a mask.  It is similar to _mm_blendv_ps, except that
844
+   * it is undefined whether the blend is based on the highest bit in
845
+   * each lane (like blendv) or just bitwise operations.  This allows
846
+   * us to implement the function efficiently everywhere.
847
+   *
848
+   * Basically, you promise that all the lanes in mask are either 0 or
849
+   * ~0. */
850
+#if defined(SIMDE_X86_SSE4_1_NATIVE)
851
+   return _mm_blendv_ps(a, b, mask);
852
+#else
853
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
854
+               b_ = simde__m128_to_private(b),
855
+               mask_ = simde__m128_to_private(mask);
856
+
857
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
858
+   r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32);
859
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
860
+   r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128,
861
+                      mask_.wasm_v128);
862
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
863
+   r_.altivec_i32 =
864
+       vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32);
865
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
866
+   r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32);
867
+#else
868
+   SIMDE_VECTORIZE
869
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
870
+       r_.i32[i] = a_.i32[i] ^
871
+               ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]);
872
+   }
873
+#endif
874
+
875
+   return simde__m128_from_private(r_);
876
+#endif
877
+}
878
+
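Illustrative sketch (editorial, not from the upstream file): the mask contract described above, every lane either 0 or ~0, is exactly what the comparison functions in this header produce, so a per-lane conditional can be written as:

static simde__m128 simde_example_select(simde__m128 a, simde__m128 b,
					 simde__m128 x, simde__m128 y)
{
	simde__m128 mask = simde_mm_cmpeq_ps(x, y); /* defined later in this file */
	/* lanes where x == y come from b, all other lanes from a */
	return simde_x_mm_select_ps(a, b, mask);
}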
879
+SIMDE_FUNCTION_ATTRIBUTES
880
+simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
881
+{
882
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
883
+   return _mm_avg_pu16(a, b);
884
+#else
885
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
886
+                  b_ = simde__m64_to_private(b);
887
+
888
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
889
+   r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16);
890
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
891
+   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
892
+   defined(SIMDE_CONVERT_VECTOR_)
893
+   uint32_t wa SIMDE_VECTOR(16);
894
+   uint32_t wb SIMDE_VECTOR(16);
895
+   uint32_t wr SIMDE_VECTOR(16);
896
+   SIMDE_CONVERT_VECTOR_(wa, a_.u16);
897
+   SIMDE_CONVERT_VECTOR_(wb, b_.u16);
898
+   wr = (wa + wb + 1) >> 1;
899
+   SIMDE_CONVERT_VECTOR_(r_.u16, wr);
900
+#else
901
+   SIMDE_VECTORIZE
902
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
903
+       r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
904
+   }
905
+#endif
906
+
907
+   return simde__m64_from_private(r_);
908
+#endif
909
+}
910
+#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
911
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
912
+#define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b)
913
+#define _m_pavgw(a, b) simde_mm_avg_pu16(a, b)
914
+#endif
915
+
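The widened path above computes a rounding average, (a + b + 1) >> 1, in a wider type so the intermediate sum cannot overflow. A standalone sketch for a single uint16_t lane (illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Rounding average of two uint16_t values, widened to avoid overflow. */
static uint16_t avg_u16(uint16_t a, uint16_t b)
{
    return (uint16_t)(((uint32_t)a + (uint32_t)b + 1) >> 1);
}

int main(void)
{
    printf("%u\n", (unsigned)avg_u16(10, 11));       /* 11: rounds up on ties */
    printf("%u\n", (unsigned)avg_u16(65535, 65535)); /* 65535: no overflow thanks to widening */
    return 0;
}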
916
+SIMDE_FUNCTION_ATTRIBUTES
917
+simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
918
+{
919
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
920
+   return _mm_avg_pu8(a, b);
921
+#else
922
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
923
+                  b_ = simde__m64_to_private(b);
924
+
925
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
926
+   r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8);
927
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
928
+   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
929
+   defined(SIMDE_CONVERT_VECTOR_)
930
+   uint16_t wa SIMDE_VECTOR(16);
931
+   uint16_t wb SIMDE_VECTOR(16);
932
+   uint16_t wr SIMDE_VECTOR(16);
933
+   SIMDE_CONVERT_VECTOR_(wa, a_.u8);
934
+   SIMDE_CONVERT_VECTOR_(wb, b_.u8);
935
+   wr = (wa + wb + 1) >> 1;
936
+   SIMDE_CONVERT_VECTOR_(r_.u8, wr);
937
+#else
938
+   SIMDE_VECTORIZE
939
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
940
+       r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
941
+   }
942
+#endif
943
+
944
+   return simde__m64_from_private(r_);
945
+#endif
946
+}
947
+#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
948
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
949
+#define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b)
950
+#define _m_pavgb(a, b) simde_mm_avg_pu8(a, b)
951
+#endif
952
+
953
+SIMDE_FUNCTION_ATTRIBUTES
954
+simde__m128 simde_x_mm_abs_ps(simde__m128 a)
955
+{
956
+#if defined(SIMDE_X86_AVX512F_NATIVE) && \
957
+   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7, 1, 0))
958
+   return _mm512_castps512_ps128(_mm512_abs_ps(_mm512_castps128_ps512(a)));
959
+#else
960
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
961
+
962
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
963
+   r_.neon_f32 = vabsq_f32(a_.neon_f32);
964
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
965
+   r_.altivec_f32 = vec_abs(a_.altivec_f32);
966
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
967
+   r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128);
968
+#else
969
+   SIMDE_VECTORIZE
970
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
971
+       r_.f32[i] = simde_math_fabsf(a_.f32[i]);
972
+   }
973
+#endif
974
+
975
+   return simde__m128_from_private(r_);
976
+#endif
977
+}
978
+
979
+SIMDE_FUNCTION_ATTRIBUTES
980
+simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
981
+{
982
+#if defined(SIMDE_X86_SSE_NATIVE)
983
+   return _mm_cmpeq_ps(a, b);
984
+#else
985
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
986
+               b_ = simde__m128_to_private(b);
987
+
988
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
989
+   r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32);
990
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
991
+   r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128);
992
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
993
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
994
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
995
+       vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
996
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
997
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.f32 == b_.f32);
998
+#else
999
+   SIMDE_VECTORIZE
1000
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1001
+       r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0)
1002
+                            : UINT32_C(0);
1003
+   }
1004
+#endif
1005
+
1006
+   return simde__m128_from_private(r_);
1007
+#endif
1008
+}
1009
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1010
+#define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b))
1011
+#endif
1012
+
1013
+SIMDE_FUNCTION_ATTRIBUTES
1014
+simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
1015
+{
1016
+#if defined(SIMDE_X86_SSE_NATIVE)
1017
+   return _mm_cmpeq_ss(a, b);
1018
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1019
+   return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b));
1020
+#else
1021
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1022
+               b_ = simde__m128_to_private(b);
1023
+
1024
+   r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1025
+   SIMDE_VECTORIZE
1026
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1027
+       r_.u32[i] = a_.u32[i];
1028
+   }
1029
+
1030
+   return simde__m128_from_private(r_);
1031
+#endif
1032
+}
1033
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1034
+#define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b))
1035
+#endif
1036
+
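As the loops above show, the packed compares produce a per-lane mask of all ones on true and all zeros on false, while the _ss variants compare only lane 0 and copy lanes 1-3 unchanged from a. A scalar sketch of the mask convention for one lane (plain C, illustrative):

#include <stdint.h>
#include <stdio.h>

/* One lane of a packed compare: all-ones on true, all-zeros on false. */
static uint32_t cmpeq_lane(float a, float b)
{
    return (a == b) ? ~UINT32_C(0) : UINT32_C(0);
}

int main(void)
{
    /* Such masks are normally fed into AND/ANDNOT/OR or a select like
     * simde_x_mm_select_ps to build branchless code. */
    printf("%08x\n", (unsigned)cmpeq_lane(1.0f, 1.0f)); /* ffffffff */
    printf("%08x\n", (unsigned)cmpeq_lane(1.0f, 2.0f)); /* 00000000 */
    return 0;
}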
1037
+SIMDE_FUNCTION_ATTRIBUTES
1038
+simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
1039
+{
1040
+#if defined(SIMDE_X86_SSE_NATIVE)
1041
+   return _mm_cmpge_ps(a, b);
1042
+#else
1043
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1044
+               b_ = simde__m128_to_private(b);
1045
+
1046
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1047
+   r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1048
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1049
+   r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128);
1050
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1051
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1052
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1053
+       vec_cmpge(a_.altivec_f32, b_.altivec_f32));
1054
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1055
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32));
1056
+#else
1057
+   SIMDE_VECTORIZE
1058
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1059
+       r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0)
1060
+                            : UINT32_C(0);
1061
+   }
1062
+#endif
1063
+
1064
+   return simde__m128_from_private(r_);
1065
+#endif
1066
+}
1067
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1068
+#define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b))
1069
+#endif
1070
+
1071
+SIMDE_FUNCTION_ATTRIBUTES
1072
+simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
1073
+{
1074
+#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1075
+   return _mm_cmpge_ss(a, b);
1076
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1077
+   return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b));
1078
+#else
1079
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1080
+               b_ = simde__m128_to_private(b);
1081
+
1082
+   r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1083
+   SIMDE_VECTORIZE
1084
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1085
+       r_.u32[i] = a_.u32[i];
1086
+   }
1087
+
1088
+   return simde__m128_from_private(r_);
1089
+#endif
1090
+}
1091
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1092
+#define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b))
1093
+#endif
1094
+
1095
+SIMDE_FUNCTION_ATTRIBUTES
1096
+simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
1097
+{
1098
+#if defined(SIMDE_X86_SSE_NATIVE)
1099
+   return _mm_cmpgt_ps(a, b);
1100
+#else
1101
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1102
+               b_ = simde__m128_to_private(b);
1103
+
1104
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1105
+   r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1106
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1107
+   r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128);
1108
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1109
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1110
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1111
+       vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
1112
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1113
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32));
1114
+#else
1115
+   SIMDE_VECTORIZE
1116
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1117
+       r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? ~UINT32_C(0)
1118
+                           : UINT32_C(0);
1119
+   }
1120
+#endif
1121
+
1122
+   return simde__m128_from_private(r_);
1123
+#endif
1124
+}
1125
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1126
+#define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b))
1127
+#endif
1128
+
1129
+SIMDE_FUNCTION_ATTRIBUTES
1130
+simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
1131
+{
1132
+#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1133
+   return _mm_cmpgt_ss(a, b);
1134
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1135
+   return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b));
1136
+#else
1137
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1138
+               b_ = simde__m128_to_private(b);
1139
+
1140
+   r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1141
+   SIMDE_VECTORIZE
1142
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1143
+       r_.u32[i] = a_.u32[i];
1144
+   }
1145
+
1146
+   return simde__m128_from_private(r_);
1147
+#endif
1148
+}
1149
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1150
+#define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b))
1151
+#endif
1152
+
1153
+SIMDE_FUNCTION_ATTRIBUTES
1154
+simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
1155
+{
1156
+#if defined(SIMDE_X86_SSE_NATIVE)
1157
+   return _mm_cmple_ps(a, b);
1158
+#else
1159
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1160
+               b_ = simde__m128_to_private(b);
1161
+
1162
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1163
+   r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32);
1164
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1165
+   r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128);
1166
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1167
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1168
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1169
+       vec_cmple(a_.altivec_f32, b_.altivec_f32));
1170
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1171
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32));
1172
+#else
1173
+   SIMDE_VECTORIZE
1174
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1175
+       r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0)
1176
+                            : UINT32_C(0);
1177
+   }
1178
+#endif
1179
+
1180
+   return simde__m128_from_private(r_);
1181
+#endif
1182
+}
1183
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1184
+#define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b))
1185
+#endif
1186
+
1187
+SIMDE_FUNCTION_ATTRIBUTES
1188
+simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
1189
+{
1190
+#if defined(SIMDE_X86_SSE_NATIVE)
1191
+   return _mm_cmple_ss(a, b);
1192
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1193
+   return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b));
1194
+#else
1195
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1196
+               b_ = simde__m128_to_private(b);
1197
+
1198
+   r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1199
+   SIMDE_VECTORIZE
1200
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1201
+       r_.u32[i] = a_.u32[i];
1202
+   }
1203
+
1204
+   return simde__m128_from_private(r_);
1205
+#endif
1206
+}
1207
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1208
+#define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b))
1209
+#endif
1210
+
1211
+SIMDE_FUNCTION_ATTRIBUTES
1212
+simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
1213
+{
1214
+#if defined(SIMDE_X86_SSE_NATIVE)
1215
+   return _mm_cmplt_ps(a, b);
1216
+#else
1217
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1218
+               b_ = simde__m128_to_private(b);
1219
+
1220
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1221
+   r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32);
1222
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1223
+   r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128);
1224
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1225
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1226
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1227
+       vec_cmplt(a_.altivec_f32, b_.altivec_f32));
1228
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1229
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32));
1230
+#else
1231
+   SIMDE_VECTORIZE
1232
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1233
+       r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0)
1234
+                           : UINT32_C(0);
1235
+   }
1236
+#endif
1237
+
1238
+   return simde__m128_from_private(r_);
1239
+#endif
1240
+}
1241
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1242
+#define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b))
1243
+#endif
1244
+
1245
+SIMDE_FUNCTION_ATTRIBUTES
1246
+simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
1247
+{
1248
+#if defined(SIMDE_X86_SSE_NATIVE)
1249
+   return _mm_cmplt_ss(a, b);
1250
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1251
+   return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b));
1252
+#else
1253
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1254
+               b_ = simde__m128_to_private(b);
1255
+
1256
+   r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1257
+   SIMDE_VECTORIZE
1258
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1259
+       r_.u32[i] = a_.u32[i];
1260
+   }
1261
+
1262
+   return simde__m128_from_private(r_);
1263
+#endif
1264
+}
1265
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1266
+#define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b))
1267
+#endif
1268
+
1269
+SIMDE_FUNCTION_ATTRIBUTES
1270
+simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
1271
+{
1272
+#if defined(SIMDE_X86_SSE_NATIVE)
1273
+   return _mm_cmpneq_ps(a, b);
1274
+#else
1275
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1276
+               b_ = simde__m128_to_private(b);
1277
+
1278
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1279
+   r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1280
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1281
+   r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128);
1282
+#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && SIMDE_ARCH_POWER_CHECK(900) && \
1283
+   !defined(HEDLEY_IBM_VERSION)
1284
+   /* vec_cmpne(SIMDE_POWER_ALTIVEC_VECTOR(float), SIMDE_POWER_ALTIVEC_VECTOR(float))
1285
+        is missing from XL C/C++ v16.1.1,
1286
+        though the documentation (table 89 on page 432 of the IBM XL C/C++ for
1287
+        Linux Compiler Reference, Version 16.1.1) shows that it should be
1288
+        present.  Both GCC and clang support it. */
1289
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1290
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1291
+       vec_cmpne(a_.altivec_f32, b_.altivec_f32));
1292
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1293
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1294
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1295
+       vec_cmpeq(a_.altivec_f32, b_.altivec_f32));
1296
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1297
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1298
+       vec_nor(r_.altivec_f32, r_.altivec_f32));
1299
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1300
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32));
1301
+#else
1302
+   SIMDE_VECTORIZE
1303
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1304
+       r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0)
1305
+                            : UINT32_C(0);
1306
+   }
1307
+#endif
1308
+
1309
+   return simde__m128_from_private(r_);
1310
+#endif
1311
+}
1312
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1313
+#define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b))
1314
+#endif
1315
+
1316
+SIMDE_FUNCTION_ATTRIBUTES
1317
+simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
1318
+{
1319
+#if defined(SIMDE_X86_SSE_NATIVE)
1320
+   return _mm_cmpneq_ss(a, b);
1321
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1322
+   return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b));
1323
+#else
1324
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1325
+               b_ = simde__m128_to_private(b);
1326
+
1327
+   r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0);
1328
+   SIMDE_VECTORIZE
1329
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1330
+       r_.u32[i] = a_.u32[i];
1331
+   }
1332
+
1333
+   return simde__m128_from_private(r_);
1334
+#endif
1335
+}
1336
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1337
+#define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b))
1338
+#endif
1339
+
1340
+SIMDE_FUNCTION_ATTRIBUTES
1341
+simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
1342
+{
1343
+   return simde_mm_cmplt_ps(a, b);
1344
+}
1345
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1346
+#define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b))
1347
+#endif
1348
+
1349
+SIMDE_FUNCTION_ATTRIBUTES
1350
+simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
1351
+{
1352
+   return simde_mm_cmplt_ss(a, b);
1353
+}
1354
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1355
+#define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b))
1356
+#endif
1357
+
1358
+SIMDE_FUNCTION_ATTRIBUTES
1359
+simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
1360
+{
1361
+   return simde_mm_cmple_ps(a, b);
1362
+}
1363
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1364
+#define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b))
1365
+#endif
1366
+
1367
+SIMDE_FUNCTION_ATTRIBUTES
1368
+simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
1369
+{
1370
+   return simde_mm_cmple_ss(a, b);
1371
+}
1372
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1373
+#define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b))
1374
+#endif
1375
+
1376
+SIMDE_FUNCTION_ATTRIBUTES
1377
+simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
1378
+{
1379
+   return simde_mm_cmpgt_ps(a, b);
1380
+}
1381
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1382
+#define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b))
1383
+#endif
1384
+
1385
+SIMDE_FUNCTION_ATTRIBUTES
1386
+simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
1387
+{
1388
+   return simde_mm_cmpgt_ss(a, b);
1389
+}
1390
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1391
+#define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b))
1392
+#endif
1393
+
1394
+SIMDE_FUNCTION_ATTRIBUTES
1395
+simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
1396
+{
1397
+   return simde_mm_cmpge_ps(a, b);
1398
+}
1399
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1400
+#define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b))
1401
+#endif
1402
+
1403
+SIMDE_FUNCTION_ATTRIBUTES
1404
+simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
1405
+{
1406
+   return simde_mm_cmpge_ss(a, b);
1407
+}
1408
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1409
+#define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b))
1410
+#endif
1411
+
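A hedged aside on the forwarding above: in SSE, _mm_cmpnge_ps and the other negated predicates are defined as NOT(a >= b) and so on, which evaluate to true when either input is NaN, whereas a < b is false for NaN; the two forms only coincide for ordered inputs. The scalar sketch below merely illustrates that corner case (plain C, illustrative):

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
    float a = NAN, b = 1.0f;

    bool lt  = (a < b);   /* false: ordered compares are false for NaN */
    bool nge = !(a >= b); /* true: the negated predicate is true for NaN */

    /* The two predicates differ only when a NaN is involved. */
    printf("cmplt: %d, cmpnge: %d\n", (int)lt, (int)nge);
    return 0;
}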
1412
+SIMDE_FUNCTION_ATTRIBUTES
1413
+simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
1414
+{
1415
+#if defined(SIMDE_X86_SSE_NATIVE)
1416
+   return _mm_cmpord_ps(a, b);
1417
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1418
+   return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b));
1419
+#else
1420
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1421
+               b_ = simde__m128_to_private(b);
1422
+
1423
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1424
+   /* Note: NEON does not have ordered compare builtin
1425
+        Need to compare a eq a and b eq b to check for NaN
1426
+        Do AND of results to get final */
1427
+   uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
1428
+   uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
1429
+   r_.neon_u32 = vandq_u32(ceqaa, ceqbb);
1430
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1431
+   r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128),
1432
+                    wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128));
1433
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1434
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1435
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1436
+       vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1437
+           vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1438
+#elif defined(simde_math_isnanf)
1439
+   SIMDE_VECTORIZE
1440
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1441
+       r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
1442
+                simde_math_isnanf(b_.f32[i]))
1443
+                   ? UINT32_C(0)
1444
+                   : ~UINT32_C(0);
1445
+   }
1446
+#else
1447
+   HEDLEY_UNREACHABLE();
1448
+#endif
1449
+
1450
+   return simde__m128_from_private(r_);
1451
+#endif
1452
+}
1453
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1454
+#define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b))
1455
+#endif
1456
+
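The NEON and AltiVec paths above exploit the fact that x == x is false only when x is NaN, so ANDing the two self-compares yields the "ordered" mask. A scalar sketch of the same idea (plain C, illustrative):

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

/* "Ordered" in the SSE sense: neither operand is NaN. */
static bool is_ordered(float a, float b)
{
    return (a == a) && (b == b); /* x == x is false only for NaN */
}

int main(void)
{
    printf("%d\n", (int)is_ordered(1.0f, 2.0f)); /* 1 */
    printf("%d\n", (int)is_ordered(NAN, 2.0f));  /* 0 */
    return 0;
}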
1457
+SIMDE_FUNCTION_ATTRIBUTES
1458
+simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
1459
+{
1460
+#if defined(SIMDE_X86_SSE_NATIVE)
1461
+   return _mm_cmpunord_ps(a, b);
1462
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1463
+   return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b));
1464
+#else
1465
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1466
+               b_ = simde__m128_to_private(b);
1467
+
1468
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1469
+   uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32);
1470
+   uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32);
1471
+   r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb));
1472
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1473
+   r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128),
1474
+                   wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128));
1475
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
1476
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1477
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1478
+       vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1479
+            vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1480
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1481
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
1482
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
1483
+       vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32),
1484
+           vec_cmpeq(b_.altivec_f32, b_.altivec_f32)));
1485
+   r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32);
1486
+#elif defined(simde_math_isnanf)
1487
+   SIMDE_VECTORIZE
1488
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1489
+       r_.u32[i] = (simde_math_isnanf(a_.f32[i]) ||
1490
+                simde_math_isnanf(b_.f32[i]))
1491
+                   ? ~UINT32_C(0)
1492
+                   : UINT32_C(0);
1493
+   }
1494
+#else
1495
+   HEDLEY_UNREACHABLE();
1496
+#endif
1497
+
1498
+   return simde__m128_from_private(r_);
1499
+#endif
1500
+}
1501
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1502
+#define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b))
1503
+#endif
1504
+
1505
+SIMDE_FUNCTION_ATTRIBUTES
1506
+simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
1507
+{
1508
+#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
1509
+   return _mm_cmpunord_ss(a, b);
1510
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1511
+   return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b));
1512
+#else
1513
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
1514
+               b_ = simde__m128_to_private(b);
1515
+
1516
+#if defined(simde_math_isnanf)
1517
+   r_.u32[0] =
1518
+       (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0]))
1519
+           ? ~UINT32_C(0)
1520
+           : UINT32_C(0);
1521
+   SIMDE_VECTORIZE
1522
+   for (size_t i = 1; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
1523
+       r_.u32[i] = a_.u32[i];
1524
+   }
1525
+#else
1526
+   HEDLEY_UNREACHABLE();
1527
+#endif
1528
+
1529
+   return simde__m128_from_private(r_);
1530
+#endif
1531
+}
1532
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1533
+#define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b))
1534
+#endif
1535
+
1536
+SIMDE_FUNCTION_ATTRIBUTES
1537
+int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
1538
+{
1539
+#if defined(SIMDE_X86_SSE_NATIVE)
1540
+   return _mm_comieq_ss(a, b);
1541
+#else
1542
+   simde__m128_private a_ = simde__m128_to_private(a),
1543
+               b_ = simde__m128_to_private(b);
1544
+
1545
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1546
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1547
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1548
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1549
+   uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
1550
+   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
1551
+#else
1552
+   return a_.f32[0] == b_.f32[0];
1553
+#endif
1554
+#endif
1555
+}
1556
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1557
+#define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b))
1558
+#endif
1559
+
1560
+SIMDE_FUNCTION_ATTRIBUTES
1561
+int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
1562
+{
1563
+#if defined(SIMDE_X86_SSE_NATIVE)
1564
+   return _mm_comige_ss(a, b);
1565
+#else
1566
+   simde__m128_private a_ = simde__m128_to_private(a),
1567
+               b_ = simde__m128_to_private(b);
1568
+
1569
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1570
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1571
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1572
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1573
+   uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
1574
+   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
1575
+#else
1576
+   return a_.f32[0] >= b_.f32[0];
1577
+#endif
1578
+#endif
1579
+}
1580
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1581
+#define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b))
1582
+#endif
1583
+
1584
+SIMDE_FUNCTION_ATTRIBUTES
1585
+int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
1586
+{
1587
+#if defined(SIMDE_X86_SSE_NATIVE)
1588
+   return _mm_comigt_ss(a, b);
1589
+#else
1590
+   simde__m128_private a_ = simde__m128_to_private(a),
1591
+               b_ = simde__m128_to_private(b);
1592
+
1593
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1594
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1595
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1596
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1597
+   uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
1598
+   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
1599
+#else
1600
+   return a_.f32[0] > b_.f32[0];
1601
+#endif
1602
+#endif
1603
+}
1604
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1605
+#define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b))
1606
+#endif
1607
+
1608
+SIMDE_FUNCTION_ATTRIBUTES
1609
+int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
1610
+{
1611
+#if defined(SIMDE_X86_SSE_NATIVE)
1612
+   return _mm_comile_ss(a, b);
1613
+#else
1614
+   simde__m128_private a_ = simde__m128_to_private(a),
1615
+               b_ = simde__m128_to_private(b);
1616
+
1617
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1618
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1619
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1620
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1621
+   uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
1622
+   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
1623
+#else
1624
+   return a_.f32[0] <= b_.f32[0];
1625
+#endif
1626
+#endif
1627
+}
1628
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1629
+#define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b))
1630
+#endif
1631
+
1632
+SIMDE_FUNCTION_ATTRIBUTES
1633
+int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
1634
+{
1635
+#if defined(SIMDE_X86_SSE_NATIVE)
1636
+   return _mm_comilt_ss(a, b);
1637
+#else
1638
+   simde__m128_private a_ = simde__m128_to_private(a),
1639
+               b_ = simde__m128_to_private(b);
1640
+
1641
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1642
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1643
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1644
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
1645
+   uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
1646
+   return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
1647
+#else
1648
+   return a_.f32[0] < b_.f32[0];
1649
+#endif
1650
+#endif
1651
+}
1652
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1653
+#define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b))
1654
+#endif
1655
+
1656
+SIMDE_FUNCTION_ATTRIBUTES
1657
+int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
1658
+{
1659
+#if defined(SIMDE_X86_SSE_NATIVE)
1660
+   return _mm_comineq_ss(a, b);
1661
+#else
1662
+   simde__m128_private a_ = simde__m128_to_private(a),
1663
+               b_ = simde__m128_to_private(b);
1664
+
1665
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1666
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
1667
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
1668
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
1669
+   uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
1670
+   return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
1671
+#else
1672
+   return a_.f32[0] != b_.f32[0];
1673
+#endif
1674
+#endif
1675
+}
1676
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1677
+#define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b))
1678
+#endif
1679
+
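Unlike the cmp* family, the comi* helpers above return a plain int computed from lane 0 only, so callers use them directly in control flow rather than as masks. A trivial scalar sketch of that calling pattern (the names below are illustrative stand-ins, not SIMDe API):

#include <stdio.h>

/* Stand-in for a comi*-style helper: compares one lane, returns 0 or 1. */
static int comile(float a0, float b0)
{
    return a0 <= b0;
}

int main(void)
{
    float a0 = 1.5f, b0 = 2.0f;

    if (comile(a0, b0)) /* used like simde_mm_comile_ss(a, b) */
        puts("lane 0 of a is <= lane 0 of b");
    return 0;
}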
1680
+SIMDE_FUNCTION_ATTRIBUTES
1681
+simde__m128 simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src)
1682
+{
1683
+   simde__m128_private r_, dest_ = simde__m128_to_private(dest),
1684
+               src_ = simde__m128_to_private(src);
1685
+
1686
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1687
+   const uint32x4_t sign_pos =
1688
+       vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0)));
1689
+   r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32);
1690
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1691
+   const v128_t sign_pos = wasm_f32x4_splat(-0.0f);
1692
+   r_.wasm_v128 =
1693
+       wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos);
1694
+#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
1695
+#if !defined(HEDLEY_IBM_VERSION)
1696
+   r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32);
1697
+#else
1698
+   r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32);
1699
+#endif
1700
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1701
+   const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int)
1702
+       sign_pos = HEDLEY_REINTERPRET_CAST(
1703
+           SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
1704
+           vec_splats(-0.0f));
1705
+   r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos);
1706
+#elif defined(SIMDE_IEEE754_STORAGE)
1707
+   (void)src_;
1708
+   (void)dest_;
1709
+   simde__m128 sign_pos = simde_mm_set1_ps(-0.0f);
1710
+   r_ = simde__m128_to_private(simde_mm_xor_ps(
1711
+       dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos)));
1712
+#else
1713
+   SIMDE_VECTORIZE
1714
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1715
+       r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]);
1716
+   }
1717
+#endif
1718
+
1719
+   return simde__m128_from_private(r_);
1720
+}
1721
+
1722
+SIMDE_FUNCTION_ATTRIBUTES
1723
+simde__m128 simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src)
1724
+{
1725
+   return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src),
1726
+                  dest);
1727
+}
1728
+
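The IEEE754 path above isolates the sign with -0.0f (only the sign bit set) and splices it in with XOR/AND, and x_mm_xorsign flips dest's sign wherever src is negative. A scalar bit-level sketch of copysign under the same assumption of 32-bit IEEE-754 floats (illustrative only):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* copysign(dest, src) by splicing src's sign bit onto dest's magnitude. */
static float copysign_bits(float dest, float src)
{
    uint32_t d, s;
    memcpy(&d, &dest, sizeof(d));
    memcpy(&s, &src, sizeof(s));
    d = (d & 0x7fffffffu) | (s & 0x80000000u); /* 0x80000000 is the bit pattern of -0.0f */
    memcpy(&dest, &d, sizeof(d));
    return dest;
}

int main(void)
{
    printf("%f\n", copysign_bits(3.0f, -1.0f)); /* -3.000000 */
    printf("%f\n", copysign_bits(-3.0f, 1.0f)); /*  3.000000 */
    return 0;
}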
1729
+SIMDE_FUNCTION_ATTRIBUTES
1730
+simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
1731
+{
1732
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1733
+   return _mm_cvt_pi2ps(a, b);
1734
+#else
1735
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
1736
+   simde__m64_private b_ = simde__m64_to_private(b);
1737
+
1738
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1739
+   r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1740
+                  vget_high_f32(a_.neon_f32));
1741
+#elif defined(SIMDE_CONVERT_VECTOR_)
1742
+   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1743
+   r_.m64_private[1] = a_.m64_private[1];
1744
+#else
1745
+   r_.f32[0] = (simde_float32)b_.i32[0];
1746
+   r_.f32[1] = (simde_float32)b_.i32[1];
1747
+   r_.i32[2] = a_.i32[2];
1748
+   r_.i32[3] = a_.i32[3];
1749
+#endif
1750
+
1751
+   return simde__m128_from_private(r_);
1752
+#endif
1753
+}
1754
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1755
+#define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b))
1756
+#endif
1757
+
1758
+SIMDE_FUNCTION_ATTRIBUTES
1759
+simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
1760
+{
1761
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1762
+   return _mm_cvt_ps2pi(a);
1763
+#else
1764
+   simde__m64_private r_;
1765
+   simde__m128_private a_;
1766
+
1767
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1768
+   a_ = simde__m128_to_private(
1769
+       simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1770
+   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
1771
+#elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128)
1772
+   a_ = simde__m128_to_private(
1773
+       simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1774
+   SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32);
1775
+#else
1776
+   a_ = simde__m128_to_private(a);
1777
+
1778
+   SIMDE_VECTORIZE
1779
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1780
+       r_.i32[i] = HEDLEY_STATIC_CAST(
1781
+           int32_t, simde_math_nearbyintf(a_.f32[i]));
1782
+   }
1783
+#endif
1784
+
1785
+   return simde__m64_from_private(r_);
1786
+#endif
1787
+}
1788
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1789
+#define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a))
1790
+#endif
1791
+
1792
+SIMDE_FUNCTION_ATTRIBUTES
1793
+simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
1794
+{
1795
+#if defined(SIMDE_X86_SSE_NATIVE)
1796
+   return _mm_cvt_si2ss(a, b);
1797
+#else
1798
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
1799
+
1800
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1801
+   r_.neon_f32 =
1802
+       vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0);
1803
+#else
1804
+   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
1805
+   r_.i32[1] = a_.i32[1];
1806
+   r_.i32[2] = a_.i32[2];
1807
+   r_.i32[3] = a_.i32[3];
1808
+#endif
1809
+
1810
+   return simde__m128_from_private(r_);
1811
+#endif
1812
+}
1813
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1814
+#define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b)
1815
+#endif
1816
+
1817
+SIMDE_FUNCTION_ATTRIBUTES
1818
+int32_t simde_mm_cvt_ss2si(simde__m128 a)
1819
+{
1820
+#if defined(SIMDE_X86_SSE_NATIVE)
1821
+   return _mm_cvt_ss2si(a);
1822
+#elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
1823
+   return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0);
1824
+#else
1825
+   simde__m128_private a_ = simde__m128_to_private(
1826
+       simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION));
1827
+   return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]);
1828
+#endif
1829
+}
1830
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1831
+#define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a))
1832
+#endif
1833
+
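Note that the cvt* conversions above round according to the current rounding mode (nearbyintf in the portable fallback), while the cvtt* variants later in this diff truncate toward zero. A scalar sketch of the difference (plain C; link with -lm if needed):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    float v = 2.7f;

    /* cvt-style: round using the current rounding mode (round-to-nearest by default). */
    int32_t rounded = (int32_t)nearbyintf(v);   /* 3 */

    /* cvtt-style: a plain C cast truncates toward zero. */
    int32_t truncated = (int32_t)v;             /* 2 */

    printf("%d %d\n", (int)rounded, (int)truncated);
    return 0;
}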
1834
+SIMDE_FUNCTION_ATTRIBUTES
1835
+simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
1836
+{
1837
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1838
+   return _mm_cvtpi16_ps(a);
1839
+#else
1840
+   simde__m128_private r_;
1841
+   simde__m64_private a_ = simde__m64_to_private(a);
1842
+
1843
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1844
+   r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16));
1845
+#elif defined(SIMDE_CONVERT_VECTOR_)
1846
+   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16);
1847
+#else
1848
+   SIMDE_VECTORIZE
1849
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
1850
+       simde_float32 v = a_.i16[i];
1851
+       r_.f32[i] = v;
1852
+   }
1853
+#endif
1854
+
1855
+   return simde__m128_from_private(r_);
1856
+#endif
1857
+}
1858
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1859
+#define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a)
1860
+#endif
1861
+
1862
+SIMDE_FUNCTION_ATTRIBUTES
1863
+simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
1864
+{
1865
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1866
+   return _mm_cvtpi32_ps(a, b);
1867
+#else
1868
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
1869
+   simde__m64_private b_ = simde__m64_to_private(b);
1870
+
1871
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1872
+   r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32),
1873
+                  vget_high_f32(a_.neon_f32));
1874
+#elif defined(SIMDE_CONVERT_VECTOR_)
1875
+   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32);
1876
+   r_.m64_private[1] = a_.m64_private[1];
1877
+#else
1878
+   r_.f32[0] = (simde_float32)b_.i32[0];
1879
+   r_.f32[1] = (simde_float32)b_.i32[1];
1880
+   r_.i32[2] = a_.i32[2];
1881
+   r_.i32[3] = a_.i32[3];
1882
+#endif
1883
+
1884
+   return simde__m128_from_private(r_);
1885
+#endif
1886
+}
1887
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1888
+#define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b)
1889
+#endif
1890
+
1891
+SIMDE_FUNCTION_ATTRIBUTES
1892
+simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
1893
+{
1894
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1895
+   return _mm_cvtpi32x2_ps(a, b);
1896
+#else
1897
+   simde__m128_private r_;
1898
+   simde__m64_private a_ = simde__m64_to_private(a),
1899
+              b_ = simde__m64_to_private(b);
1900
+
1901
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1902
+   r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32));
1903
+#elif defined(SIMDE_CONVERT_VECTOR_)
1904
+   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32);
1905
+   SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32);
1906
+#else
1907
+   r_.f32[0] = (simde_float32)a_.i32[0];
1908
+   r_.f32[1] = (simde_float32)a_.i32[1];
1909
+   r_.f32[2] = (simde_float32)b_.i32[0];
1910
+   r_.f32[3] = (simde_float32)b_.i32[1];
1911
+#endif
1912
+
1913
+   return simde__m128_from_private(r_);
1914
+#endif
1915
+}
1916
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1917
+#define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b)
1918
+#endif
1919
+
1920
+SIMDE_FUNCTION_ATTRIBUTES
1921
+simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
1922
+{
1923
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1924
+   return _mm_cvtpi8_ps(a);
1925
+#else
1926
+   simde__m128_private r_;
1927
+   simde__m64_private a_ = simde__m64_to_private(a);
1928
+
1929
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1930
+   r_.neon_f32 =
1931
+       vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8))));
1932
+#else
1933
+   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]);
1934
+   r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]);
1935
+   r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]);
1936
+   r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]);
1937
+#endif
1938
+
1939
+   return simde__m128_from_private(r_);
1940
+#endif
1941
+}
1942
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1943
+#define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a)
1944
+#endif
1945
+
1946
+SIMDE_FUNCTION_ATTRIBUTES
1947
+simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
1948
+{
1949
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1950
+   return _mm_cvtps_pi16(a);
1951
+#else
1952
+   simde__m64_private r_;
1953
+   simde__m128_private a_ = simde__m128_to_private(a);
1954
+
1955
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
1956
+   r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32)));
1957
+#else
1958
+   SIMDE_VECTORIZE
1959
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1960
+       r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t,
1961
+                          simde_math_roundf(a_.f32[i]));
1962
+   }
1963
+#endif
1964
+
1965
+   return simde__m64_from_private(r_);
1966
+#endif
1967
+}
1968
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
1969
+#define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a))
1970
+#endif
1971
+
1972
+SIMDE_FUNCTION_ATTRIBUTES
1973
+simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
1974
+{
1975
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
1976
+   return _mm_cvtps_pi32(a);
1977
+#else
1978
+   simde__m64_private r_;
1979
+   simde__m128_private a_ = simde__m128_to_private(a);
1980
+
1981
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
1982
+   defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399)
1983
+   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32)));
1984
+#else
1985
+   SIMDE_VECTORIZE
1986
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1987
+       simde_float32 v = simde_math_roundf(a_.f32[i]);
1988
+#if !defined(SIMDE_FAST_CONVERSION_RANGE)
1989
+       r_.i32[i] =
1990
+           ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
1991
+            (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
1992
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
1993
+               : INT32_MIN;
1994
+#else
1995
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
1996
+#endif
1997
+   }
1998
+#endif
1999
+
2000
+   return simde__m64_from_private(r_);
2001
+#endif
2002
+}
2003
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2004
+#define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a))
2005
+#endif
2006
+
2007
+SIMDE_FUNCTION_ATTRIBUTES
2008
+simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
2009
+{
2010
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2011
+   return _mm_cvtps_pi8(a);
2012
+#else
2013
+   simde__m64_private r_;
2014
+   simde__m128_private a_ = simde__m128_to_private(a);
2015
+
2016
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471)
2017
+   /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to
2018
+      * i16, combine with an all-zero vector of i16 (which will become the upper
2019
+      * half), narrow to i8. */
2020
+   float32x4_t max =
2021
+       vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX));
2022
+   float32x4_t min =
2023
+       vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN));
2024
+   float32x4_t values =
2025
+       vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min));
2026
+   r_.neon_i8 = vmovn_s16(
2027
+       vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0)));
2028
+#else
2029
+   SIMDE_VECTORIZE
2030
+   for (size_t i = 0; i < (sizeof(a_.f32) / sizeof(a_.f32[0])); i++) {
2031
+       if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX))
2032
+           r_.i8[i] = INT8_MAX;
2033
+       else if (a_.f32[i] <
2034
+            HEDLEY_STATIC_CAST(simde_float32, INT8_MIN))
2035
+           r_.i8[i] = INT8_MIN;
2036
+       else
2037
+           r_.i8[i] = SIMDE_CONVERT_FTOI(
2038
+               int8_t, simde_math_roundf(a_.f32[i]));
2039
+   }
2040
+   /* Note: the upper half is undefined */
2041
+#endif
2042
+
2043
+   return simde__m64_from_private(r_);
2044
+#endif
2045
+}
2046
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2047
+#define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a))
2048
+#endif
2049
+
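The portable loop above clamps to [INT8_MIN, INT8_MAX] before rounding, i.e. a saturating float-to-int8 conversion, and only the low four lanes of the result are defined. A scalar sketch of that clamp-then-round step (illustrative; link with -lm if needed):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Saturating conversion of one float lane to int8_t, rounding to nearest. */
static int8_t sat_to_i8(float v)
{
    if (v > (float)INT8_MAX)
        return INT8_MAX;
    if (v < (float)INT8_MIN)
        return INT8_MIN;
    return (int8_t)roundf(v);
}

int main(void)
{
    printf("%d %d %d\n", (int)sat_to_i8(300.0f), (int)sat_to_i8(-300.0f),
           (int)sat_to_i8(1.6f)); /* 127 -128 2 */
    return 0;
}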
2050
+SIMDE_FUNCTION_ATTRIBUTES
2051
+simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
2052
+{
2053
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2054
+   return _mm_cvtpu16_ps(a);
2055
+#else
2056
+   simde__m128_private r_;
2057
+   simde__m64_private a_ = simde__m64_to_private(a);
2058
+
2059
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2060
+   r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16));
2061
+#elif defined(SIMDE_CONVERT_VECTOR_)
2062
+   SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16);
2063
+#else
2064
+   SIMDE_VECTORIZE
2065
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2066
+       r_.f32[i] = (simde_float32)a_.u16[i];
2067
+   }
2068
+#endif
2069
+
2070
+   return simde__m128_from_private(r_);
2071
+#endif
2072
+}
2073
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2074
+#define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a)
2075
+#endif
2076
+
2077
+SIMDE_FUNCTION_ATTRIBUTES
2078
+simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
2079
+{
2080
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2081
+   return _mm_cvtpu8_ps(a);
2082
+#else
2083
+   simde__m128_private r_;
2084
+   simde__m64_private a_ = simde__m64_to_private(a);
2085
+
2086
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2087
+   r_.neon_f32 =
2088
+       vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8))));
2089
+#else
2090
+   SIMDE_VECTORIZE
2091
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2092
+       r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]);
2093
+   }
2094
+#endif
2095
+
2096
+   return simde__m128_from_private(r_);
2097
+#endif
2098
+}
2099
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2100
+#define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a)
2101
+#endif
2102
+
2103
+SIMDE_FUNCTION_ATTRIBUTES
2104
+simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
2105
+{
2106
+#if defined(SIMDE_X86_SSE_NATIVE)
2107
+   return _mm_cvtsi32_ss(a, b);
2108
+#else
2109
+   simde__m128_private r_;
2110
+   simde__m128_private a_ = simde__m128_to_private(a);
2111
+
2112
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2113
+   r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
2114
+                    a_.neon_f32, 0);
2115
+#else
2116
+   r_ = a_;
2117
+   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
2118
+#endif
2119
+
2120
+   return simde__m128_from_private(r_);
2121
+#endif
2122
+}
2123
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2124
+#define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b)
2125
+#endif
2126
+
2127
+SIMDE_FUNCTION_ATTRIBUTES
2128
+simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
2129
+{
2130
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
2131
+#if !defined(__PGI)
2132
+   return _mm_cvtsi64_ss(a, b);
2133
+#else
2134
+   return _mm_cvtsi64x_ss(a, b);
2135
+#endif
2136
+#else
2137
+   simde__m128_private r_;
2138
+   simde__m128_private a_ = simde__m128_to_private(a);
2139
+
2140
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2141
+   r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b),
2142
+                    a_.neon_f32, 0);
2143
+#else
2144
+   r_ = a_;
2145
+   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b);
2146
+#endif
2147
+
2148
+   return simde__m128_from_private(r_);
2149
+#endif
2150
+}
2151
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2152
+#define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b)
2153
+#endif
2154
+
2155
+SIMDE_FUNCTION_ATTRIBUTES
2156
+simde_float32 simde_mm_cvtss_f32(simde__m128 a)
2157
+{
2158
+#if defined(SIMDE_X86_SSE_NATIVE)
2159
+   return _mm_cvtss_f32(a);
2160
+#else
2161
+   simde__m128_private a_ = simde__m128_to_private(a);
2162
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2163
+   return vgetq_lane_f32(a_.neon_f32, 0);
2164
+#else
2165
+   return a_.f32[0];
2166
+#endif
2167
+#endif
2168
+}
2169
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2170
+#define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a))
2171
+#endif
2172
+
2173
+SIMDE_FUNCTION_ATTRIBUTES
2174
+int32_t simde_mm_cvtss_si32(simde__m128 a)
2175
+{
2176
+   return simde_mm_cvt_ss2si(a);
2177
+}
2178
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2179
+#define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a))
2180
+#endif
2181
+
2182
+SIMDE_FUNCTION_ATTRIBUTES
2183
+int64_t simde_mm_cvtss_si64(simde__m128 a)
2184
+{
2185
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
2186
+#if !defined(__PGI)
2187
+   return _mm_cvtss_si64(a);
2188
+#else
2189
+   return _mm_cvtss_si64x(a);
2190
+#endif
2191
+#else
2192
+   simde__m128_private a_ = simde__m128_to_private(a);
2193
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2194
+   return SIMDE_CONVERT_FTOI(
2195
+       int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0)));
2196
+#else
2197
+   return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0]));
2198
+#endif
2199
+#endif
2200
+}
2201
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2202
+#define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a))
2203
+#endif
2204
+
2205
+SIMDE_FUNCTION_ATTRIBUTES
2206
+simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
2207
+{
2208
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2209
+   return _mm_cvtt_ps2pi(a);
2210
+#else
2211
+   simde__m64_private r_;
2212
+   simde__m128_private a_ = simde__m128_to_private(a);
2213
+
2214
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2215
+   r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32));
2216
+#else
2217
+   SIMDE_VECTORIZE
2218
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2219
+       simde_float32 v = a_.f32[i];
2220
+#if !defined(SIMDE_FAST_CONVERSION_RANGE)
2221
+       r_.i32[i] =
2222
+           ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
2223
+            (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
2224
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
2225
+               : INT32_MIN;
2226
+#else
2227
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2228
+#endif
2229
+   }
2230
+#endif
2231
+
2232
+   return simde__m64_from_private(r_);
2233
+#endif
2234
+}
2235
+#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a)
2236
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2237
+#define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a))
2238
+#define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a))
2239
+#endif
2240
+
2241
+SIMDE_FUNCTION_ATTRIBUTES
2242
+int32_t simde_mm_cvtt_ss2si(simde__m128 a)
2243
+{
2244
+#if defined(SIMDE_X86_SSE_NATIVE)
2245
+   return _mm_cvtt_ss2si(a);
2246
+#else
2247
+   simde__m128_private a_ = simde__m128_to_private(a);
2248
+
2249
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2250
+   return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0));
2251
+#else
2252
+   simde_float32 v = a_.f32[0];
2253
+#if !defined(SIMDE_FAST_CONVERSION_RANGE)
2254
+   return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
2255
+       (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
2256
+              ? SIMDE_CONVERT_FTOI(int32_t, v)
2257
+              : INT32_MIN;
2258
+#else
2259
+   return SIMDE_CONVERT_FTOI(int32_t, v);
2260
+#endif
2261
+#endif
2262
+#endif
2263
+}
2264
+#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
2265
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2266
+#define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a))
2267
+#define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a))
2268
+#endif
2269
+
2270
+SIMDE_FUNCTION_ATTRIBUTES
2271
+int64_t simde_mm_cvttss_si64(simde__m128 a)
2272
+{
2273
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
2274
+   !defined(_MSC_VER)
2275
+#if defined(__PGI)
2276
+   return _mm_cvttss_si64x(a);
2277
+#else
2278
+   return _mm_cvttss_si64(a);
2279
+#endif
2280
+#else
2281
+   simde__m128_private a_ = simde__m128_to_private(a);
2282
+
2283
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2284
+   return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0));
2285
+#else
2286
+   return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]);
2287
+#endif
2288
+#endif
2289
+}
2290
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2291
+#define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a))
2292
+#endif
2293
+
2294
+SIMDE_FUNCTION_ATTRIBUTES
2295
+simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
2296
+{
2297
+#if defined(SIMDE_X86_SSE_NATIVE)
2298
+   return _mm_cmpord_ss(a, b);
2299
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2300
+   return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b));
2301
+#else
2302
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
2303
+
2304
+#if defined(simde_math_isnanf)
2305
+   r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) ||
2306
+            simde_math_isnanf(simde_mm_cvtss_f32(b)))
2307
+               ? UINT32_C(0)
2308
+               : ~UINT32_C(0);
2309
+   SIMDE_VECTORIZE
2310
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2311
+       r_.u32[i] = a_.u32[i];
2312
+   }
2313
+#else
2314
+   HEDLEY_UNREACHABLE();
2315
+#endif
2316
+
2317
+   return simde__m128_from_private(r_);
2318
+#endif
2319
+}
2320
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2321
+#define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b))
2322
+#endif
2323
+
2324
+SIMDE_FUNCTION_ATTRIBUTES
2325
+simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
2326
+{
2327
+#if defined(SIMDE_X86_SSE_NATIVE)
2328
+   return _mm_div_ps(a, b);
2329
+#else
2330
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2331
+               b_ = simde__m128_to_private(b);
2332
+
2333
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2334
+   r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32);
2335
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2336
+   float32x4_t recip0 = vrecpeq_f32(b_.neon_f32);
2337
+   float32x4_t recip1 =
2338
+       vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32));
2339
+   r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1);
2340
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2341
+   r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128);
2342
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
2343
+   r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32);
2344
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2345
+   r_.f32 = a_.f32 / b_.f32;
2346
+#else
2347
+   SIMDE_VECTORIZE
2348
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2349
+       r_.f32[i] = a_.f32[i] / b_.f32[i];
2350
+   }
2351
+#endif
2352
+
2353
+   return simde__m128_from_private(r_);
2354
+#endif
2355
+}
2356
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2357
+#define _mm_div_ps(a, b) simde_mm_div_ps((a), (b))
2358
+#endif
2359
+
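On ARMv7 NEON there is no vector divide, so the path above starts from the reciprocal estimate returned by vrecpeq_f32 and applies one Newton-Raphson step (vrecpsq_f32 computes 2 - b*r), then multiplies by a. A scalar sketch of that refinement, using a hand-picked initial estimate purely for illustration:

#include <stdio.h>

/* One Newton-Raphson step for the reciprocal of b:
 *   r1 = r0 * (2 - b * r0)
 * which is what vmulq_f32(recip0, vrecpsq_f32(recip0, b)) computes per lane. */
static float refine_recip(float r0, float b)
{
    return r0 * (2.0f - b * r0);
}

int main(void)
{
    float b = 3.0f;
    float r = 0.3f;         /* stand-in for the hardware estimate */

    r = refine_recip(r, b); /* each step roughly doubles the correct digits */
    r = refine_recip(r, b);

    printf("approx 1/3 = %.7f, a/b via a*r = %.7f\n", r, 2.0f * r);
    return 0;
}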
2360
+SIMDE_FUNCTION_ATTRIBUTES
2361
+simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
2362
+{
2363
+#if defined(SIMDE_X86_SSE_NATIVE)
2364
+   return _mm_div_ss(a, b);
2365
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2366
+   return simde_mm_move_ss(a, simde_mm_div_ps(a, b));
2367
+#else
2368
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2369
+               b_ = simde__m128_to_private(b);
2370
+
2371
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2372
+   float32_t value = vgetq_lane_f32(
2373
+       simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0);
2374
+   r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2375
+#else
2376
+   r_.f32[0] = a_.f32[0] / b_.f32[0];
2377
+   SIMDE_VECTORIZE
2378
+   for (size_t i = 1; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2379
+       r_.f32[i] = a_.f32[i];
2380
+   }
2381
+#endif
2382
+
2383
+   return simde__m128_from_private(r_);
2384
+#endif
2385
+}
2386
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2387
+#define _mm_div_ss(a, b) simde_mm_div_ss((a), (b))
2388
+#endif
2389
+
2390
+SIMDE_FUNCTION_ATTRIBUTES
2391
+int16_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
2392
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
2393
+{
2394
+   simde__m64_private a_ = simde__m64_to_private(a);
2395
+   return a_.i16[imm8];
2396
+}
2397
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
2398
+   !defined(HEDLEY_PGI_VERSION)
2399
+#if defined(SIMDE_BUG_CLANG_44589)
2400
+#define simde_mm_extract_pi16(a, imm8)                                      \
2401
+   (HEDLEY_DIAGNOSTIC_PUSH _Pragma(                                    \
2402
+       "clang diagnostic ignored \"-Wvector-conversion\"")         \
2403
+        HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16((a), (imm8))) \
2404
+            HEDLEY_DIAGNOSTIC_POP)
2405
+#else
2406
+#define simde_mm_extract_pi16(a, imm8) \
2407
+   HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8))
2408
+#endif
2409
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2410
+#define simde_mm_extract_pi16(a, imm8) \
2411
+   vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8)
2412
+#endif
2413
+#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
2414
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2415
+#define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8))
2416
+#define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8))
2417
+#endif
2418
+
2419
+SIMDE_FUNCTION_ATTRIBUTES
2420
+simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
2421
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
2422
+{
2423
+   simde__m64_private r_, a_ = simde__m64_to_private(a);
2424
+
2425
+   r_.i64[0] = a_.i64[0];
2426
+   r_.i16[imm8] = i;
2427
+
2428
+   return simde__m64_from_private(r_);
2429
+}
2430
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
2431
+   !defined(__PGI)
2432
+#if defined(SIMDE_BUG_CLANG_44589)
2433
+#define simde_mm_insert_pi16(a, i, imm8)                             \
2434
+   (HEDLEY_DIAGNOSTIC_PUSH _Pragma(                             \
2435
+       "clang diagnostic ignored \"-Wvector-conversion\"")( \
2436
+       _mm_insert_pi16((a), (i), (imm8))) HEDLEY_DIAGNOSTIC_POP)
2437
+#else
2438
+#define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8)
2439
+#endif
2440
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2441
+#define simde_mm_insert_pi16(a, i, imm8) \
2442
+   simde__m64_from_neon_i16(        \
2443
+       vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8)))
2444
+#endif
2445
+#define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8))
2446
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2447
+#define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
2448
+#define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8)
2449
+#endif
2450
+
2451
+SIMDE_FUNCTION_ATTRIBUTES
2452
+simde__m128
2453
+simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2454
+{
2455
+#if defined(SIMDE_X86_SSE_NATIVE)
2456
+   return _mm_load_ps(mem_addr);
2457
+#else
2458
+   simde__m128_private r_;
2459
+
2460
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2461
+   r_.neon_f32 = vld1q_f32(mem_addr);
2462
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
2463
+   r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
2464
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2465
+   r_.altivec_f32 = vec_ld(0, mem_addr);
2466
+#else
2467
+   simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128),
2468
+            sizeof(r_));
2469
+#endif
2470
+
2471
+   return simde__m128_from_private(r_);
2472
+#endif
2473
+}
2474
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2475
+#define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr)
2476
+#endif
2477
+
2478
+SIMDE_FUNCTION_ATTRIBUTES
2479
+simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
2480
+{
2481
+#if defined(SIMDE_X86_SSE_NATIVE)
2482
+   return _mm_load_ps1(mem_addr);
2483
+#else
2484
+   simde__m128_private r_;
2485
+
2486
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2487
+   r_.neon_f32 = vld1q_dup_f32(mem_addr);
2488
+#else
2489
+   r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr));
2490
+#endif
2491
+
2492
+   return simde__m128_from_private(r_);
2493
+#endif
2494
+}
2495
+#define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
2496
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2497
+#define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr)
2498
+#define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr)
2499
+#endif
2500
+
2501
+SIMDE_FUNCTION_ATTRIBUTES
2502
+simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
2503
+{
2504
+#if defined(SIMDE_X86_SSE_NATIVE)
2505
+   return _mm_load_ss(mem_addr);
2506
+#else
2507
+   simde__m128_private r_;
2508
+
2509
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2510
+   r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
2511
+#else
2512
+   r_.f32[0] = *mem_addr;
2513
+   r_.i32[1] = 0;
2514
+   r_.i32[2] = 0;
2515
+   r_.i32[3] = 0;
2516
+#endif
2517
+
2518
+   return simde__m128_from_private(r_);
2519
+#endif
2520
+}
2521
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2522
+#define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr)
2523
+#endif
2524
+
2525
+SIMDE_FUNCTION_ATTRIBUTES
2526
+simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
2527
+{
2528
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2529
+   return _mm_loadh_pi(a,
2530
+               HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
2531
+#else
2532
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
2533
+
2534
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2535
+   r_.neon_f32 = vcombine_f32(
2536
+       vget_low_f32(a_.neon_f32),
2537
+       vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)));
2538
+#else
2539
+   simde__m64_private b_ =
2540
+       *HEDLEY_REINTERPRET_CAST(simde__m64_private const *, mem_addr);
2541
+   r_.f32[0] = a_.f32[0];
2542
+   r_.f32[1] = a_.f32[1];
2543
+   r_.f32[2] = b_.f32[0];
2544
+   r_.f32[3] = b_.f32[1];
2545
+#endif
2546
+
2547
+   return simde__m128_from_private(r_);
2548
+#endif
2549
+}
2550
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2551
+#if HEDLEY_HAS_WARNING("-Wold-style-cast")
2552
+#define _mm_loadh_pi(a, mem_addr)                                          \
2553
+   simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
2554
+                              (mem_addr)))
2555
+#else
2556
+#define _mm_loadh_pi(a, mem_addr) \
2557
+   simde_mm_loadh_pi((a), (simde__m64 const *)(mem_addr))
2558
+#endif
2559
+#endif
2560
+
2561
+/* The SSE documentation says that there are no alignment requirements
2562
+   for mem_addr.  Unfortunately they used the __m64 type for the argument
2563
+   which is supposed to be 8-byte aligned, so some compilers (like clang
2564
+   with -Wcast-align) will generate a warning if you try to cast, say,
2565
+   a simde_float32* to a simde__m64* for this function.
2566
+
2567
+   I think the choice of argument type is unfortunate, but I do think we
2568
+   need to stick to it here.  If there is demand I can always add something
2569
+   like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */
2570
+SIMDE_FUNCTION_ATTRIBUTES
2571
+simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
2572
+{
2573
+#if defined(SIMDE_X86_SSE_NATIVE)
2574
+   return _mm_loadl_pi(a,
2575
+               HEDLEY_REINTERPRET_CAST(__m64 const *, mem_addr));
2576
+#else
2577
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
2578
+
2579
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2580
+   r_.neon_f32 = vcombine_f32(
2581
+       vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr)),
2582
+       vget_high_f32(a_.neon_f32));
2583
+#else
2584
+   simde__m64_private b_;
2585
+   simde_memcpy(&b_, mem_addr, sizeof(b_));
2586
+   r_.i32[0] = b_.i32[0];
2587
+   r_.i32[1] = b_.i32[1];
2588
+   r_.i32[2] = a_.i32[2];
2589
+   r_.i32[3] = a_.i32[3];
2590
+#endif
2591
+
2592
+   return simde__m128_from_private(r_);
2593
+#endif
2594
+}
2595
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2596
+#if HEDLEY_HAS_WARNING("-Wold-style-cast")
2597
+#define _mm_loadl_pi(a, mem_addr)                                          \
2598
+   simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const *, \
2599
+                              (mem_addr)))
2600
+#else
2601
+#define _mm_loadl_pi(a, mem_addr) \
2602
+   simde_mm_loadl_pi((a), (simde__m64 const *)(mem_addr))
2603
+#endif
2604
+#endif
2605
+
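The comment above is aimed at callers of these load helpers. As a minimal usage sketch (assuming <simde/x86/sse.h> is already included; the helper name load_low_pair is illustrative and not part of SIMDe), the -Wcast-align warning can be avoided by copying the two floats into a properly aligned simde__m64 instead of casting a simde_float32 pointer:

#include <string.h>

/* Illustrative helper, not part of SIMDe: load two floats into the low
   half of v without casting a float pointer to simde__m64 *. */
static simde__m128 load_low_pair(simde__m128 v, const float src[2])
{
    simde__m64 tmp;                 /* naturally 8-byte aligned, no cast needed */
    memcpy(&tmp, src, sizeof(tmp)); /* copies exactly 8 bytes (two floats) */
    return simde_mm_loadl_pi(v, &tmp);
}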
2606
+SIMDE_FUNCTION_ATTRIBUTES
2607
+simde__m128
2608
+simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2609
+{
2610
+#if defined(SIMDE_X86_SSE_NATIVE)
2611
+   return _mm_loadr_ps(mem_addr);
2612
+#else
2613
+   simde__m128_private r_,
2614
+       v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr));
2615
+
2616
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2617
+   r_.neon_f32 = vrev64q_f32(v_.neon_f32);
2618
+   r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2);
2619
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
2620
+   r_.altivec_f32 = vec_reve(v_.altivec_f32);
2621
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2622
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0);
2623
+#else
2624
+   r_.f32[0] = v_.f32[3];
2625
+   r_.f32[1] = v_.f32[2];
2626
+   r_.f32[2] = v_.f32[1];
2627
+   r_.f32[3] = v_.f32[0];
2628
+#endif
2629
+
2630
+   return simde__m128_from_private(r_);
2631
+#endif
2632
+}
2633
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2634
+#define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr)
2635
+#endif
2636
+
2637
+SIMDE_FUNCTION_ATTRIBUTES
2638
+simde__m128
2639
+simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
2640
+{
2641
+#if defined(SIMDE_X86_SSE_NATIVE)
2642
+   return _mm_loadu_ps(mem_addr);
2643
+#else
2644
+   simde__m128_private r_;
2645
+
2646
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2647
+   r_.neon_f32 =
2648
+       vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t *, mem_addr));
2649
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2650
+   r_.wasm_v128 = wasm_v128_load(mem_addr);
2651
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__)
2652
+   r_.altivec_f32 = vec_vsx_ld(0, mem_addr);
2653
+#else
2654
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
2655
+#endif
2656
+
2657
+   return simde__m128_from_private(r_);
2658
+#endif
2659
+}
2660
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2661
+#define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr)
2662
+#endif
2663
+
2664
+SIMDE_FUNCTION_ATTRIBUTES
2665
+void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, int8_t *mem_addr)
2666
+{
2667
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2668
+   _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
2669
+#else
2670
+   simde__m64_private a_ = simde__m64_to_private(a),
2671
+              mask_ = simde__m64_to_private(mask);
2672
+
2673
+   SIMDE_VECTORIZE
2674
+   for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++)
2675
+       if (mask_.i8[i] < 0)
2676
+           mem_addr[i] = a_.i8[i];
2677
+#endif
2678
+}
2679
+#define simde_m_maskmovq(a, mask, mem_addr) \
2680
+   simde_mm_maskmove_si64(a, mask, mem_addr)
2681
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2682
+#define _mm_maskmove_si64(a, mask, mem_addr) \
2683
+   simde_mm_maskmove_si64(              \
2684
+       (a), (mask),                 \
2685
+       SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
2686
+#define _m_maskmovq(a, mask, mem_addr) \
2687
+   simde_mm_maskmove_si64(        \
2688
+       (a), (mask),           \
2689
+       SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
2690
+#endif
2691
+
2692
+SIMDE_FUNCTION_ATTRIBUTES
2693
+simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
2694
+{
2695
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2696
+   return _mm_max_pi16(a, b);
2697
+#else
2698
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
2699
+                  b_ = simde__m64_to_private(b);
2700
+
2701
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2702
+   r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16);
2703
+#else
2704
+   SIMDE_VECTORIZE
2705
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2706
+       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2707
+   }
2708
+#endif
2709
+
2710
+   return simde__m64_from_private(r_);
2711
+#endif
2712
+}
2713
+#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2714
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2715
+#define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b)
2716
+#define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
2717
+#endif
2718
+
2719
+SIMDE_FUNCTION_ATTRIBUTES
2720
+simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
2721
+{
2722
+#if defined(SIMDE_X86_SSE_NATIVE)
2723
+   return _mm_max_ps(a, b);
2724
+#else
2725
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2726
+               b_ = simde__m128_to_private(b);
2727
+
2728
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS)
2729
+   r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32);
2730
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2731
+   r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32),
2732
+               a_.neon_f32, b_.neon_f32);
2733
+#elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS)
2734
+   r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128);
2735
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2736
+   r_.wasm_v128 =
2737
+       wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
2738
+                   wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128));
2739
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_NANS)
2740
+   r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32);
2741
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2742
+   r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
2743
+                vec_cmpgt(a_.altivec_f32, b_.altivec_f32));
2744
+#else
2745
+   SIMDE_VECTORIZE
2746
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2747
+       r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2748
+   }
2749
+#endif
2750
+
2751
+   return simde__m128_from_private(r_);
2752
+#endif
2753
+}
2754
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2755
+#define _mm_max_ps(a, b) simde_mm_max_ps((a), (b))
2756
+#endif
2757
+
2758
+SIMDE_FUNCTION_ATTRIBUTES
2759
+simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
2760
+{
2761
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2762
+   return _mm_max_pu8(a, b);
2763
+#else
2764
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
2765
+                  b_ = simde__m64_to_private(b);
2766
+
2767
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2768
+   r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8);
2769
+#else
2770
+   SIMDE_VECTORIZE
2771
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2772
+       r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2773
+   }
2774
+#endif
2775
+
2776
+   return simde__m64_from_private(r_);
2777
+#endif
2778
+}
2779
+#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2780
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2781
+#define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b)
2782
+#define _m_pmaxub(a, b) simde_mm_max_pu8(a, b)
2783
+#endif
2784
+
2785
+SIMDE_FUNCTION_ATTRIBUTES
2786
+simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
2787
+{
2788
+#if defined(SIMDE_X86_SSE_NATIVE)
2789
+   return _mm_max_ss(a, b);
2790
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2791
+   return simde_mm_move_ss(a, simde_mm_max_ps(a, b));
2792
+#else
2793
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2794
+               b_ = simde__m128_to_private(b);
2795
+
2796
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2797
+   float32_t value = vgetq_lane_f32(vmaxq_f32(a_.neon_f32, b_.neon_f32), 0);
2798
+   r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2799
+#else
2800
+   r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2801
+   r_.f32[1] = a_.f32[1];
2802
+   r_.f32[2] = a_.f32[2];
2803
+   r_.f32[3] = a_.f32[3];
2804
+#endif
2805
+
2806
+   return simde__m128_from_private(r_);
2807
+#endif
2808
+}
2809
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2810
+#define _mm_max_ss(a, b) simde_mm_max_ss((a), (b))
2811
+#endif
2812
+
2813
+SIMDE_FUNCTION_ATTRIBUTES
2814
+simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
2815
+{
2816
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2817
+   return _mm_min_pi16(a, b);
2818
+#else
2819
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
2820
+                  b_ = simde__m64_to_private(b);
2821
+
2822
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2823
+   r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16);
2824
+#else
2825
+   SIMDE_VECTORIZE
2826
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2827
+       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
2828
+   }
2829
+#endif
2830
+
2831
+   return simde__m64_from_private(r_);
2832
+#endif
2833
+}
2834
+#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
2835
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2836
+#define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b)
2837
+#define _m_pminsw(a, b) simde_mm_min_pi16(a, b)
2838
+#endif
2839
+
2840
+SIMDE_FUNCTION_ATTRIBUTES
2841
+simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
2842
+{
2843
+#if defined(SIMDE_X86_SSE_NATIVE)
2844
+   return _mm_min_ps(a, b);
2845
+#elif defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2846
+   return simde__m128_from_neon_f32(vminq_f32(simde__m128_to_neon_f32(a),
2847
+                          simde__m128_to_neon_f32(b)));
2848
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2849
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2850
+               b_ = simde__m128_to_private(b);
2851
+#if defined(SIMDE_FAST_NANS)
2852
+   r_.wasm_v128 = wasm_f32x4_min(a_.wasm_v128, b_.wasm_v128);
2853
+#else
2854
+   r_.wasm_v128 =
2855
+       wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128,
2856
+                   wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128));
2857
+#endif
2858
+   return simde__m128_from_private(r_);
2859
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2860
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2861
+               b_ = simde__m128_to_private(b);
2862
+
2863
+#if defined(SIMDE_FAST_NANS)
2864
+   r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32);
2865
+#else
2866
+   r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32,
2867
+                vec_cmpgt(b_.altivec_f32, a_.altivec_f32));
2868
+#endif
2869
+
2870
+   return simde__m128_from_private(r_);
2871
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2872
+   simde__m128 mask = simde_mm_cmplt_ps(a, b);
2873
+   return simde_mm_or_ps(simde_mm_and_ps(mask, a),
2874
+                 simde_mm_andnot_ps(mask, b));
2875
+#else
2876
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2877
+               b_ = simde__m128_to_private(b);
2878
+
2879
+   SIMDE_VECTORIZE
2880
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2881
+       r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i];
2882
+   }
2883
+
2884
+   return simde__m128_from_private(r_);
2885
+#endif
2886
+}
2887
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2888
+#define _mm_min_ps(a, b) simde_mm_min_ps((a), (b))
2889
+#endif
2890
+
2891
+SIMDE_FUNCTION_ATTRIBUTES
2892
+simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
2893
+{
2894
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2895
+   return _mm_min_pu8(a, b);
2896
+#else
2897
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
2898
+                  b_ = simde__m64_to_private(b);
2899
+
2900
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2901
+   r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8);
2902
+#else
2903
+   SIMDE_VECTORIZE
2904
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
2905
+       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
2906
+   }
2907
+#endif
2908
+
2909
+   return simde__m64_from_private(r_);
2910
+#endif
2911
+}
2912
+#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
2913
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2914
+#define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b)
2915
+#define _m_pminub(a, b) simde_mm_min_pu8(a, b)
2916
+#endif
2917
+
2918
+SIMDE_FUNCTION_ATTRIBUTES
2919
+simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
2920
+{
2921
+#if defined(SIMDE_X86_SSE_NATIVE)
2922
+   return _mm_min_ss(a, b);
2923
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2924
+   return simde_mm_move_ss(a, simde_mm_min_ps(a, b));
2925
+#else
2926
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2927
+               b_ = simde__m128_to_private(b);
2928
+
2929
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2930
+   float32_t value =
2931
+       vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0);
2932
+   r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
2933
+#else
2934
+   r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0];
2935
+   r_.f32[1] = a_.f32[1];
2936
+   r_.f32[2] = a_.f32[2];
2937
+   r_.f32[3] = a_.f32[3];
2938
+#endif
2939
+
2940
+   return simde__m128_from_private(r_);
2941
+#endif
2942
+}
2943
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2944
+#define _mm_min_ss(a, b) simde_mm_min_ss((a), (b))
2945
+#endif
2946
+
2947
+SIMDE_FUNCTION_ATTRIBUTES
2948
+simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
2949
+{
2950
+#if defined(SIMDE_X86_SSE_NATIVE)
2951
+   return _mm_movehl_ps(a, b);
2952
+#else
2953
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2954
+               b_ = simde__m128_to_private(b);
2955
+
2956
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2957
+   float32x2_t a32 = vget_high_f32(a_.neon_f32);
2958
+   float32x2_t b32 = vget_high_f32(b_.neon_f32);
2959
+   r_.neon_f32 = vcombine_f32(b32, a32);
2960
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
2961
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
2962
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
2963
+       vec_mergel(b_.altivec_i64, a_.altivec_i64));
2964
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2965
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3);
2966
+#else
2967
+   r_.f32[0] = b_.f32[2];
2968
+   r_.f32[1] = b_.f32[3];
2969
+   r_.f32[2] = a_.f32[2];
2970
+   r_.f32[3] = a_.f32[3];
2971
+#endif
2972
+
2973
+   return simde__m128_from_private(r_);
2974
+#endif
2975
+}
2976
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
2977
+#define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b))
2978
+#endif
2979
+
2980
+SIMDE_FUNCTION_ATTRIBUTES
2981
+simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
2982
+{
2983
+#if defined(SIMDE_X86_SSE_NATIVE)
2984
+   return _mm_movelh_ps(a, b);
2985
+#else
2986
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
2987
+               b_ = simde__m128_to_private(b);
2988
+
2989
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2990
+   float32x2_t a10 = vget_low_f32(a_.neon_f32);
2991
+   float32x2_t b10 = vget_low_f32(b_.neon_f32);
2992
+   r_.neon_f32 = vcombine_f32(a10, b10);
2993
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
2994
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5);
2995
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
2996
+   r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(
2997
+       SIMDE_POWER_ALTIVEC_VECTOR(float),
2998
+       vec_mergeh(a_.altivec_i64, b_.altivec_i64));
2999
+#else
3000
+   r_.f32[0] = a_.f32[0];
3001
+   r_.f32[1] = a_.f32[1];
3002
+   r_.f32[2] = b_.f32[0];
3003
+   r_.f32[3] = b_.f32[1];
3004
+#endif
3005
+
3006
+   return simde__m128_from_private(r_);
3007
+#endif
3008
+}
3009
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3010
+#define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b))
3011
+#endif
3012
+
3013
+SIMDE_FUNCTION_ATTRIBUTES
3014
+int simde_mm_movemask_pi8(simde__m64 a)
3015
+{
3016
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3017
+   return _mm_movemask_pi8(a);
3018
+#else
3019
+   simde__m64_private a_ = simde__m64_to_private(a);
3020
+   int r = 0;
3021
+
3022
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3023
+   uint8x8_t input = a_.neon_u8;
3024
+   const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
3025
+   const uint8x8_t mask_and = vdup_n_u8(0x80);
3026
+   const int8x8_t mask_shift = vld1_s8(xr);
3027
+   const uint8x8_t mask_result =
3028
+       vshl_u8(vand_u8(input, mask_and), mask_shift);
3029
+   uint8x8_t lo = mask_result;
3030
+   r = vaddv_u8(lo);
3031
+#else
3032
+   const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]);
3033
+   SIMDE_VECTORIZE_REDUCTION(| : r)
3034
+   for (size_t i = 0; i < nmemb; i++) {
3035
+       r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
3036
+   }
3037
+#endif
3038
+
3039
+   return r;
3040
+#endif
3041
+}
3042
+#define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
3043
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3044
+#define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a)
3045
+#define _m_pmovmskb(a) simde_mm_movemask_pi8(a)
3046
+#endif
3047
+
3048
+SIMDE_FUNCTION_ATTRIBUTES
3049
+int simde_mm_movemask_ps(simde__m128 a)
3050
+{
3051
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3052
+   return _mm_movemask_ps(a);
3053
+#else
3054
+   int r = 0;
3055
+   simde__m128_private a_ = simde__m128_to_private(a);
3056
+
3057
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3058
+   static const int32_t shift_amount[] = {0, 1, 2, 3};
3059
+   const int32x4_t shift = vld1q_s32(shift_amount);
3060
+   uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31);
3061
+   return HEDLEY_STATIC_CAST(int, vaddvq_u32(vshlq_u32(tmp, shift)));
3062
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3063
+   // Shift out everything but the sign bits with a 32-bit unsigned shift right.
3064
+   uint64x2_t high_bits =
3065
+       vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31));
3066
+   // Merge the two pairs together with a 64-bit unsigned shift right + add.
3067
+   uint8x16_t paired =
3068
+       vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
3069
+   // Extract the result.
3070
+   return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
3071
+#else
3072
+   SIMDE_VECTORIZE_REDUCTION(| : r)
3073
+   for (size_t i = 0; i < sizeof(a_.u32) / sizeof(a_.u32[0]); i++) {
3074
+       r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i;
3075
+   }
3076
+#endif
3077
+
3078
+   return r;
3079
+#endif
3080
+}
3081
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3082
+#define _mm_movemask_ps(a) simde_mm_movemask_ps((a))
3083
+#endif
3084
+
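As a quick illustration of the mask layout produced above (one sign bit per 32-bit lane, lane 0 in bit 0), a small sketch assuming the header is included; movemask_example is an illustrative name only:

/* Illustrative only, not part of SIMDe. */
static int movemask_example(void)
{
    /* lanes 3..0 = -1.0f, 2.0f, -3.0f, 4.0f */
    simde__m128 v = simde_mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f);
    /* lane 0 (+4.0f) -> bit 0 = 0, lane 1 (-3.0f) -> bit 1 = 1,
       lane 2 (+2.0f) -> bit 2 = 0, lane 3 (-1.0f) -> bit 3 = 1 */
    return simde_mm_movemask_ps(v); /* == 0xA */
}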
3085
+SIMDE_FUNCTION_ATTRIBUTES
3086
+simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
3087
+{
3088
+#if defined(SIMDE_X86_SSE_NATIVE)
3089
+   return _mm_mul_ps(a, b);
3090
+#else
3091
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
3092
+               b_ = simde__m128_to_private(b);
3093
+
3094
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3095
+   r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32);
3096
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3097
+   r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128);
3098
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3099
+   r_.f32 = a_.f32 * b_.f32;
3100
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3101
+   r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32);
3102
+#else
3103
+   SIMDE_VECTORIZE
3104
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3105
+       r_.f32[i] = a_.f32[i] * b_.f32[i];
3106
+   }
3107
+#endif
3108
+
3109
+   return simde__m128_from_private(r_);
3110
+#endif
3111
+}
3112
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3113
+#define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b))
3114
+#endif
3115
+
3116
+SIMDE_FUNCTION_ATTRIBUTES
3117
+simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
3118
+{
3119
+#if defined(SIMDE_X86_SSE_NATIVE)
3120
+   return _mm_mul_ss(a, b);
3121
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3122
+   return simde_mm_move_ss(a, simde_mm_mul_ps(a, b));
3123
+#else
3124
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
3125
+               b_ = simde__m128_to_private(b);
3126
+
3127
+   r_.f32[0] = a_.f32[0] * b_.f32[0];
3128
+   r_.f32[1] = a_.f32[1];
3129
+   r_.f32[2] = a_.f32[2];
3130
+   r_.f32[3] = a_.f32[3];
3131
+
3132
+   return simde__m128_from_private(r_);
3133
+#endif
3134
+}
3135
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3136
+#define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b))
3137
+#endif
3138
+
3139
+SIMDE_FUNCTION_ATTRIBUTES
3140
+simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
3141
+{
3142
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3143
+   return _mm_mulhi_pu16(a, b);
3144
+#else
3145
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
3146
+                  b_ = simde__m64_to_private(b);
3147
+
3148
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3149
+   const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16);
3150
+   const uint32x4_t t2 = vshrq_n_u32(t1, 16);
3151
+   const uint16x4_t t3 = vmovn_u32(t2);
3152
+   r_.neon_u16 = t3;
3153
+#else
3154
+   SIMDE_VECTORIZE
3155
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
3156
+       r_.u16[i] = HEDLEY_STATIC_CAST(
3157
+           uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
3158
+                   HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >>
3159
+                  UINT32_C(16)));
3160
+   }
3161
+#endif
3162
+
3163
+   return simde__m64_from_private(r_);
3164
+#endif
3165
+}
3166
+#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
3167
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3168
+#define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b)
3169
+#define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
3170
+#endif
3171
+
3172
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION)
3173
+#define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0)
3174
+#define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1)
3175
+#define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2)
3176
+#define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3)
3177
+#define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4)
3178
+#define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5)
3179
+#define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6)
3180
+#define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7)
3181
+#else
3182
+#define SIMDE_MM_HINT_NTA 0
3183
+#define SIMDE_MM_HINT_T0 1
3184
+#define SIMDE_MM_HINT_T1 2
3185
+#define SIMDE_MM_HINT_T2 3
3186
+#define SIMDE_MM_HINT_ENTA 4
3187
+#define SIMDE_MM_HINT_ET0 5
3188
+#define SIMDE_MM_HINT_ET1 6
3189
+#define SIMDE_MM_HINT_ET2 7
3190
+#endif
3191
+
3192
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3193
+HEDLEY_DIAGNOSTIC_PUSH
3194
+#if HEDLEY_HAS_WARNING("-Wreserved-id-macro")
3195
+_Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"")
3196
+#endif
3197
+#undef _MM_HINT_NTA
3198
+#define _MM_HINT_NTA SIMDE_MM_HINT_NTA
3199
+#undef _MM_HINT_T0
3200
+#define _MM_HINT_T0 SIMDE_MM_HINT_T0
3201
+#undef _MM_HINT_T1
3202
+#define _MM_HINT_T1 SIMDE_MM_HINT_T1
3203
+#undef _MM_HINT_T2
3204
+#define _MM_HINT_T2 SIMDE_MM_HINT_T2
3205
+#undef _MM_HINT_ENTA
3206
+#define _MM_HINT_ENTA SIMDE_MM_HINT_ENTA
3207
+#undef _MM_HINT_ET0
3208
+#define _MM_HINT_ET0 SIMDE_MM_HINT_ET0
3209
+#undef _MM_HINT_ET1
3210
+#define _MM_HINT_ET1 SIMDE_MM_HINT_ET1
3211
+#undef _MM_HINT_ET2
3212
+#define _MM_HINT_ET2 SIMDE_MM_HINT_ET2
3213
+   HEDLEY_DIAGNOSTIC_POP
3214
+#endif
3215
+
3216
+   SIMDE_FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i)
3217
+{
3218
+#if defined(HEDLEY_GCC_VERSION)
3219
+   __builtin_prefetch(p);
3220
+#else
3221
+   (void)p;
3222
+#endif
3223
+
3224
+   (void)i;
3225
+}
3226
+#if defined(SIMDE_X86_SSE_NATIVE)
3227
+#if defined(__clang__) &&                  \
3228
+   !SIMDE_DETECT_CLANG_VERSION_CHECK( \
3229
+       10, 0, 0) /* https://reviews.llvm.org/D71718 */
3230
+#define simde_mm_prefetch(p, i)                     \
3231
+   (__extension__({                            \
3232
+       HEDLEY_DIAGNOSTIC_PUSH              \
3233
+       HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \
3234
+       _mm_prefetch((p), (i));             \
3235
+       HEDLEY_DIAGNOSTIC_POP               \
3236
+   }))
3237
+#else
3238
+#define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
3239
+#endif
3240
+#endif
3241
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3242
+#define _mm_prefetch(p, i) simde_mm_prefetch(p, i)
3243
+#endif
3244
+
3245
+SIMDE_FUNCTION_ATTRIBUTES
3246
+simde__m128 simde_x_mm_negate_ps(simde__m128 a)
3247
+{
3248
+#if defined(SIMDE_X86_SSE_NATIVE)
3249
+   return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0)));
3250
+#else
3251
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3252
+
3253
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
3254
+   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8, 1, 0))
3255
+   r_.altivec_f32 = vec_neg(a_.altivec_f32);
3256
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3257
+   r_.neon_f32 = vnegq_f32(a_.neon_f32);
3258
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3259
+   r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128);
3260
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3261
+   r_.altivec_f32 = vec_neg(a_.altivec_f32);
3262
+#elif defined(SIMDE_VECTOR_NEGATE)
3263
+   r_.f32 = -a_.f32;
3264
+#else
3265
+   SIMDE_VECTORIZE
3266
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3267
+       r_.f32[i] = -a_.f32[i];
3268
+   }
3269
+#endif
3270
+
3271
+   return simde__m128_from_private(r_);
3272
+#endif
3273
+}
3274
+
3275
+SIMDE_FUNCTION_ATTRIBUTES
3276
+simde__m128 simde_mm_rcp_ps(simde__m128 a)
3277
+{
3278
+#if defined(SIMDE_X86_SSE_NATIVE)
3279
+   return _mm_rcp_ps(a);
3280
+#else
3281
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3282
+
3283
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3284
+   float32x4_t recip = vrecpeq_f32(a_.neon_f32);
3285
+
3286
+#if SIMDE_ACCURACY_PREFERENCE > 0
3287
+   for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE; ++i) {
3288
+       recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32));
3289
+   }
3290
+#endif
3291
+
3292
+   r_.neon_f32 = recip;
3293
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3294
+   r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128);
3295
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3296
+   r_.altivec_f32 = vec_re(a_.altivec_f32);
3297
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
3298
+   r_.f32 = 1.0f / a_.f32;
3299
+#elif defined(SIMDE_IEEE754_STORAGE)
3300
+   /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */
3301
+   SIMDE_VECTORIZE
3302
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3303
+       int32_t ix;
3304
+       simde_float32 fx = a_.f32[i];
3305
+       simde_memcpy(&ix, &fx, sizeof(ix));
3306
+       int32_t x = INT32_C(0x7EF311C3) - ix;
3307
+       simde_float32 temp;
3308
+       simde_memcpy(&temp, &x, sizeof(temp));
3309
+       r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx);
3310
+   }
3311
+#else
3312
+   SIMDE_VECTORIZE
3313
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3314
+       r_.f32[i] = 1.0f / a_.f32[i];
3315
+   }
3316
+#endif
3317
+
3318
+   return simde__m128_from_private(r_);
3319
+#endif
3320
+}
3321
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3322
+#define _mm_rcp_ps(a) simde_mm_rcp_ps((a))
3323
+#endif
3324
+
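The SIMDE_IEEE754_STORAGE branch above relies on the integer bit trick from the linked Stack Overflow answer; a scalar sketch of the same idea follows, using the same magic constant and a single Newton-Raphson step, so roughly _mm_rcp_ps accuracy (approx_recip is an illustrative name, not part of SIMDe):

#include <stdint.h>
#include <string.h>

static float approx_recip(float x)
{
    int32_t ix;
    float est;
    memcpy(&ix, &x, sizeof(ix));   /* reinterpret the float's bit pattern */
    ix = INT32_C(0x7EF311C3) - ix; /* crude initial estimate of 1/x */
    memcpy(&est, &ix, sizeof(est));
    return est * (2.0f - est * x); /* one Newton-Raphson refinement */
}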
3325
+SIMDE_FUNCTION_ATTRIBUTES
3326
+simde__m128 simde_mm_rcp_ss(simde__m128 a)
3327
+{
3328
+#if defined(SIMDE_X86_SSE_NATIVE)
3329
+   return _mm_rcp_ss(a);
3330
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3331
+   return simde_mm_move_ss(a, simde_mm_rcp_ps(a));
3332
+#else
3333
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3334
+
3335
+   r_.f32[0] = 1.0f / a_.f32[0];
3336
+   r_.f32[1] = a_.f32[1];
3337
+   r_.f32[2] = a_.f32[2];
3338
+   r_.f32[3] = a_.f32[3];
3339
+
3340
+   return simde__m128_from_private(r_);
3341
+#endif
3342
+}
3343
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3344
+#define _mm_rcp_ss(a) simde_mm_rcp_ss((a))
3345
+#endif
3346
+
3347
+SIMDE_FUNCTION_ATTRIBUTES
3348
+simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
3349
+{
3350
+#if defined(SIMDE_X86_SSE_NATIVE)
3351
+   return _mm_rsqrt_ps(a);
3352
+#else
3353
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3354
+
3355
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3356
+   r_.neon_f32 = vrsqrteq_f32(a_.neon_f32);
3357
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3358
+   r_.altivec_f32 = vec_rsqrte(a_.altivec_f32);
3359
+#elif defined(SIMDE_IEEE754_STORAGE)
3360
+   /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
3361
+        Pages 100 - 103 */
3362
+   SIMDE_VECTORIZE
3363
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3364
+#if SIMDE_ACCURACY_PREFERENCE <= 0
3365
+       r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1);
3366
+#else
3367
+       simde_float32 x = a_.f32[i];
3368
+       simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
3369
+       int32_t ix;
3370
+
3371
+       simde_memcpy(&ix, &x, sizeof(ix));
3372
+
3373
+#if SIMDE_ACCURACY_PREFERENCE == 1
3374
+       ix = INT32_C(0x5F375A82) - (ix >> 1);
3375
+#else
3376
+       ix = INT32_C(0x5F37599E) - (ix >> 1);
3377
+#endif
3378
+
3379
+       simde_memcpy(&x, &ix, sizeof(x));
3380
+
3381
+#if SIMDE_ACCURACY_PREFERENCE >= 2
3382
+       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3383
+#endif
3384
+       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3385
+
3386
+       r_.f32[i] = x;
3387
+#endif
3388
+   }
3389
+#elif defined(simde_math_sqrtf)
3390
+   SIMDE_VECTORIZE
3391
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3392
+       r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]);
3393
+   }
3394
+#else
3395
+   HEDLEY_UNREACHABLE();
3396
+#endif
3397
+
3398
+   return simde__m128_from_private(r_);
3399
+#endif
3400
+}
3401
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3402
+#define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a))
3403
+#endif
3404
+
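Similarly, the rsqrt path above is the familiar bit-level 1/sqrt(x) approximation from the cited slides; a scalar sketch using the SIMDE_ACCURACY_PREFERENCE == 1 constant and one refinement step (approx_rsqrt is illustrative, not part of SIMDe):

#include <stdint.h>
#include <string.h>

static float approx_rsqrt(float x)
{
    float xhalf = 0.5f * x;
    int32_t ix;
    memcpy(&ix, &x, sizeof(ix));
    ix = INT32_C(0x5F375A82) - (ix >> 1);    /* initial 1/sqrt(x) estimate */
    memcpy(&x, &ix, sizeof(x));
    return x * (1.5008909f - xhalf * x * x); /* one Newton-Raphson step */
}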
3405
+SIMDE_FUNCTION_ATTRIBUTES
3406
+simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
3407
+{
3408
+#if defined(SIMDE_X86_SSE_NATIVE)
3409
+   return _mm_rsqrt_ss(a);
3410
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3411
+   return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a));
3412
+#else
3413
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3414
+
3415
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3416
+   float32_t value = vgetq_lane_f32(
3417
+       simde__m128_to_private(simde_mm_rsqrt_ps(a)).neon_f32, 0);
3418
+   r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
3419
+#elif defined(SIMDE_IEEE754_STORAGE)
3420
+   {
3421
+#if SIMDE_ACCURACY_PREFERENCE <= 0
3422
+       r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1);
3423
+#else
3424
+       simde_float32 x = a_.f32[0];
3425
+       simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x;
3426
+       int32_t ix;
3427
+
3428
+       simde_memcpy(&ix, &x, sizeof(ix));
3429
+
3430
+#if SIMDE_ACCURACY_PREFERENCE == 1
3431
+       ix = INT32_C(0x5F375A82) - (ix >> 1);
3432
+#else
3433
+       ix = INT32_C(0x5F37599E) - (ix >> 1);
3434
+#endif
3435
+
3436
+       simde_memcpy(&x, &ix, sizeof(x));
3437
+
3438
+#if SIMDE_ACCURACY_PREFERENCE >= 2
3439
+       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3440
+#endif
3441
+       x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x);
3442
+
3443
+       r_.f32[0] = x;
3444
+#endif
3445
+   }
3446
+   r_.f32[1] = a_.f32[1];
3447
+   r_.f32[2] = a_.f32[2];
3448
+   r_.f32[3] = a_.f32[3];
3449
+#elif defined(simde_math_sqrtf)
3450
+   r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]);
3451
+   r_.f32[1] = a_.f32[1];
3452
+   r_.f32[2] = a_.f32[2];
3453
+   r_.f32[3] = a_.f32[3];
3454
+#else
3455
+   HEDLEY_UNREACHABLE();
3456
+#endif
3457
+
3458
+   return simde__m128_from_private(r_);
3459
+#endif
3460
+}
3461
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3462
+#define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a))
3463
+#endif
3464
+
3465
+SIMDE_FUNCTION_ATTRIBUTES
3466
+simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
3467
+{
3468
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3469
+   return _mm_sad_pu8(a, b);
3470
+#else
3471
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
3472
+                  b_ = simde__m64_to_private(b);
3473
+
3474
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3475
+   uint16x4_t t = vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8));
3476
+   uint16_t r0 = t[0] + t[1] + t[2] + t[3];
3477
+   r_.neon_u16 = vset_lane_u16(r0, vdup_n_u16(0), 0);
3478
+#else
3479
+   uint16_t sum = 0;
3480
+
3481
+#if defined(SIMDE_HAVE_STDLIB_H)
3482
+   SIMDE_VECTORIZE_REDUCTION(+ : sum)
3483
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
3484
+       sum += HEDLEY_STATIC_CAST(uint8_t, abs(a_.u8[i] - b_.u8[i]));
3485
+   }
3486
+
3487
+   r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum);
3488
+   r_.i16[1] = 0;
3489
+   r_.i16[2] = 0;
3490
+   r_.i16[3] = 0;
3491
+#else
3492
+   HEDLEY_UNREACHABLE();
3493
+#endif
3494
+#endif
3495
+
3496
+   return simde__m64_from_private(r_);
3497
+#endif
3498
+}
3499
+#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
3500
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3501
+#define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b)
3502
+#define _m_psadbw(a, b) simde_mm_sad_pu8(a, b)
3503
+#endif
3504
+
3505
+SIMDE_FUNCTION_ATTRIBUTES
3506
+simde__m128 simde_mm_set_ss(simde_float32 a)
3507
+{
3508
+#if defined(SIMDE_X86_SSE_NATIVE)
3509
+   return _mm_set_ss(a);
3510
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3511
+   return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0);
3512
+#else
3513
+   return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0),
3514
+                  SIMDE_FLOAT32_C(0.0), a);
3515
+#endif
3516
+}
3517
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3518
+#define _mm_set_ss(a) simde_mm_set_ss(a)
3519
+#endif
3520
+
3521
+SIMDE_FUNCTION_ATTRIBUTES
3522
+simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
3523
+                simde_float32 e1, simde_float32 e0)
3524
+{
3525
+#if defined(SIMDE_X86_SSE_NATIVE)
3526
+   return _mm_setr_ps(e3, e2, e1, e0);
3527
+#else
3528
+   return simde_mm_set_ps(e0, e1, e2, e3);
3529
+#endif
3530
+}
3531
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3532
+#define _mm_setr_ps(e3, e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0)
3533
+#endif
3534
+
3535
+SIMDE_FUNCTION_ATTRIBUTES
3536
+simde__m128 simde_mm_setzero_ps(void)
3537
+{
3538
+#if defined(SIMDE_X86_SSE_NATIVE)
3539
+   return _mm_setzero_ps();
3540
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3541
+   return vdupq_n_f32(SIMDE_FLOAT32_C(0.0));
3542
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3543
+   return vec_splats(SIMDE_FLOAT32_C(0.0));
3544
+#else
3545
+   simde__m128 r;
3546
+   simde_memset(&r, 0, sizeof(r));
3547
+   return r;
3548
+#endif
3549
+}
3550
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3551
+#define _mm_setzero_ps() simde_mm_setzero_ps()
3552
+#endif
3553
+
3554
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3555
+HEDLEY_DIAGNOSTIC_PUSH
3556
+SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
3557
+#endif
3558
+
3559
+SIMDE_FUNCTION_ATTRIBUTES
3560
+simde__m128 simde_mm_undefined_ps(void)
3561
+{
3562
+   simde__m128_private r_;
3563
+
3564
+#if defined(SIMDE_HAVE_UNDEFINED128)
3565
+   r_.n = _mm_undefined_ps();
3566
+#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3567
+   r_ = simde__m128_to_private(simde_mm_setzero_ps());
3568
+#endif
3569
+
3570
+   return simde__m128_from_private(r_);
3571
+}
3572
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3573
+#define _mm_undefined_ps() simde_mm_undefined_ps()
3574
+#endif
3575
+
3576
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
3577
+HEDLEY_DIAGNOSTIC_POP
3578
+#endif
3579
+
3580
+SIMDE_FUNCTION_ATTRIBUTES
3581
+simde__m128 simde_x_mm_setone_ps(void)
3582
+{
3583
+   simde__m128 t = simde_mm_setzero_ps();
3584
+   return simde_mm_cmpeq_ps(t, t);
3585
+}
3586
+
3587
+SIMDE_FUNCTION_ATTRIBUTES
3588
+void simde_mm_sfence(void)
3589
+{
3590
+   /* TODO: Use Hedley. */
3591
+#if defined(SIMDE_X86_SSE_NATIVE)
3592
+   _mm_sfence();
3593
+#elif defined(__GNUC__) && \
3594
+   ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
3595
+   __atomic_thread_fence(__ATOMIC_SEQ_CST);
3596
+#elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
3597
+   (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
3598
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
3599
+   __atomic_thread_fence(__ATOMIC_SEQ_CST);
3600
+#else
3601
+   atomic_thread_fence(memory_order_seq_cst);
3602
+#endif
3603
+#elif defined(_MSC_VER)
3604
+   MemoryBarrier();
3605
+#elif HEDLEY_HAS_EXTENSION(c_atomic)
3606
+   __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
3607
+#elif defined(__GNUC__) && \
3608
+   ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
3609
+   __sync_synchronize();
3610
+#elif defined(_OPENMP)
3611
+#pragma omp critical(simde_mm_sfence_)
3612
+   {
3613
+   }
3614
+#endif
3615
+}
3616
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3617
+#define _mm_sfence() simde_mm_sfence()
3618
+#endif
3619
+
3620
+#define SIMDE_MM_SHUFFLE(z, y, x, w) \
3621
+   (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3622
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3623
+#define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w)
3624
+#endif
3625
+
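SIMDE_MM_SHUFFLE packs four 2-bit lane indices with w in the low bits, so SIMDE_MM_SHUFFLE(3, 2, 1, 0) evaluates to (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4. With _mm_shuffle_ps-style selection the low two result lanes come from a and the high two from b; a short sketch (cross_halves is an illustrative name, not part of SIMDe):

/* Illustrative only: result lanes are [0]=a[2], [1]=a[3], [2]=b[0], [3]=b[1]. */
static simde__m128 cross_halves(simde__m128 a, simde__m128 b)
{
    return simde_mm_shuffle_ps(a, b, SIMDE_MM_SHUFFLE(1, 0, 3, 2));
}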
3626
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
3627
+   !defined(__PGI)
3628
+#define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8)
3629
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
3630
+#define simde_mm_shuffle_pi16(a, imm8)                                    \
3631
+   (__extension__({                                                  \
3632
+       const simde__m64_private simde__tmp_a_ =                  \
3633
+           simde__m64_to_private(a);                         \
3634
+       simde__m64_from_private((simde__m64_private){             \
3635
+           .i16 = SIMDE_SHUFFLE_VECTOR_(                     \
3636
+               16, 8, (simde__tmp_a_).i16,               \
3637
+               (simde__tmp_a_).i16, (((imm8)) & 3),      \
3638
+               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
3639
+               (((imm8) >> 6) & 3))});                   \
3640
+   }))
3641
+#else
3642
+SIMDE_FUNCTION_ATTRIBUTES
3643
+simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
3644
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
3645
+{
3646
+   simde__m64_private r_;
3647
+   simde__m64_private a_ = simde__m64_to_private(a);
3648
+
3649
+   for (size_t i = 0; i < sizeof(r_.i16) / sizeof(r_.i16[0]); i++) {
3650
+       r_.i16[i] = a_.i16[(imm8 >> (i * 2)) & 3];
3651
+   }
3652
+
3653
+   HEDLEY_DIAGNOSTIC_PUSH
3654
+#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized")
3655
+#pragma clang diagnostic ignored "-Wconditional-uninitialized"
3656
+#endif
3657
+   return simde__m64_from_private(r_);
3658
+   HEDLEY_DIAGNOSTIC_POP
3659
+}
3660
+#endif
3661
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
3662
+   !defined(__PGI)
3663
+#define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8)
3664
+#else
3665
+#define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3666
+#endif
3667
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3668
+#define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3669
+#define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
3670
+#endif
3671
+
3672
+#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI)
3673
+#define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8)
3674
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3675
+#define simde_mm_shuffle_ps(a, b, imm8)                                      \
3676
+   __extension__({                                                      \
3677
+       float32x4_t ret;                                             \
3678
+       ret = vmovq_n_f32(vgetq_lane_f32(a, (imm8) & (0x3)));        \
3679
+       ret = vsetq_lane_f32(vgetq_lane_f32(a, ((imm8) >> 2) & 0x3), \
3680
+                    ret, 1);                                \
3681
+       ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 4) & 0x3), \
3682
+                    ret, 2);                                \
3683
+       ret = vsetq_lane_f32(vgetq_lane_f32(b, ((imm8) >> 6) & 0x3), \
3684
+                    ret, 3);                                \
3685
+   })
3686
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
3687
+#define simde_mm_shuffle_ps(a, b, imm8)                                        \
3688
+   (__extension__({                                                       \
3689
+       simde__m128_from_private((simde__m128_private){                \
3690
+           .f32 = SIMDE_SHUFFLE_VECTOR_(                          \
3691
+               32, 16, simde__m128_to_private(a).f32,         \
3692
+               simde__m128_to_private(b).f32, (((imm8)) & 3), \
3693
+               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3) + 4,  \
3694
+               (((imm8) >> 6) & 3) + 4)});                    \
3695
+   }))
3696
+#else
3697
+SIMDE_FUNCTION_ATTRIBUTES
3698
+simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
3699
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
3700
+{
3701
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
3702
+               b_ = simde__m128_to_private(b);
3703
+
3704
+   r_.f32[0] = a_.f32[(imm8 >> 0) & 3];
3705
+   r_.f32[1] = a_.f32[(imm8 >> 2) & 3];
3706
+   r_.f32[2] = b_.f32[(imm8 >> 4) & 3];
3707
+   r_.f32[3] = b_.f32[(imm8 >> 6) & 3];
3708
+
3709
+   return simde__m128_from_private(r_);
3710
+}
3711
+#endif
3712
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3713
+#define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8)
3714
+#endif
3715
+
3716
+SIMDE_FUNCTION_ATTRIBUTES
3717
+simde__m128 simde_mm_sqrt_ps(simde__m128 a)
3718
+{
3719
+#if defined(SIMDE_X86_SSE_NATIVE)
3720
+   return _mm_sqrt_ps(a);
3721
+#else
3722
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3723
+
3724
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3725
+   r_.neon_f32 = vsqrtq_f32(a_.neon_f32);
3726
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3727
+   float32x4_t est = vrsqrteq_f32(a_.neon_f32);
3728
+   for (int i = 0; i <= SIMDE_ACCURACY_PREFERENCE; i++) {
3729
+       est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est),
3730
+               est);
3731
+   }
3732
+   r_.neon_f32 = vmulq_f32(a_.neon_f32, est);
3733
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3734
+   r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128);
3735
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3736
+   r_.altivec_f32 = vec_sqrt(a_.altivec_f32);
3737
+#elif defined(simde_math_sqrt)
3738
+   SIMDE_VECTORIZE
3739
+   for (size_t i = 0; i < sizeof(r_.f32) / sizeof(r_.f32[0]); i++) {
3740
+       r_.f32[i] = simde_math_sqrtf(a_.f32[i]);
3741
+   }
3742
+#else
3743
+   HEDLEY_UNREACHABLE();
3744
+#endif
3745
+
3746
+   return simde__m128_from_private(r_);
3747
+#endif
3748
+}
3749
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3750
+#define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a))
3751
+#endif
3752
+
3753
+SIMDE_FUNCTION_ATTRIBUTES
3754
+simde__m128 simde_mm_sqrt_ss(simde__m128 a)
3755
+{
3756
+#if defined(SIMDE_X86_SSE_NATIVE)
3757
+   return _mm_sqrt_ss(a);
3758
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3759
+   return simde_mm_move_ss(a, simde_mm_sqrt_ps(a));
3760
+#else
3761
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
3762
+
3763
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3764
+   float32_t value = vgetq_lane_f32(
3765
+       simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0);
3766
+   r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0);
3767
+#elif defined(simde_math_sqrtf)
3768
+   r_.f32[0] = simde_math_sqrtf(a_.f32[0]);
3769
+   r_.f32[1] = a_.f32[1];
3770
+   r_.f32[2] = a_.f32[2];
3771
+   r_.f32[3] = a_.f32[3];
3772
+#else
3773
+   HEDLEY_UNREACHABLE();
3774
+#endif
3775
+
3776
+   return simde__m128_from_private(r_);
3777
+#endif
3778
+}
3779
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3780
+#define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a))
3781
+#endif
3782
+
3783
+SIMDE_FUNCTION_ATTRIBUTES
3784
+void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
3785
+{
3786
+#if defined(SIMDE_X86_SSE_NATIVE)
3787
+   _mm_store_ps(mem_addr, a);
3788
+#else
3789
+   simde__m128_private a_ = simde__m128_to_private(a);
3790
+
3791
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3792
+   vst1q_f32(mem_addr, a_.neon_f32);
3793
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3794
+   vec_st(a_.altivec_f32, 0, mem_addr);
3795
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3796
+   wasm_v128_store(mem_addr, a_.wasm_v128);
3797
+#else
3798
+   simde_memcpy(mem_addr, &a_, sizeof(a));
3799
+#endif
3800
+#endif
3801
+}
3802
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3803
+#define _mm_store_ps(mem_addr, a)                                      \
3804
+   simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3805
+                 float *, simde_float32 *, mem_addr), \
3806
+             (a))
3807
+#endif
3808
+
3809
+SIMDE_FUNCTION_ATTRIBUTES
3810
+void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
3811
+{
3812
+   simde_float32 *mem_addr_ =
3813
+       SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128);
3814
+
3815
+#if defined(SIMDE_X86_SSE_NATIVE)
3816
+   _mm_store_ps1(mem_addr_, a);
3817
+#else
3818
+   simde__m128_private a_ = simde__m128_to_private(a);
3819
+
3820
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3821
+   vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0));
3822
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3823
+   wasm_v128_store(mem_addr_,
3824
+           wasm_v32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0,
3825
+                      0));
3826
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3827
+   vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_);
3828
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
3829
+   simde__m128_private tmp_;
3830
+   tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0);
3831
+   simde_mm_store_ps(mem_addr_, tmp_.f32);
3832
+#else
3833
+   SIMDE_VECTORIZE_ALIGNED(mem_addr_ : 16)
3834
+   for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3835
+       mem_addr_[i] = a_.f32[0];
3836
+   }
3837
+#endif
3838
+#endif
3839
+}
3840
+#define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a)
3841
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3842
+#define _mm_store_ps1(mem_addr, a)                                      \
3843
+   simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3844
+                  float *, simde_float32 *, mem_addr), \
3845
+              (a))
3846
+#define _mm_store1_ps(mem_addr, a)                                      \
3847
+   simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3848
+                  float *, simde_float32 *, mem_addr), \
3849
+              (a))
3850
+#endif
3851
+
3852
+SIMDE_FUNCTION_ATTRIBUTES
3853
+void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
3854
+{
3855
+#if defined(SIMDE_X86_SSE_NATIVE)
3856
+   _mm_store_ss(mem_addr, a);
3857
+#else
3858
+   simde__m128_private a_ = simde__m128_to_private(a);
3859
+
3860
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3861
+   vst1q_lane_f32(mem_addr, a_.neon_f32, 0);
3862
+#else
3863
+   *mem_addr = a_.f32[0];
3864
+#endif
3865
+#endif
3866
+}
3867
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3868
+#define _mm_store_ss(mem_addr, a)                                      \
3869
+   simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST(              \
3870
+                 float *, simde_float32 *, mem_addr), \
3871
+             (a))
3872
+#endif
3873
+
3874
+SIMDE_FUNCTION_ATTRIBUTES
3875
+void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
3876
+{
3877
+#if defined(SIMDE_X86_SSE_NATIVE)
3878
+   _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3879
+#else
3880
+   simde__m128_private a_ = simde__m128_to_private(a);
3881
+
3882
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3883
+   vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t *, mem_addr),
3884
+        vget_high_f32(a_.neon_f32));
3885
+#else
3886
+   simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1]));
3887
+#endif
3888
+#endif
3889
+}
3890
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3891
+#define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a))
3892
+#endif
3893
+
3894
+SIMDE_FUNCTION_ATTRIBUTES
3895
+void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
3896
+{
3897
+#if defined(SIMDE_X86_SSE_NATIVE)
3898
+   _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
3899
+#else
3900
+   simde__m64_private *dest_ =
3901
+       HEDLEY_REINTERPRET_CAST(simde__m64_private *, mem_addr);
3902
+   simde__m128_private a_ = simde__m128_to_private(a);
3903
+
3904
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3905
+   dest_->neon_f32 = vget_low_f32(a_.neon_f32);
3906
+#else
3907
+   dest_->f32[0] = a_.f32[0];
3908
+   dest_->f32[1] = a_.f32[1];
3909
+#endif
3910
+#endif
3911
+}
3912
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3913
+#define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a))
3914
+#endif
3915
+
3916
+SIMDE_FUNCTION_ATTRIBUTES
3917
+void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
3918
+{
3919
+#if defined(SIMDE_X86_SSE_NATIVE)
3920
+   _mm_storer_ps(mem_addr, a);
3921
+#else
3922
+   simde__m128_private a_ = simde__m128_to_private(a);
3923
+
3924
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3925
+   vec_st(vec_reve(a_.altivec_f32), 0, mem_addr);
3926
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3927
+   float32x4_t tmp = vrev64q_f32(a_.neon_f32);
3928
+   vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2));
3929
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
3930
+   a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0);
3931
+   simde_mm_store_ps(mem_addr, simde__m128_from_private(a_));
3932
+#else
3933
+   SIMDE_VECTORIZE_ALIGNED(mem_addr : 16)
3934
+   for (size_t i = 0; i < sizeof(a_.f32) / sizeof(a_.f32[0]); i++) {
3935
+       mem_addr[i] =
3936
+           a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i];
3937
+   }
3938
+#endif
3939
+#endif
3940
+}
3941
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3942
+#define _mm_storer_ps(mem_addr, a)                                      \
3943
+   simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3944
+                  float *, simde_float32 *, mem_addr), \
3945
+              (a))
3946
+#endif
3947
+
3948
+SIMDE_FUNCTION_ATTRIBUTES
3949
+void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
3950
+{
3951
+#if defined(SIMDE_X86_SSE_NATIVE)
3952
+   _mm_storeu_ps(mem_addr, a);
3953
+#else
3954
+   simde__m128_private a_ = simde__m128_to_private(a);
3955
+
3956
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3957
+   vst1q_f32(mem_addr, a_.neon_f32);
3958
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3959
+   vec_vsx_st(a_.altivec_f32, 0, mem_addr);
3960
+#else
3961
+   simde_memcpy(mem_addr, &a_, sizeof(a_));
3962
+#endif
3963
+#endif
3964
+}
3965
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
3966
+#define _mm_storeu_ps(mem_addr, a)                                      \
3967
+   simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
3968
+                  float *, simde_float32 *, mem_addr), \
3969
+              (a))
3970
+#endif
3971
+
3972
+SIMDE_FUNCTION_ATTRIBUTES
3973
+simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
3974
+{
3975
+#if defined(SIMDE_X86_SSE_NATIVE)
3976
+   return _mm_sub_ps(a, b);
3977
+#else
3978
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
3979
+               b_ = simde__m128_to_private(b);
3980
+
3981
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3982
+   r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32);
3983
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3984
+   r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128);
3985
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3986
+   r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32);
3987
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3988
+   r_.f32 = a_.f32 - b_.f32;
3989
+#else
3990
+   SIMDE_VECTORIZE
3991
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
3992
+       r_.f32[i] = a_.f32[i] - b_.f32[i];
3993
+   }
3994
+#endif
3995
+
3996
+   return simde__m128_from_private(r_);
3997
+#endif
3998
+}
3999
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4000
+#define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b))
4001
+#endif
4002
+
4003
+SIMDE_FUNCTION_ATTRIBUTES
4004
+simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
4005
+{
4006
+#if defined(SIMDE_X86_SSE_NATIVE)
4007
+   return _mm_sub_ss(a, b);
4008
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4009
+   return simde_mm_move_ss(a, simde_mm_sub_ps(a, b));
4010
+#else
4011
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
4012
+               b_ = simde__m128_to_private(b);
4013
+
4014
+   r_.f32[0] = a_.f32[0] - b_.f32[0];
4015
+   r_.f32[1] = a_.f32[1];
4016
+   r_.f32[2] = a_.f32[2];
4017
+   r_.f32[3] = a_.f32[3];
4018
+
4019
+   return simde__m128_from_private(r_);
4020
+#endif
4021
+}
4022
+
4023
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4024
+#define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b))
4025
+#endif
4026
+
4027
+SIMDE_FUNCTION_ATTRIBUTES
4028
+int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
4029
+{
4030
+#if defined(SIMDE_X86_SSE_NATIVE)
4031
+   return _mm_ucomieq_ss(a, b);
4032
+#else
4033
+   simde__m128_private a_ = simde__m128_to_private(a),
4034
+               b_ = simde__m128_to_private(b);
4035
+   int r;
4036
+
4037
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4038
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4039
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4040
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4041
+   uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32);
4042
+   r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0);
4043
+#elif defined(SIMDE_HAVE_FENV_H)
4044
+   fenv_t envp;
4045
+   int x = feholdexcept(&envp);
4046
+   r = a_.f32[0] == b_.f32[0];
4047
+   if (HEDLEY_LIKELY(x == 0))
4048
+       fesetenv(&envp);
4049
+#else
4050
+   r = a_.f32[0] == b_.f32[0];
4051
+#endif
4052
+
4053
+   return r;
4054
+#endif
4055
+}
4056
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4057
+#define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b))
4058
+#endif
4059
+
4060
+SIMDE_FUNCTION_ATTRIBUTES
4061
+int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
4062
+{
4063
+#if defined(SIMDE_X86_SSE_NATIVE)
4064
+   return _mm_ucomige_ss(a, b);
4065
+#else
4066
+   simde__m128_private a_ = simde__m128_to_private(a),
4067
+               b_ = simde__m128_to_private(b);
4068
+   int r;
4069
+
4070
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4071
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4072
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4073
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4074
+   uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32);
4075
+   r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0);
4076
+#elif defined(SIMDE_HAVE_FENV_H)
4077
+   fenv_t envp;
4078
+   int x = feholdexcept(&envp);
4079
+   r = a_.f32[0] >= b_.f32[0];
4080
+   if (HEDLEY_LIKELY(x == 0))
4081
+       fesetenv(&envp);
4082
+#else
4083
+   r = a_.f32[0] >= b_.f32[0];
4084
+#endif
4085
+
4086
+   return r;
4087
+#endif
4088
+}
4089
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4090
+#define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b))
4091
+#endif
4092
+
4093
+SIMDE_FUNCTION_ATTRIBUTES
4094
+int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
4095
+{
4096
+#if defined(SIMDE_X86_SSE_NATIVE)
4097
+   return _mm_ucomigt_ss(a, b);
4098
+#else
4099
+   simde__m128_private a_ = simde__m128_to_private(a),
4100
+               b_ = simde__m128_to_private(b);
4101
+   int r;
4102
+
4103
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4104
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4105
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4106
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4107
+   uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32);
4108
+   r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0);
4109
+#elif defined(SIMDE_HAVE_FENV_H)
4110
+   fenv_t envp;
4111
+   int x = feholdexcept(&envp);
4112
+   r = a_.f32[0] > b_.f32[0];
4113
+   if (HEDLEY_LIKELY(x == 0))
4114
+       fesetenv(&envp);
4115
+#else
4116
+   r = a_.f32[0] > b_.f32[0];
4117
+#endif
4118
+
4119
+   return r;
4120
+#endif
4121
+}
4122
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4123
+#define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b))
4124
+#endif
4125
+
4126
+SIMDE_FUNCTION_ATTRIBUTES
4127
+int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
4128
+{
4129
+#if defined(SIMDE_X86_SSE_NATIVE)
4130
+   return _mm_ucomile_ss(a, b);
4131
+#else
4132
+   simde__m128_private a_ = simde__m128_to_private(a),
4133
+               b_ = simde__m128_to_private(b);
4134
+   int r;
4135
+
4136
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4137
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4138
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4139
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4140
+   uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32);
4141
+   r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0);
4142
+#elif defined(SIMDE_HAVE_FENV_H)
4143
+   fenv_t envp;
4144
+   int x = feholdexcept(&envp);
4145
+   r = a_.f32[0] <= b_.f32[0];
4146
+   if (HEDLEY_LIKELY(x == 0))
4147
+       fesetenv(&envp);
4148
+#else
4149
+   r = a_.f32[0] <= b_.f32[0];
4150
+#endif
4151
+
4152
+   return r;
4153
+#endif
4154
+}
4155
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4156
+#define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b))
4157
+#endif
4158
+
4159
+SIMDE_FUNCTION_ATTRIBUTES
4160
+int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
4161
+{
4162
+#if defined(SIMDE_X86_SSE_NATIVE)
4163
+   return _mm_ucomilt_ss(a, b);
4164
+#else
4165
+   simde__m128_private a_ = simde__m128_to_private(a),
4166
+               b_ = simde__m128_to_private(b);
4167
+   int r;
4168
+
4169
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4170
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4171
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4172
+   uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
4173
+   uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32);
4174
+   r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0);
4175
+#elif defined(SIMDE_HAVE_FENV_H)
4176
+   fenv_t envp;
4177
+   int x = feholdexcept(&envp);
4178
+   r = a_.f32[0] < b_.f32[0];
4179
+   if (HEDLEY_LIKELY(x == 0))
4180
+       fesetenv(&envp);
4181
+#else
4182
+   r = a_.f32[0] < b_.f32[0];
4183
+#endif
4184
+
4185
+   return r;
4186
+#endif
4187
+}
4188
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4189
+#define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b))
4190
+#endif
4191
+
4192
+SIMDE_FUNCTION_ATTRIBUTES
4193
+int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
4194
+{
4195
+#if defined(SIMDE_X86_SSE_NATIVE)
4196
+   return _mm_ucomineq_ss(a, b);
4197
+#else
4198
+   simde__m128_private a_ = simde__m128_to_private(a),
4199
+               b_ = simde__m128_to_private(b);
4200
+   int r;
4201
+
4202
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4203
+   uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32);
4204
+   uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32);
4205
+   uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
4206
+   uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32));
4207
+   r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0);
4208
+#elif defined(SIMDE_HAVE_FENV_H)
4209
+   fenv_t envp;
4210
+   int x = feholdexcept(&envp);
4211
+   r = a_.f32[0] != b_.f32[0];
4212
+   if (HEDLEY_LIKELY(x == 0))
4213
+       fesetenv(&envp);
4214
+#else
4215
+   r = a_.f32[0] != b_.f32[0];
4216
+#endif
4217
+
4218
+   return r;
4219
+#endif
4220
+}
4221
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4222
+#define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b))
4223
+#endif
4224
+
4225
+#if defined(SIMDE_X86_SSE_NATIVE)
4226
+#if defined(__has_builtin)
4227
+#if __has_builtin(__builtin_ia32_undef128)
4228
+#define SIMDE_HAVE_UNDEFINED128
4229
+#endif
4230
+#elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && \
4231
+   !defined(_MSC_VER)
4232
+#define SIMDE_HAVE_UNDEFINED128
4233
+#endif
4234
+#endif
4235
+
4236
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
4237
+HEDLEY_DIAGNOSTIC_PUSH
4238
+SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
4239
+#endif
4240
+
4241
+SIMDE_FUNCTION_ATTRIBUTES
4242
+simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
4243
+{
4244
+#if defined(SIMDE_X86_SSE_NATIVE)
4245
+   return _mm_unpackhi_ps(a, b);
4246
+#else
4247
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
4248
+               b_ = simde__m128_to_private(b);
4249
+
4250
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4251
+   r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32);
4252
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4253
+   float32x2_t a1 = vget_high_f32(a_.neon_f32);
4254
+   float32x2_t b1 = vget_high_f32(b_.neon_f32);
4255
+   float32x2x2_t result = vzip_f32(a1, b1);
4256
+   r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
4257
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
4258
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7);
4259
+#else
4260
+   r_.f32[0] = a_.f32[2];
4261
+   r_.f32[1] = b_.f32[2];
4262
+   r_.f32[2] = a_.f32[3];
4263
+   r_.f32[3] = b_.f32[3];
4264
+#endif
4265
+
4266
+   return simde__m128_from_private(r_);
4267
+#endif
4268
+}
4269
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4270
+#define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b))
4271
+#endif
4272
+
4273
+SIMDE_FUNCTION_ATTRIBUTES
4274
+simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
4275
+{
4276
+#if defined(SIMDE_X86_SSE_NATIVE)
4277
+   return _mm_unpacklo_ps(a, b);
4278
+#else
4279
+   simde__m128_private r_, a_ = simde__m128_to_private(a),
4280
+               b_ = simde__m128_to_private(b);
4281
+
4282
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4283
+   r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32);
4284
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4285
+   r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32);
4286
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
4287
+   r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5);
4288
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4289
+   float32x2_t a1 = vget_low_f32(a_.neon_f32);
4290
+   float32x2_t b1 = vget_low_f32(b_.neon_f32);
4291
+   float32x2x2_t result = vzip_f32(a1, b1);
4292
+   r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
4293
+#else
4294
+   r_.f32[0] = a_.f32[0];
4295
+   r_.f32[1] = b_.f32[0];
4296
+   r_.f32[2] = a_.f32[1];
4297
+   r_.f32[3] = b_.f32[1];
4298
+#endif
4299
+
4300
+   return simde__m128_from_private(r_);
4301
+#endif
4302
+}
4303
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4304
+#define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b))
4305
+#endif
4306
+
4307
+SIMDE_FUNCTION_ATTRIBUTES
4308
+void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
4309
+{
4310
+#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4311
+   _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64 *, mem_addr), a);
4312
+#else
4313
+   simde__m64_private *dest = HEDLEY_REINTERPRET_CAST(simde__m64_private *,
4314
+                              mem_addr),
4315
+              a_ = simde__m64_to_private(a);
4316
+
4317
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4318
+   dest->i64[0] = vget_lane_s64(a_.neon_i64, 0);
4319
+#else
4320
+   dest->i64[0] = a_.i64[0];
4321
+#endif
4322
+#endif
4323
+}
4324
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4325
+#define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a))
4326
+#endif
4327
+
4328
+SIMDE_FUNCTION_ATTRIBUTES
4329
+void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
4330
+{
4331
+#if defined(SIMDE_X86_SSE_NATIVE)
4332
+   _mm_stream_ps(mem_addr, a);
4333
+#elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && \
4334
+   defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4335
+   simde__m128_private a_ = simde__m128_to_private(a);
4336
+   __builtin_nontemporal_store(
4337
+       a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32) *, mem_addr));
4338
+#else
4339
+   simde_mm_store_ps(mem_addr, a);
4340
+#endif
4341
+}
4342
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4343
+#define _mm_stream_ps(mem_addr, a)                                      \
4344
+   simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST(              \
4345
+                  float *, simde_float32 *, mem_addr), \
4346
+              (a))
4347
+#endif
4348
+
4349
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4350
+#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)            \
4351
+   do {                                                      \
4352
+       float32x4x2_t ROW01 = vtrnq_f32(row0, row1);      \
4353
+       float32x4x2_t ROW23 = vtrnq_f32(row2, row3);      \
4354
+       row0 = vcombine_f32(vget_low_f32(ROW01.val[0]),   \
4355
+                   vget_low_f32(ROW23.val[0]));  \
4356
+       row1 = vcombine_f32(vget_low_f32(ROW01.val[1]),   \
4357
+                   vget_low_f32(ROW23.val[1]));  \
4358
+       row2 = vcombine_f32(vget_high_f32(ROW01.val[0]),  \
4359
+                   vget_high_f32(ROW23.val[0])); \
4360
+       row3 = vcombine_f32(vget_high_f32(ROW01.val[1]),  \
4361
+                   vget_high_f32(ROW23.val[1])); \
4362
+   } while (0)
4363
+#else
4364
+#define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)       \
4365
+   do {                                                 \
4366
+       simde__m128 tmp3, tmp2, tmp1, tmp0;          \
4367
+       tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
4368
+       tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
4369
+       tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
4370
+       tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
4371
+       row0 = simde_mm_movelh_ps(tmp0, tmp2);       \
4372
+       row1 = simde_mm_movehl_ps(tmp2, tmp0);       \
4373
+       row2 = simde_mm_movelh_ps(tmp1, tmp3);       \
4374
+       row3 = simde_mm_movehl_ps(tmp3, tmp1);       \
4375
+   } while (0)
4376
+#endif
4377
+#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES)
4378
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
4379
+   SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3)
4380
+#endif
4381
+
4382
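The transpose macro above rewrites its four row arguments in place, so they must be lvalues. A minimal usage sketch (illustrative only, not part of the upstream tarball; simde_mm_set_ps is assumed from earlier in this header):

   simde__m128 row0 = simde_mm_set_ps( 3.0f,  2.0f,  1.0f,  0.0f);
   simde__m128 row1 = simde_mm_set_ps( 7.0f,  6.0f,  5.0f,  4.0f);
   simde__m128 row2 = simde_mm_set_ps(11.0f, 10.0f,  9.0f,  8.0f);
   simde__m128 row3 = simde_mm_set_ps(15.0f, 14.0f, 13.0f, 12.0f);
   SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3);
   /* row0 now holds lane 0 of every original row: {0, 4, 8, 12} */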
+#if defined(_MM_EXCEPT_INVALID)
4383
+#define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID
4384
+#else
4385
+#define SIMDE_MM_EXCEPT_INVALID (0x0001)
4386
+#endif
4387
+#if defined(_MM_EXCEPT_DENORM)
4388
+#define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM
4389
+#else
4390
+#define SIMDE_MM_EXCEPT_DENORM (0x0002)
4391
+#endif
4392
+#if defined(_MM_EXCEPT_DIV_ZERO)
4393
+#define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO
4394
+#else
4395
+#define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004)
4396
+#endif
4397
+#if defined(_MM_EXCEPT_OVERFLOW)
4398
+#define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW
4399
+#else
4400
+#define SIMDE_MM_EXCEPT_OVERFLOW (0x0008)
4401
+#endif
4402
+#if defined(_MM_EXCEPT_UNDERFLOW)
4403
+#define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW
4404
+#else
4405
+#define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010)
4406
+#endif
4407
+#if defined(_MM_EXCEPT_INEXACT)
4408
+#define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT
4409
+#else
4410
+#define SIMDE_MM_EXCEPT_INEXACT (0x0020)
4411
+#endif
4412
+#if defined(_MM_EXCEPT_MASK)
4413
+#define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK
4414
+#else
4415
+#define SIMDE_MM_EXCEPT_MASK                                   \
4416
+   (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM |    \
4417
+    SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \
4418
+    SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT)
4419
+#endif
4420
+
4421
+#if defined(_MM_MASK_INVALID)
4422
+#define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID
4423
+#else
4424
+#define SIMDE_MM_MASK_INVALID (0x0080)
4425
+#endif
4426
+#if defined(_MM_MASK_DENORM)
4427
+#define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM
4428
+#else
4429
+#define SIMDE_MM_MASK_DENORM (0x0100)
4430
+#endif
4431
+#if defined(_MM_MASK_DIV_ZERO)
4432
+#define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO
4433
+#else
4434
+#define SIMDE_MM_MASK_DIV_ZERO (0x0200)
4435
+#endif
4436
+#if defined(_MM_MASK_OVERFLOW)
4437
+#define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW
4438
+#else
4439
+#define SIMDE_MM_MASK_OVERFLOW (0x0400)
4440
+#endif
4441
+#if defined(_MM_MASK_UNDERFLOW)
4442
+#define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW
4443
+#else
4444
+#define SIMDE_MM_MASK_UNDERFLOW (0x0800)
4445
+#endif
4446
+#if defined(_MM_MASK_INEXACT)
4447
+#define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT
4448
+#else
4449
+#define SIMDE_MM_MASK_INEXACT (0x1000)
4450
+#endif
4451
+#if defined(_MM_MASK_MASK)
4452
+#define SIMDE_MM_MASK_MASK _MM_MASK_MASK
4453
+#else
4454
+#define SIMDE_MM_MASK_MASK                                 \
4455
+   (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM |    \
4456
+    SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \
4457
+    SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT)
4458
+#endif
4459
+
4460
+#if defined(_MM_FLUSH_ZERO_MASK)
4461
+#define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK
4462
+#else
4463
+#define SIMDE_MM_FLUSH_ZERO_MASK (0x8000)
4464
+#endif
4465
+#if defined(_MM_FLUSH_ZERO_ON)
4466
+#define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON
4467
+#else
4468
+#define SIMDE_MM_FLUSH_ZERO_ON (0x8000)
4469
+#endif
4470
+#if defined(_MM_FLUSH_ZERO_OFF)
4471
+#define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF
4472
+#else
4473
+#define SIMDE_MM_FLUSH_ZERO_OFF (0x0000)
4474
+#endif
4475
+
4476
+SIMDE_END_DECLS_
4477
+
4478
+HEDLEY_DIAGNOSTIC_POP
4479
+
4480
+#endif /* !defined(SIMDE_X86_SSE_H) */
4481
obs-studio-26.1.1.tar.xz/libobs/util/simde/x86/sse2.h Added
7551
 
1
@@ -0,0 +1,7549 @@
2
+/* SPDX-License-Identifier: MIT
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person
5
+ * obtaining a copy of this software and associated documentation
6
+ * files (the "Software"), to deal in the Software without
7
+ * restriction, including without limitation the rights to use, copy,
8
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ * of the Software, and to permit persons to whom the Software is
10
+ * furnished to do so, subject to the following conditions:
11
+ *
12
+ * The above copyright notice and this permission notice shall be
13
+ * included in all copies or substantial portions of the Software.
14
+ *
15
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ * SOFTWARE.
23
+ *
24
+ * Copyright:
25
+ *   2017-2020 Evan Nemerson <evan@nemerson.com>
26
+ *   2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
27
+ *   2015      Brandon Rowlett <browlett@nvidia.com>
28
+ *   2015      Ken Fast <kfast@gdeb.com>
29
+ *   2017      Hasindu Gamaarachchi <hasindu@unsw.edu.au>
30
+ *   2018      Jeff Daily <jeff.daily@amd.com>
31
+ */
32
+
33
+#if !defined(SIMDE_X86_SSE2_H)
34
+#define SIMDE_X86_SSE2_H
35
+
36
+#include "sse.h"
37
+
38
+HEDLEY_DIAGNOSTIC_PUSH
39
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
40
+SIMDE_BEGIN_DECLS_
41
+
42
+typedef union {
43
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
44
+   SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
45
+   SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
46
+   SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
47
+   SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
48
+   SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
49
+   SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
50
+   SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
51
+   SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
52
+#if defined(SIMDE_HAVE_INT128_)
53
+   SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
54
+   SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
55
+#endif
56
+   SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
57
+   SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
58
+
59
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
60
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
61
+#else
62
+   SIMDE_ALIGN_TO_16 int8_t i8[16];
63
+   SIMDE_ALIGN_TO_16 int16_t i16[8];
64
+   SIMDE_ALIGN_TO_16 int32_t i32[4];
65
+   SIMDE_ALIGN_TO_16 int64_t i64[2];
66
+   SIMDE_ALIGN_TO_16 uint8_t u8[16];
67
+   SIMDE_ALIGN_TO_16 uint16_t u16[8];
68
+   SIMDE_ALIGN_TO_16 uint32_t u32[4];
69
+   SIMDE_ALIGN_TO_16 uint64_t u64[2];
70
+#if defined(SIMDE_HAVE_INT128_)
71
+   SIMDE_ALIGN_TO_16 simde_int128 i128[1];
72
+   SIMDE_ALIGN_TO_16 simde_uint128 u128[1];
73
+#endif
74
+   SIMDE_ALIGN_TO_16 simde_float32 f32[4];
75
+   SIMDE_ALIGN_TO_16 simde_float64 f64[2];
76
+
77
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
78
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
79
+#endif
80
+
81
+   SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
82
+   SIMDE_ALIGN_TO_16 simde__m64 m64[2];
83
+
84
+#if defined(SIMDE_X86_SSE2_NATIVE)
85
+   SIMDE_ALIGN_TO_16 __m128i n;
86
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
87
+   SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
88
+   SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
89
+   SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
90
+   SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
91
+   SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
92
+   SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
93
+   SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
94
+   SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
95
+   SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
96
+#if defined(SIMDE_ARCH_AARCH64)
97
+   SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
98
+#endif
99
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
100
+   SIMDE_ALIGN_TO_16 v128_t wasm_v128;
101
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
102
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
103
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
104
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
105
+#if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
106
+   SIMDE_ALIGN_TO_16
107
+   SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
108
+#else
109
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
110
+#endif
111
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
112
+   SIMDE_ALIGN_TO_16
113
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
114
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
115
+#if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
116
+   SIMDE_ALIGN_TO_16
117
+   SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
118
+#else
119
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
120
+#endif
121
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
122
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
123
+   SIMDE_ALIGN_TO_16
124
+   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
125
+   SIMDE_ALIGN_TO_16
126
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
127
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
128
+#endif
129
+#endif
130
+} simde__m128i_private;
131
+
132
+typedef union {
133
+#if defined(SIMDE_VECTOR_SUBSCRIPT)
134
+   SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
135
+   SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
136
+   SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
137
+   SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
138
+   SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
139
+   SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
140
+   SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
141
+   SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
142
+   SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
143
+   SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
144
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
145
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
146
+#else
147
+   SIMDE_ALIGN_TO_16 int8_t i8[16];
148
+   SIMDE_ALIGN_TO_16 int16_t i16[8];
149
+   SIMDE_ALIGN_TO_16 int32_t i32[4];
150
+   SIMDE_ALIGN_TO_16 int64_t i64[2];
151
+   SIMDE_ALIGN_TO_16 uint8_t u8[16];
152
+   SIMDE_ALIGN_TO_16 uint16_t u16[8];
153
+   SIMDE_ALIGN_TO_16 uint32_t u32[4];
154
+   SIMDE_ALIGN_TO_16 uint64_t u64[2];
155
+   SIMDE_ALIGN_TO_16 simde_float32 f32[4];
156
+   SIMDE_ALIGN_TO_16 simde_float64 f64[2];
157
+   SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)];
158
+   SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)];
159
+#endif
160
+
161
+   SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2];
162
+   SIMDE_ALIGN_TO_16 simde__m64 m64[2];
163
+
164
+#if defined(SIMDE_X86_SSE2_NATIVE)
165
+   SIMDE_ALIGN_TO_16 __m128d n;
166
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
167
+   SIMDE_ALIGN_TO_16 int8x16_t neon_i8;
168
+   SIMDE_ALIGN_TO_16 int16x8_t neon_i16;
169
+   SIMDE_ALIGN_TO_16 int32x4_t neon_i32;
170
+   SIMDE_ALIGN_TO_16 int64x2_t neon_i64;
171
+   SIMDE_ALIGN_TO_16 uint8x16_t neon_u8;
172
+   SIMDE_ALIGN_TO_16 uint16x8_t neon_u16;
173
+   SIMDE_ALIGN_TO_16 uint32x4_t neon_u32;
174
+   SIMDE_ALIGN_TO_16 uint64x2_t neon_u64;
175
+   SIMDE_ALIGN_TO_16 float32x4_t neon_f32;
176
+#if defined(SIMDE_ARCH_AARCH64)
177
+   SIMDE_ALIGN_TO_16 float64x2_t neon_f64;
178
+#endif
179
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
180
+   SIMDE_ALIGN_TO_16 v128_t wasm_v128;
181
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
182
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8;
183
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16;
184
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32;
185
+#if defined(__INT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
186
+   SIMDE_ALIGN_TO_16
187
+   SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f;
188
+#else
189
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f;
190
+#endif
191
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8;
192
+   SIMDE_ALIGN_TO_16
193
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16;
194
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32;
195
+#if defined(__UINT_FAST32_TYPE__) && defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
196
+   SIMDE_ALIGN_TO_16
197
+   SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f;
198
+#else
199
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f;
200
+#endif
201
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32;
202
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
203
+   SIMDE_ALIGN_TO_16
204
+   SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64;
205
+   SIMDE_ALIGN_TO_16
206
+   SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64;
207
+   SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64;
208
+#endif
209
+#endif
210
+} simde__m128d_private;
211
+
212
+#if defined(SIMDE_X86_SSE2_NATIVE)
213
+typedef __m128i simde__m128i;
214
+typedef __m128d simde__m128d;
215
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
216
+typedef int64x2_t simde__m128i;
217
+#if defined(SIMDE_ARCH_AARCH64)
218
+typedef float64x2_t simde__m128d;
219
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
220
+typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
221
+#else
222
+typedef simde__m128d_private simde__m128d;
223
+#endif
224
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
225
+typedef v128_t simde__m128i;
226
+typedef v128_t simde__m128d;
227
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
228
+typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i;
229
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
230
+typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d;
231
+#else
232
+typedef simde__m128d_private simde__m128d;
233
+#endif
234
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
235
+typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
236
+typedef simde_float64
237
+   simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS;
238
+#else
239
+typedef simde__m128i_private simde__m128i;
240
+typedef simde__m128d_private simde__m128d;
241
+#endif
242
+
243
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
244
+typedef simde__m128i __m128i;
245
+typedef simde__m128d __m128d;
246
+#endif
247
+
248
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
249
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private),
250
+            "simde__m128i_private size incorrect");
251
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
252
+HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private),
253
+            "simde__m128d_private size incorrect");
254
+#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF)
255
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16,
256
+            "simde__m128i is not 16-byte aligned");
257
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16,
258
+            "simde__m128i_private is not 16-byte aligned");
259
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16,
260
+            "simde__m128d is not 16-byte aligned");
261
+HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16,
262
+            "simde__m128d_private is not 16-byte aligned");
263
+#endif
264
+
265
+SIMDE_FUNCTION_ATTRIBUTES
266
+simde__m128i simde__m128i_from_private(simde__m128i_private v)
267
+{
268
+   simde__m128i r;
269
+   simde_memcpy(&r, &v, sizeof(r));
270
+   return r;
271
+}
272
+
273
+SIMDE_FUNCTION_ATTRIBUTES
274
+simde__m128i_private simde__m128i_to_private(simde__m128i v)
275
+{
276
+   simde__m128i_private r;
277
+   simde_memcpy(&r, &v, sizeof(r));
278
+   return r;
279
+}
280
+
281
+SIMDE_FUNCTION_ATTRIBUTES
282
+simde__m128d simde__m128d_from_private(simde__m128d_private v)
283
+{
284
+   simde__m128d r;
285
+   simde_memcpy(&r, &v, sizeof(r));
286
+   return r;
287
+}
288
+
289
+SIMDE_FUNCTION_ATTRIBUTES
290
+simde__m128d_private simde__m128d_to_private(simde__m128d v)
291
+{
292
+   simde__m128d_private r;
293
+   simde_memcpy(&r, &v, sizeof(r));
294
+   return r;
295
+}
296
+
297
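A hedged sketch of how the portable fallback paths below use these conversion helpers: convert to the private view, touch individual lanes through the union, then convert back (illustrative only, not part of the upstream tarball):

   static simde__m128i increment_lane0(simde__m128i v)
   {
      simde__m128i_private v_ = simde__m128i_to_private(v);
      v_.i32[0] += 1; /* direct lane access through the union */
      return simde__m128i_from_private(v_);
   }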
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
298
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8)
299
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16)
300
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32)
301
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64)
302
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8)
303
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16)
304
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32)
305
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64)
306
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32)
307
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
308
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64)
309
+#endif
310
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
311
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,
312
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed char),
313
+                      altivec, i8)
314
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,
315
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed short),
316
+                      altivec, i16)
317
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,
318
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed int),
319
+                      altivec, i32)
320
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
321
+   m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
322
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
323
+   m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
324
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i,
325
+                      SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
326
+                      altivec, u32)
327
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
328
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
329
+   m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
330
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
331
+   m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
332
+#endif
333
+#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
334
+
335
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
336
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8)
337
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16)
338
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32)
339
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64)
340
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8)
341
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16)
342
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32)
343
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64)
344
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32)
345
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
346
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, f64)
347
+#endif
348
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
349
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d,
350
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed char),
351
+                      altivec, i8)
352
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d,
353
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed short),
354
+                      altivec, i16)
355
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d,
356
+                      SIMDE_POWER_ALTIVEC_VECTOR(signed int),
357
+                      altivec, i32)
358
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
359
+   m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8)
360
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
361
+   m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16)
362
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d,
363
+                      SIMDE_POWER_ALTIVEC_VECTOR(unsigned int),
364
+                      altivec, u32)
365
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
366
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
367
+   m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64)
368
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(
369
+   m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64)
370
+#if defined(SIMDE_BUG_GCC_95782)
371
+SIMDE_FUNCTION_ATTRIBUTES
372
+SIMDE_POWER_ALTIVEC_VECTOR(double)
373
+simde__m128d_to_altivec_f64(simde__m128d value)
374
+{
375
+   simde__m128d_private r_ = simde__m128d_to_private(value);
376
+   return r_.altivec_f64;
377
+}
378
+
379
+SIMDE_FUNCTION_ATTRIBUTES
380
+simde__m128d simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double)
381
+                          value)
382
+{
383
+   simde__m128d_private r_;
384
+   r_.altivec_f64 = value;
385
+   return simde__m128d_from_private(r_);
386
+}
387
+#else
388
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d,
389
+                      SIMDE_POWER_ALTIVEC_VECTOR(double),
390
+                      altivec, f64)
391
+#endif
392
+#endif
393
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
394
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128);
395
+SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128);
396
+#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */
397
+
398
+SIMDE_FUNCTION_ATTRIBUTES
399
+simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
400
+{
401
+#if defined(SIMDE_X86_SSE2_NATIVE)
402
+   return _mm_set_pd(e1, e0);
403
+#else
404
+   simde__m128d_private r_;
405
+
406
+#if defined(SIMDE_WASM_SIMD128_NATIVE)
407
+   r_.wasm_v128 = wasm_f64x2_make(e0, e1);
408
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
409
+   SIMDE_ALIGN_TO_16 simde_float64 data[2] = {e0, e1};
410
+   r_.neon_f64 = vld1q_f64(data);
411
+#else
412
+   r_.f64[0] = e0;
413
+   r_.f64[1] = e1;
414
+#endif
415
+
416
+   return simde__m128d_from_private(r_);
417
+#endif
418
+}
419
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
420
+#define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0)
421
+#endif
422
+
423
+SIMDE_FUNCTION_ATTRIBUTES
424
+simde__m128d simde_mm_set1_pd(simde_float64 a)
425
+{
426
+#if defined(SIMDE_X86_SSE2_NATIVE)
427
+   return _mm_set1_pd(a);
428
+#else
429
+   simde__m128d_private r_;
430
+
431
+#if defined(SIMDE_WASM_SIMD128_NATIVE)
432
+   r_.wasm_v128 = wasm_f64x2_splat(a);
433
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
434
+   r_.neon_f64 = vdupq_n_f64(a);
435
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
436
+   r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a));
437
+#else
438
+   SIMDE_VECTORIZE
439
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
440
+       r_.f64[i] = a;
441
+   }
442
+#endif
443
+
444
+   return simde__m128d_from_private(r_);
445
+#endif
446
+}
447
+#define simde_mm_set_pd1(a) simde_mm_set1_pd(a)
448
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
449
+#define _mm_set1_pd(a) simde_mm_set1_pd(a)
450
+#define _mm_set_pd1(a) simde_mm_set1_pd(a)
451
+#endif
452
+
453
+SIMDE_FUNCTION_ATTRIBUTES
454
+simde__m128d simde_x_mm_abs_pd(simde__m128d a)
455
+{
456
+#if defined(SIMDE_X86_AVX512F_NATIVE) && \
457
+   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7, 4, 0))
458
+   return _mm512_castpd512_pd128(_mm512_abs_pd(_mm512_castpd128_pd512(a)));
459
+#else
460
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
461
+
462
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
463
+   r_.neon_f32 = vabsq_f32(a_.neon_f32);
464
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
465
+   r_.altivec_f32 = vec_abs(a_.altivec_f32);
466
+#else
467
+   SIMDE_VECTORIZE
468
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
469
+       r_.f64[i] = simde_math_fabs(a_.f64[i]);
470
+   }
471
+#endif
472
+
473
+   return simde__m128d_from_private(r_);
474
+#endif
475
+}
476
+
477
+SIMDE_FUNCTION_ATTRIBUTES
478
+simde__m128d simde_x_mm_not_pd(simde__m128d a)
479
+{
480
+#if defined(SIMDE_X86_AVX512VL_NATIVE)
481
+   __m128i ai = _mm_castpd_si128(a);
482
+   return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55));
483
+#else
484
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
485
+
486
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
487
+   r_.neon_i32 = vmvnq_s32(a_.neon_i32);
488
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
489
+   r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64);
490
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
491
+   r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
492
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
493
+   r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
494
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
495
+   r_.i32f = ~a_.i32f;
496
+#else
497
+   SIMDE_VECTORIZE
498
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
499
+       r_.i32f[i] = ~(a_.i32f[i]);
500
+   }
501
+#endif
502
+
503
+   return simde__m128d_from_private(r_);
504
+#endif
505
+}
506
+
507
+SIMDE_FUNCTION_ATTRIBUTES
508
+simde__m128d simde_x_mm_select_pd(simde__m128d a, simde__m128d b,
509
+                 simde__m128d mask)
510
+{
511
+/* This function is for when you want to blend two elements together
512
+   * according to a mask.  It is similar to _mm_blendv_pd, except that
513
+   * it is undefined whether the blend is based on the highest bit in
514
+   * each lane (like blendv) or just bitwise operations.  This allows
515
+   * us to implement the function efficiently everywhere.
516
+   *
517
+   * Basically, you promise that all the lanes in mask are either 0 or
518
+   * ~0. */
519
+#if defined(SIMDE_X86_SSE4_1_NATIVE)
520
+   return _mm_blendv_pd(a, b, mask);
521
+#else
522
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
523
+                b_ = simde__m128d_to_private(b),
524
+                mask_ = simde__m128d_to_private(mask);
525
+
526
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
527
+   r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64);
528
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
529
+   r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64);
530
+#else
531
+   SIMDE_VECTORIZE
532
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
533
+       r_.i64[i] = a_.i64[i] ^
534
+               ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]);
535
+   }
536
+#endif
537
+
538
+   return simde__m128d_from_private(r_);
539
+#endif
540
+}
541
+
542
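A brief usage sketch for simde_x_mm_select_pd (illustrative only, not part of the upstream tarball): the caller promises a mask whose 64-bit lanes are either all zeros or all ones, typically the result of a comparison; simde_mm_cmplt_pd is assumed to be available from later in this header.

   simde__m128d a    = simde_mm_set_pd(1.0, 2.0);
   simde__m128d b    = simde_mm_set_pd(10.0, 20.0);
   simde__m128d mask = simde_mm_cmplt_pd(a, b);           /* all-ones lanes where a < b */
   simde__m128d r    = simde_x_mm_select_pd(a, b, mask);  /* takes b where mask is ~0, else a */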
+SIMDE_FUNCTION_ATTRIBUTES
543
+simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
544
+{
545
+#if defined(SIMDE_X86_SSE2_NATIVE)
546
+   return _mm_add_epi8(a, b);
547
+#else
548
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
549
+                b_ = simde__m128i_to_private(b);
550
+
551
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
552
+   r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8);
553
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
554
+   r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8);
555
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
556
+   r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128);
557
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
558
+   r_.i8 = a_.i8 + b_.i8;
559
+#else
560
+   SIMDE_VECTORIZE
561
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
562
+       r_.i8[i] = a_.i8[i] + b_.i8[i];
563
+   }
564
+#endif
565
+
566
+   return simde__m128i_from_private(r_);
567
+#endif
568
+}
569
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
570
+#define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b)
571
+#endif
572
+
573
+SIMDE_FUNCTION_ATTRIBUTES
574
+simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
575
+{
576
+#if defined(SIMDE_X86_SSE2_NATIVE)
577
+   return _mm_add_epi16(a, b);
578
+#else
579
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
580
+                b_ = simde__m128i_to_private(b);
581
+
582
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
583
+   r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16);
584
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
585
+   r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16);
586
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
587
+   r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128);
588
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
589
+   r_.i16 = a_.i16 + b_.i16;
590
+#else
591
+   SIMDE_VECTORIZE
592
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
593
+       r_.i16[i] = a_.i16[i] + b_.i16[i];
594
+   }
595
+#endif
596
+
597
+   return simde__m128i_from_private(r_);
598
+#endif
599
+}
600
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
601
+#define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b)
602
+#endif
603
+
604
+SIMDE_FUNCTION_ATTRIBUTES
605
+simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
606
+{
607
+#if defined(SIMDE_X86_SSE2_NATIVE)
608
+   return _mm_add_epi32(a, b);
609
+#else
610
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
611
+                b_ = simde__m128i_to_private(b);
612
+
613
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
614
+   r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32);
615
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
616
+   r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32);
617
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
618
+   r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128);
619
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
620
+   r_.i32 = a_.i32 + b_.i32;
621
+#else
622
+   SIMDE_VECTORIZE
623
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
624
+       r_.i32[i] = a_.i32[i] + b_.i32[i];
625
+   }
626
+#endif
627
+
628
+   return simde__m128i_from_private(r_);
629
+#endif
630
+}
631
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
632
+#define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b)
633
+#endif
634
+
635
+SIMDE_FUNCTION_ATTRIBUTES
636
+simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
637
+{
638
+#if defined(SIMDE_X86_SSE2_NATIVE)
639
+   return _mm_add_epi64(a, b);
640
+#else
641
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
642
+                b_ = simde__m128i_to_private(b);
643
+
644
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
645
+   r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64);
646
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
647
+   r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64);
648
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
649
+   r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128);
650
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
651
+   r_.i64 = a_.i64 + b_.i64;
652
+#else
653
+   SIMDE_VECTORIZE
654
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
655
+       r_.i64[i] = a_.i64[i] + b_.i64[i];
656
+   }
657
+#endif
658
+
659
+   return simde__m128i_from_private(r_);
660
+#endif
661
+}
662
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
663
+#define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b)
664
+#endif
665
+
666
+SIMDE_FUNCTION_ATTRIBUTES
667
+simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
668
+{
669
+#if defined(SIMDE_X86_SSE2_NATIVE)
670
+   return _mm_add_pd(a, b);
671
+#else
672
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
673
+                b_ = simde__m128d_to_private(b);
674
+
675
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
676
+   r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64);
677
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
678
+   r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
679
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
680
+   r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64);
681
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
682
+   r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128);
683
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
684
+   r_.f64 = a_.f64 + b_.f64;
685
+#else
686
+   SIMDE_VECTORIZE
687
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
688
+       r_.f64[i] = a_.f64[i] + b_.f64[i];
689
+   }
690
+#endif
691
+
692
+   return simde__m128d_from_private(r_);
693
+#endif
694
+}
695
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
696
+#define _mm_add_pd(a, b) simde_mm_add_pd(a, b)
697
+#endif
698
+
699
+SIMDE_FUNCTION_ATTRIBUTES
700
+simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
701
+{
702
+#if defined(SIMDE_X86_SSE2_NATIVE)
703
+   return _mm_move_sd(a, b);
704
+#else
705
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
706
+                b_ = simde__m128d_to_private(b);
707
+
708
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
709
+   r_.neon_f64 =
710
+       vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0);
711
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
712
+#if defined(HEDLEY_IBM_VERSION)
713
+   r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1);
714
+#else
715
+   r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1);
716
+#endif
717
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
718
+   r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1);
719
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
720
+   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1);
721
+#else
722
+   r_.f64[0] = b_.f64[0];
723
+   r_.f64[1] = a_.f64[1];
724
+#endif
725
+
726
+   return simde__m128d_from_private(r_);
727
+#endif
728
+}
729
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
730
+#define _mm_move_sd(a, b) simde_mm_move_sd(a, b)
731
+#endif
732
+
733
+SIMDE_FUNCTION_ATTRIBUTES
734
+simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
735
+{
736
+#if defined(SIMDE_X86_SSE2_NATIVE)
737
+   return _mm_add_sd(a, b);
738
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
739
+   return simde_mm_move_sd(a, simde_mm_add_pd(a, b));
740
+#else
741
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
742
+                b_ = simde__m128d_to_private(b);
743
+
744
+   r_.f64[0] = a_.f64[0] + b_.f64[0];
745
+   r_.f64[1] = a_.f64[1];
746
+
747
+   return simde__m128d_from_private(r_);
748
+#endif
749
+}
750
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
751
+#define _mm_add_sd(a, b) simde_mm_add_sd(a, b)
752
+#endif
753
+
754
+SIMDE_FUNCTION_ATTRIBUTES
755
+simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
756
+{
757
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
758
+   return _mm_add_si64(a, b);
759
+#else
760
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
761
+                  b_ = simde__m64_to_private(b);
762
+
763
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
764
+   r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64);
765
+#else
766
+   r_.i64[0] = a_.i64[0] + b_.i64[0];
767
+#endif
768
+
769
+   return simde__m64_from_private(r_);
770
+#endif
771
+}
772
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
773
+#define _mm_add_si64(a, b) simde_mm_add_si64(a, b)
774
+#endif
775
+
776
+SIMDE_FUNCTION_ATTRIBUTES
777
+simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
778
+{
779
+#if defined(SIMDE_X86_SSE2_NATIVE)
780
+   return _mm_adds_epi8(a, b);
781
+#else
782
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
783
+                b_ = simde__m128i_to_private(b);
784
+
785
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
786
+   r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8);
787
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
788
+   r_.wasm_v128 = wasm_i8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
789
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
790
+   r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8);
791
+#else
792
+   SIMDE_VECTORIZE
793
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
794
+       const int_fast16_t tmp =
795
+           HEDLEY_STATIC_CAST(int_fast16_t, a_.i8[i]) +
796
+           HEDLEY_STATIC_CAST(int_fast16_t, b_.i8[i]);
797
+       r_.i8[i] = HEDLEY_STATIC_CAST(
798
+           int8_t,
799
+           ((tmp < INT8_MAX) ? ((tmp > INT8_MIN) ? tmp : INT8_MIN)
800
+                     : INT8_MAX));
801
+   }
802
+#endif
803
+
804
+   return simde__m128i_from_private(r_);
805
+#endif
806
+}
807
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
808
+#define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b)
809
+#endif
810
+
811
+SIMDE_FUNCTION_ATTRIBUTES
812
+simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
813
+{
814
+#if defined(SIMDE_X86_SSE2_NATIVE)
815
+   return _mm_adds_epi16(a, b);
816
+#else
817
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
818
+                b_ = simde__m128i_to_private(b);
819
+
820
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
821
+   r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16);
822
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
823
+   r_.wasm_v128 = wasm_i16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
824
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
825
+   r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16);
826
+#else
827
+   SIMDE_VECTORIZE
828
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
829
+       const int_fast32_t tmp =
830
+           HEDLEY_STATIC_CAST(int_fast32_t, a_.i16[i]) +
831
+           HEDLEY_STATIC_CAST(int_fast32_t, b_.i16[i]);
832
+       r_.i16[i] = HEDLEY_STATIC_CAST(
833
+           int16_t,
834
+           ((tmp < INT16_MAX)
835
+                ? ((tmp > INT16_MIN) ? tmp : INT16_MIN)
836
+                : INT16_MAX));
837
+   }
838
+#endif
839
+
840
+   return simde__m128i_from_private(r_);
841
+#endif
842
+}
843
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
844
+#define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b)
845
+#endif
846
+
847
+SIMDE_FUNCTION_ATTRIBUTES
848
+simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
849
+{
850
+#if defined(SIMDE_X86_SSE2_NATIVE)
851
+   return _mm_adds_epu8(a, b);
852
+#else
853
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
854
+                b_ = simde__m128i_to_private(b);
855
+
856
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
857
+   r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8);
858
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
859
+   r_.wasm_v128 = wasm_u8x16_add_saturate(a_.wasm_v128, b_.wasm_v128);
860
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
861
+   r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8);
862
+#else
863
+   SIMDE_VECTORIZE
864
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
865
+       r_.u8[i] = ((UINT8_MAX - a_.u8[i]) > b_.u8[i])
866
+                  ? (a_.u8[i] + b_.u8[i])
867
+                  : UINT8_MAX;
868
+   }
869
+#endif
870
+
871
+   return simde__m128i_from_private(r_);
872
+#endif
873
+}
874
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
875
+#define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b)
876
+#endif
877
+
878
+SIMDE_FUNCTION_ATTRIBUTES
879
+simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
880
+{
881
+#if defined(SIMDE_X86_SSE2_NATIVE)
882
+   return _mm_adds_epu16(a, b);
883
+#else
884
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
885
+                b_ = simde__m128i_to_private(b);
886
+
887
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
888
+   r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16);
889
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
890
+   r_.wasm_v128 = wasm_u16x8_add_saturate(a_.wasm_v128, b_.wasm_v128);
891
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
892
+   r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16);
893
+#else
894
+   SIMDE_VECTORIZE
895
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
896
+       r_.u16[i] = ((UINT16_MAX - a_.u16[i]) > b_.u16[i])
897
+                   ? (a_.u16[i] + b_.u16[i])
898
+                   : UINT16_MAX;
899
+   }
900
+#endif
901
+
902
+   return simde__m128i_from_private(r_);
903
+#endif
904
+}
905
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
906
+#define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b)
907
+#endif
908
+
909
+SIMDE_FUNCTION_ATTRIBUTES
910
+simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
911
+{
912
+#if defined(SIMDE_X86_SSE2_NATIVE)
913
+   return _mm_and_pd(a, b);
914
+#else
915
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
916
+                b_ = simde__m128d_to_private(b);
917
+
918
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
919
+   r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32);
920
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
921
+   r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128);
922
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
923
+   r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64);
924
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
925
+   r_.i32f = a_.i32f & b_.i32f;
926
+#else
927
+   SIMDE_VECTORIZE
928
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
929
+       r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
930
+   }
931
+#endif
932
+
933
+   return simde__m128d_from_private(r_);
934
+#endif
935
+}
936
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
937
+#define _mm_and_pd(a, b) simde_mm_and_pd(a, b)
938
+#endif
939
+
940
+SIMDE_FUNCTION_ATTRIBUTES
941
+simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
942
+{
943
+#if defined(SIMDE_X86_SSE2_NATIVE)
944
+   return _mm_and_si128(a, b);
945
+#else
946
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
947
+                b_ = simde__m128i_to_private(b);
948
+
949
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
950
+   r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32);
951
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
952
+   r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f);
953
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
954
+   r_.i32f = a_.i32f & b_.i32f;
955
+#else
956
+   SIMDE_VECTORIZE
957
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
958
+       r_.i32f[i] = a_.i32f[i] & b_.i32f[i];
959
+   }
960
+#endif
961
+
962
+   return simde__m128i_from_private(r_);
963
+#endif
964
+}
965
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
966
+#define _mm_and_si128(a, b) simde_mm_and_si128(a, b)
967
+#endif
968
+
969
+SIMDE_FUNCTION_ATTRIBUTES
970
+simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
971
+{
972
+#if defined(SIMDE_X86_SSE2_NATIVE)
973
+   return _mm_andnot_pd(a, b);
974
+#else
975
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
976
+                b_ = simde__m128d_to_private(b);
977
+
978
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
979
+   r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
980
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
981
+   r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128);
982
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
983
+   r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64);
984
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
985
+   r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f);
986
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
987
+   r_.i32f = ~a_.i32f & b_.i32f;
988
+#else
989
+   SIMDE_VECTORIZE
990
+   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
991
+       r_.u64[i] = ~a_.u64[i] & b_.u64[i];
992
+   }
993
+#endif
994
+
995
+   return simde__m128d_from_private(r_);
996
+#endif
997
+}
998
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
999
+#define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b)
1000
+#endif
1001
+
1002
+SIMDE_FUNCTION_ATTRIBUTES
1003
+simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
1004
+{
1005
+#if defined(SIMDE_X86_SSE2_NATIVE)
1006
+   return _mm_andnot_si128(a, b);
1007
+#else
1008
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1009
+                b_ = simde__m128i_to_private(b);
1010
+
1011
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1012
+   r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32);
1013
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1014
+   r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32);
1015
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1016
+   r_.i32f = ~a_.i32f & b_.i32f;
1017
+#else
1018
+   SIMDE_VECTORIZE
1019
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
1020
+       r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i];
1021
+   }
1022
+#endif
1023
+
1024
+   return simde__m128i_from_private(r_);
1025
+#endif
1026
+}
1027
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1028
+#define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b)
1029
+#endif
1030
+
1031
+SIMDE_FUNCTION_ATTRIBUTES
1032
+simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
1033
+{
1034
+#if defined(SIMDE_X86_SSE2_NATIVE)
1035
+   return _mm_xor_pd(a, b);
1036
+#else
1037
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1038
+                b_ = simde__m128d_to_private(b);
1039
+
1040
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1041
+   r_.i32f = a_.i32f ^ b_.i32f;
1042
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1043
+   r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128);
1044
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1045
+   r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64);
1046
+#else
1047
+   SIMDE_VECTORIZE
1048
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
1049
+       r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
1050
+   }
1051
+#endif
1052
+
1053
+   return simde__m128d_from_private(r_);
1054
+#endif
1055
+}
1056
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1057
+#define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b)
1058
+#endif
1059
+
1060
+SIMDE_FUNCTION_ATTRIBUTES
1061
+simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
1062
+{
1063
+#if defined(SIMDE_X86_SSE2_NATIVE)
1064
+   return _mm_avg_epu8(a, b);
1065
+#else
1066
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1067
+                b_ = simde__m128i_to_private(b);
1068
+
1069
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1070
+   r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8);
1071
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1072
+   r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128);
1073
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1074
+   r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8);
1075
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
1076
+   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
1077
+   defined(SIMDE_CONVERT_VECTOR_)
1078
+   uint16_t wa SIMDE_VECTOR(32);
1079
+   uint16_t wb SIMDE_VECTOR(32);
1080
+   uint16_t wr SIMDE_VECTOR(32);
1081
+   SIMDE_CONVERT_VECTOR_(wa, a_.u8);
1082
+   SIMDE_CONVERT_VECTOR_(wb, b_.u8);
1083
+   wr = (wa + wb + 1) >> 1;
1084
+   SIMDE_CONVERT_VECTOR_(r_.u8, wr);
1085
+#else
1086
+   SIMDE_VECTORIZE
1087
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
1088
+       r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1;
1089
+   }
1090
+#endif
1091
+
1092
+   return simde__m128i_from_private(r_);
1093
+#endif
1094
+}
1095
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1096
+#define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b)
1097
+#endif
1098
+
1099
+SIMDE_FUNCTION_ATTRIBUTES
1100
+simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
1101
+{
1102
+#if defined(SIMDE_X86_SSE2_NATIVE)
1103
+   return _mm_avg_epu16(a, b);
1104
+#else
1105
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1106
+                b_ = simde__m128i_to_private(b);
1107
+
1108
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1109
+   r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16);
1110
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1111
+   r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128);
1112
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1113
+   r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16);
1114
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) &&      \
1115
+   defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && \
1116
+   defined(SIMDE_CONVERT_VECTOR_)
1117
+   uint32_t wa SIMDE_VECTOR(32);
1118
+   uint32_t wb SIMDE_VECTOR(32);
1119
+   uint32_t wr SIMDE_VECTOR(32);
1120
+   SIMDE_CONVERT_VECTOR_(wa, a_.u16);
1121
+   SIMDE_CONVERT_VECTOR_(wb, b_.u16);
1122
+   wr = (wa + wb + 1) >> 1;
1123
+   SIMDE_CONVERT_VECTOR_(r_.u16, wr);
1124
+#else
1125
+   SIMDE_VECTORIZE
1126
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
1127
+       r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1;
1128
+   }
1129
+#endif
1130
+
1131
+   return simde__m128i_from_private(r_);
1132
+#endif
1133
+}
1134
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1135
+#define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b)
1136
+#endif
1137
+
1138
+SIMDE_FUNCTION_ATTRIBUTES
1139
+simde__m128i simde_mm_setzero_si128(void)
1140
+{
1141
+#if defined(SIMDE_X86_SSE2_NATIVE)
1142
+   return _mm_setzero_si128();
1143
+#else
1144
+   simde__m128i_private r_;
1145
+
1146
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1147
+   r_.neon_i32 = vdupq_n_s32(0);
1148
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1149
+   r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0));
1150
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1151
+   r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0));
1152
+#elif defined(SIMDE_VECTOR_SUBSCRIPT)
1153
+   r_.i32 = __extension__(__typeof__(r_.i32)){0, 0, 0, 0};
1154
+#else
1155
+   SIMDE_VECTORIZE
1156
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
1157
+       r_.i32f[i] = 0;
1158
+   }
1159
+#endif
1160
+
1161
+   return simde__m128i_from_private(r_);
1162
+#endif
1163
+}
1164
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1165
+#define _mm_setzero_si128() (simde_mm_setzero_si128())
1166
+#endif
1167
+
1168
+SIMDE_FUNCTION_ATTRIBUTES
1169
+simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
1170
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
1171
+{
1172
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
1173
+
1174
+   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1175
+       return simde_mm_setzero_si128();
1176
+   }
1177
+
1178
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1179
+   r_.altivec_i8 =
1180
+#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1181
+       vec_slo
1182
+#else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1183
+       vec_sro
1184
+#endif
1185
+       (a_.altivec_i8,
1186
+        vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1187
+#elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1188
+   r_.u128[0] = a_.u128[0] << (imm8 * 8);
1189
+#else
1190
+   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
1191
+   for (int i = imm8;
1192
+        i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0]));
1193
+        i++) {
1194
+       r_.i8[i] = a_.i8[i - imm8];
1195
+   }
1196
+#endif
1197
+
1198
+   return simde__m128i_from_private(r_);
1199
+}
1200
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1201
+#define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8)
1202
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1203
+#define simde_mm_bslli_si128(a, imm8)                                      \
1204
+   simde__m128i_from_neon_i8(                                         \
1205
+       ((imm8) <= 0)                                              \
1206
+           ? simde__m128i_to_neon_i8(a)                       \
1207
+           : (((imm8) > 15)                                   \
1208
+                  ? (vdupq_n_s8(0))                       \
1209
+                  : (vextq_s8(vdupq_n_s8(0),              \
1210
+                          simde__m128i_to_neon_i8(a), \
1211
+                          16 - (imm8)))))
1212
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1213
+#define simde_mm_bslli_si128(a, imm8)                                          \
1214
+   (__extension__({                                                       \
1215
+       const simde__m128i_private simde__tmp_a_ =                     \
1216
+           simde__m128i_to_private(a);                            \
1217
+       const simde__m128i_private simde__tmp_z_ =                     \
1218
+           simde__m128i_to_private(simde_mm_setzero_si128());     \
1219
+       simde__m128i_private simde__tmp_r_;                            \
1220
+       if (HEDLEY_UNLIKELY(imm8 > 15)) {                              \
1221
+           simde__tmp_r_ = simde__m128i_to_private(               \
1222
+               simde_mm_setzero_si128());                     \
1223
+       } else {                                                       \
1224
+           simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_(              \
1225
+               8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8,   \
1226
+               HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31),  \
1227
+               HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31),  \
1228
+               HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31),  \
1229
+               HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31),  \
1230
+               HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31),  \
1231
+               HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31),  \
1232
+               HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31),  \
1233
+               HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31),  \
1234
+               HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31),  \
1235
+               HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31),  \
1236
+               HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31),  \
1237
+               HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31),  \
1238
+               HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31),  \
1239
+               HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31),  \
1240
+               HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31),  \
1241
+               HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \
1242
+       }                                                              \
1243
+       simde__m128i_from_private(simde__tmp_r_);                      \
1244
+   }))
1245
+#endif
1246
+#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1247
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1248
+#define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1249
+#define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
1250
+#endif
1251
+
1252
+SIMDE_FUNCTION_ATTRIBUTES
1253
+simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
1254
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
1255
+{
1256
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
1257
+
1258
+   if (HEDLEY_UNLIKELY((imm8 & ~15))) {
1259
+       return simde_mm_setzero_si128();
1260
+   }
1261
+
1262
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER)
1263
+   r_.altivec_i8 =
1264
+#if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
1265
+       vec_sro
1266
+#else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */
1267
+       vec_slo
1268
+#endif
1269
+       (a_.altivec_i8,
1270
+        vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8)));
1271
+#else
1272
+   SIMDE_VECTORIZE
1273
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1274
+       const int e = HEDLEY_STATIC_CAST(int, i) + imm8;
1275
+       r_.i8[i] = (e < 16) ? a_.i8[e] : 0;
1276
+   }
1277
+#endif
1278
+
1279
+   return simde__m128i_from_private(r_);
1280
+}
1281
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
1282
+#define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8)
1283
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__)
1284
+#define simde_mm_bsrli_si128(a, imm8)                                   \
1285
+   simde__m128i_from_neon_i8(                                      \
1286
+       ((imm8 < 0) || (imm8 > 15))                             \
1287
+           ? vdupq_n_s8(0)                                 \
1288
+           : (vextq_s8(simde__m128i_to_private(a).neon_i8, \
1289
+                   vdupq_n_s8(0),                      \
1290
+                   ((imm8 & 15) != 0) ? imm8 : (imm8 & 15))))
1291
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1292
+#define simde_mm_bsrli_si128(a, imm8)                                          \
1293
+   (__extension__({                                                       \
1294
+       const simde__m128i_private simde__tmp_a_ =                     \
1295
+           simde__m128i_to_private(a);                            \
1296
+       const simde__m128i_private simde__tmp_z_ =                     \
1297
+           simde__m128i_to_private(simde_mm_setzero_si128());     \
1298
+       simde__m128i_private simde__tmp_r_ =                           \
1299
+           simde__m128i_to_private(a);                            \
1300
+       if (HEDLEY_UNLIKELY(imm8 > 15)) {                              \
1301
+           simde__tmp_r_ = simde__m128i_to_private(               \
1302
+               simde_mm_setzero_si128());                     \
1303
+       } else {                                                       \
1304
+           simde__tmp_r_.i8 = SIMDE_SHUFFLE_VECTOR_(              \
1305
+               8, 16, simde__tmp_z_.i8, (simde__tmp_a_).i8,   \
1306
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31),  \
1307
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31),  \
1308
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31),  \
1309
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31),  \
1310
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31),  \
1311
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31),  \
1312
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31),  \
1313
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31),  \
1314
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31),  \
1315
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31),  \
1316
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31),  \
1317
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31),  \
1318
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31),  \
1319
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31),  \
1320
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31),  \
1321
+               HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \
1322
+       }                                                              \
1323
+       simde__m128i_from_private(simde__tmp_r_);                      \
1324
+   }))
1325
+#endif
1326
+#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1327
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1328
+#define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1329
+#define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8))
1330
+#endif
1331
+
1332
+SIMDE_FUNCTION_ATTRIBUTES
1333
+void simde_mm_clflush(void const *p)
1334
+{
1335
+#if defined(SIMDE_X86_SSE2_NATIVE)
1336
+   _mm_clflush(p);
1337
+#else
1338
+   (void)p;
1339
+#endif
1340
+}
1341
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1342
+#define _mm_clflush(p) simde_mm_clflush(p)
1343
+#endif
1344
+
1345
+SIMDE_FUNCTION_ATTRIBUTES
1346
+int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
1347
+{
1348
+#if defined(SIMDE_X86_SSE2_NATIVE)
1349
+   return _mm_comieq_sd(a, b);
1350
+#else
1351
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1352
+                b_ = simde__m128d_to_private(b);
1353
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1354
+   return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1355
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1356
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) ==
1357
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1358
+#else
1359
+   return a_.f64[0] == b_.f64[0];
1360
+#endif
1361
+#endif
1362
+}
1363
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1364
+#define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b)
1365
+#endif
1366
+
1367
+SIMDE_FUNCTION_ATTRIBUTES
1368
+int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
1369
+{
1370
+#if defined(SIMDE_X86_SSE2_NATIVE)
1371
+   return _mm_comige_sd(a, b);
1372
+#else
1373
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1374
+                b_ = simde__m128d_to_private(b);
1375
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1376
+   return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0);
1377
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1378
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >=
1379
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1380
+#else
1381
+   return a_.f64[0] >= b_.f64[0];
1382
+#endif
1383
+#endif
1384
+}
1385
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1386
+#define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b)
1387
+#endif
1388
+
1389
+SIMDE_FUNCTION_ATTRIBUTES
1390
+int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
1391
+{
1392
+#if defined(SIMDE_X86_SSE2_NATIVE)
1393
+   return _mm_comigt_sd(a, b);
1394
+#else
1395
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1396
+                b_ = simde__m128d_to_private(b);
1397
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1398
+   return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0);
1399
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1400
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >
1401
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1402
+#else
1403
+   return a_.f64[0] > b_.f64[0];
1404
+#endif
1405
+#endif
1406
+}
1407
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1408
+#define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b)
1409
+#endif
1410
+
1411
+SIMDE_FUNCTION_ATTRIBUTES
1412
+int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
1413
+{
1414
+#if defined(SIMDE_X86_SSE2_NATIVE)
1415
+   return _mm_comile_sd(a, b);
1416
+#else
1417
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1418
+                b_ = simde__m128d_to_private(b);
1419
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1420
+   return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0);
1421
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1422
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <=
1423
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1424
+#else
1425
+   return a_.f64[0] <= b_.f64[0];
1426
+#endif
1427
+#endif
1428
+}
1429
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1430
+#define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b)
1431
+#endif
1432
+
1433
+SIMDE_FUNCTION_ATTRIBUTES
1434
+int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
1435
+{
1436
+#if defined(SIMDE_X86_SSE2_NATIVE)
1437
+   return _mm_comilt_sd(a, b);
1438
+#else
1439
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1440
+                b_ = simde__m128d_to_private(b);
1441
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1442
+   return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0);
1443
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1444
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <
1445
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1446
+#else
1447
+   return a_.f64[0] < b_.f64[0];
1448
+#endif
1449
+#endif
1450
+}
1451
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1452
+#define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b)
1453
+#endif
1454
+
1455
+SIMDE_FUNCTION_ATTRIBUTES
1456
+int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
1457
+{
1458
+#if defined(SIMDE_X86_SSE2_NATIVE)
1459
+   return _mm_comineq_sd(a, b);
1460
+#else
1461
+   simde__m128d_private a_ = simde__m128d_to_private(a),
1462
+                b_ = simde__m128d_to_private(b);
1463
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1464
+   return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0);
1465
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1466
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) !=
1467
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
1468
+#else
1469
+   return a_.f64[0] != b_.f64[0];
1470
+#endif
1471
+#endif
1472
+}
1473
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1474
+#define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b)
1475
+#endif
1476
+
1477
+SIMDE_FUNCTION_ATTRIBUTES
1478
+simde__m128d simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src)
1479
+{
1480
+   simde__m128d_private r_, dest_ = simde__m128d_to_private(dest),
1481
+                src_ = simde__m128d_to_private(src);
1482
+
1483
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1484
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1485
+   uint64x2_t sign_pos =
1486
+       vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0)));
1487
+#else
1488
+   simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0);
1489
+   uint64_t u64_nz;
1490
+   simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz));
1491
+   uint64x2_t sign_pos = vdupq_n_u64(u64_nz);
1492
+#endif
1493
+   r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64);
1494
+#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
1495
+#if !defined(HEDLEY_IBM_VERSION)
1496
+   r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64);
1497
+#else
1498
+   r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64);
1499
+#endif
1500
+#elif defined(simde_math_copysign)
1501
+   SIMDE_VECTORIZE
1502
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1503
+       r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]);
1504
+   }
1505
+#else
1506
+   simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0));
1507
+   return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src),
1508
+                  simde_mm_andnot_pd(sgnbit, dest));
1509
+#endif
1510
+
1511
+   return simde__m128d_from_private(r_);
1512
+}
1513
+
1514
+SIMDE_FUNCTION_ATTRIBUTES
1515
+simde__m128d simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src)
1516
+{
1517
+   return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src),
1518
+                  dest);
1519
+}
1520
+
1521
+SIMDE_FUNCTION_ATTRIBUTES
1522
+simde__m128 simde_mm_castpd_ps(simde__m128d a)
1523
+{
1524
+#if defined(SIMDE_X86_SSE2_NATIVE)
1525
+   return _mm_castpd_ps(a);
1526
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1527
+   return vreinterpretq_f32_f64(a);
1528
+#else
1529
+   simde__m128 r;
1530
+   simde_memcpy(&r, &a, sizeof(a));
1531
+   return r;
1532
+#endif
1533
+}
1534
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1535
+#define _mm_castpd_ps(a) simde_mm_castpd_ps(a)
1536
+#endif
1537
+
1538
+SIMDE_FUNCTION_ATTRIBUTES
1539
+simde__m128i simde_mm_castpd_si128(simde__m128d a)
1540
+{
1541
+#if defined(SIMDE_X86_SSE2_NATIVE)
1542
+   return _mm_castpd_si128(a);
1543
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1544
+   return vreinterpretq_s64_f64(a);
1545
+#else
1546
+   simde__m128i r;
1547
+   simde_memcpy(&r, &a, sizeof(a));
1548
+   return r;
1549
+#endif
1550
+}
1551
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1552
+#define _mm_castpd_si128(a) simde_mm_castpd_si128(a)
1553
+#endif
1554
+
1555
+SIMDE_FUNCTION_ATTRIBUTES
1556
+simde__m128d simde_mm_castps_pd(simde__m128 a)
1557
+{
1558
+#if defined(SIMDE_X86_SSE2_NATIVE)
1559
+   return _mm_castps_pd(a);
1560
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1561
+   return vreinterpretq_f64_f32(a);
1562
+#else
1563
+   simde__m128d r;
1564
+   simde_memcpy(&r, &a, sizeof(a));
1565
+   return r;
1566
+#endif
1567
+}
1568
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1569
+#define _mm_castps_pd(a) simde_mm_castps_pd(a)
1570
+#endif
1571
+
1572
+SIMDE_FUNCTION_ATTRIBUTES
1573
+simde__m128i simde_mm_castps_si128(simde__m128 a)
1574
+{
1575
+#if defined(SIMDE_X86_SSE2_NATIVE)
1576
+   return _mm_castps_si128(a);
1577
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1578
+   return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32);
1579
+#else
1580
+   simde__m128i r;
1581
+   simde_memcpy(&r, &a, sizeof(a));
1582
+   return r;
1583
+#endif
1584
+}
1585
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1586
+#define _mm_castps_si128(a) simde_mm_castps_si128(a)
1587
+#endif
1588
+
1589
+SIMDE_FUNCTION_ATTRIBUTES
1590
+simde__m128d simde_mm_castsi128_pd(simde__m128i a)
1591
+{
1592
+#if defined(SIMDE_X86_SSE2_NATIVE)
1593
+   return _mm_castsi128_pd(a);
1594
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1595
+   return vreinterpretq_f64_s64(a);
1596
+#else
1597
+   simde__m128d r;
1598
+   simde_memcpy(&r, &a, sizeof(a));
1599
+   return r;
1600
+#endif
1601
+}
1602
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1603
+#define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a)
1604
+#endif
1605
+
1606
+SIMDE_FUNCTION_ATTRIBUTES
1607
+simde__m128 simde_mm_castsi128_ps(simde__m128i a)
1608
+{
1609
+#if defined(SIMDE_X86_SSE2_NATIVE)
1610
+   return _mm_castsi128_ps(a);
1611
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1612
+   return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a);
1613
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1614
+   return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32);
1615
+#else
1616
+   simde__m128 r;
1617
+   simde_memcpy(&r, &a, sizeof(a));
1618
+   return r;
1619
+#endif
1620
+}
1621
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1622
+#define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a)
1623
+#endif
1624
+
1625
+SIMDE_FUNCTION_ATTRIBUTES
1626
+simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
1627
+{
1628
+#if defined(SIMDE_X86_SSE2_NATIVE)
1629
+   return _mm_cmpeq_epi8(a, b);
1630
+#else
1631
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1632
+                b_ = simde__m128i_to_private(b);
1633
+
1634
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1635
+   r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8);
1636
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1637
+   r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128);
1638
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1639
+   r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(
1640
+       SIMDE_POWER_ALTIVEC_VECTOR(signed char),
1641
+       vec_cmpeq(a_.altivec_i8, b_.altivec_i8));
1642
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1643
+   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8));
1644
+#else
1645
+   SIMDE_VECTORIZE
1646
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1647
+       r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1648
+   }
1649
+#endif
1650
+
1651
+   return simde__m128i_from_private(r_);
1652
+#endif
1653
+}
1654
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1655
+#define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b)
1656
+#endif
1657
+
1658
+SIMDE_FUNCTION_ATTRIBUTES
1659
+simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
1660
+{
1661
+#if defined(SIMDE_X86_SSE2_NATIVE)
1662
+   return _mm_cmpeq_epi16(a, b);
1663
+#else
1664
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1665
+                b_ = simde__m128i_to_private(b);
1666
+
1667
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1668
+   r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16);
1669
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1670
+   r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128);
1671
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1672
+   r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(
1673
+       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
1674
+       vec_cmpeq(a_.altivec_i16, b_.altivec_i16));
1675
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1676
+   r_.i16 = (a_.i16 == b_.i16);
1677
+#else
1678
+   SIMDE_VECTORIZE
1679
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1680
+       r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1681
+   }
1682
+#endif
1683
+
1684
+   return simde__m128i_from_private(r_);
1685
+#endif
1686
+}
1687
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1688
+#define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b)
1689
+#endif
1690
+
1691
+SIMDE_FUNCTION_ATTRIBUTES
1692
+simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
1693
+{
1694
+#if defined(SIMDE_X86_SSE2_NATIVE)
1695
+   return _mm_cmpeq_epi32(a, b);
1696
+#else
1697
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1698
+                b_ = simde__m128i_to_private(b);
1699
+
1700
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1701
+   r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32);
1702
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1703
+   r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128);
1704
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1705
+   r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(
1706
+       SIMDE_POWER_ALTIVEC_VECTOR(signed int),
1707
+       vec_cmpeq(a_.altivec_i32, b_.altivec_i32));
1708
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1709
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), a_.i32 == b_.i32);
1710
+#else
1711
+   SIMDE_VECTORIZE
1712
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1713
+       r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1714
+   }
1715
+#endif
1716
+
1717
+   return simde__m128i_from_private(r_);
1718
+#endif
1719
+}
1720
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1721
+#define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b)
1722
+#endif
1723
+
1724
+SIMDE_FUNCTION_ATTRIBUTES
1725
+simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
1726
+{
1727
+#if defined(SIMDE_X86_SSE2_NATIVE)
1728
+   return _mm_cmpeq_pd(a, b);
1729
+#else
1730
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1731
+                b_ = simde__m128d_to_private(b);
1732
+
1733
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1734
+   r_.neon_u64 = vceqq_s64(b_.neon_i64, a_.neon_i64);
1735
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1736
+   r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128);
1737
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
1738
+   r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(
1739
+       SIMDE_POWER_ALTIVEC_VECTOR(double),
1740
+       vec_cmpeq(a_.altivec_f64, b_.altivec_f64));
1741
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1742
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64));
1743
+#else
1744
+   SIMDE_VECTORIZE
1745
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1746
+       r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0)
1747
+                            : UINT64_C(0);
1748
+   }
1749
+#endif
1750
+
1751
+   return simde__m128d_from_private(r_);
1752
+#endif
1753
+}
1754
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1755
+#define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b)
1756
+#endif
1757
+
1758
+SIMDE_FUNCTION_ATTRIBUTES
1759
+simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
1760
+{
1761
+#if defined(SIMDE_X86_SSE2_NATIVE)
1762
+   return _mm_cmpeq_sd(a, b);
1763
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1764
+   return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b));
1765
+#else
1766
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1767
+                b_ = simde__m128d_to_private(b);
1768
+
1769
+   r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? ~UINT64_C(0) : 0;
1770
+   r_.u64[1] = a_.u64[1];
1771
+
1772
+   return simde__m128d_from_private(r_);
1773
+#endif
1774
+}
1775
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1776
+#define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b)
1777
+#endif
1778
+
1779
+SIMDE_FUNCTION_ATTRIBUTES
1780
+simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
1781
+{
1782
+#if defined(SIMDE_X86_SSE2_NATIVE)
1783
+   return _mm_cmpneq_pd(a, b);
1784
+#else
1785
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1786
+                b_ = simde__m128d_to_private(b);
1787
+
1788
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1789
+   r_.neon_u32 = vmvnq_u32(
1790
+       vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64)));
1791
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1792
+   r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128);
1793
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1794
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64));
1795
+#else
1796
+   SIMDE_VECTORIZE
1797
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1798
+       r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0)
1799
+                            : UINT64_C(0);
1800
+   }
1801
+#endif
1802
+
1803
+   return simde__m128d_from_private(r_);
1804
+#endif
1805
+}
1806
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1807
+#define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b)
1808
+#endif
1809
+
1810
+SIMDE_FUNCTION_ATTRIBUTES
1811
+simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
1812
+{
1813
+#if defined(SIMDE_X86_SSE2_NATIVE)
1814
+   return _mm_cmpneq_sd(a, b);
1815
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1816
+   return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b));
1817
+#else
1818
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1819
+                b_ = simde__m128d_to_private(b);
1820
+
1821
+   r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1822
+   r_.u64[1] = a_.u64[1];
1823
+
1824
+   return simde__m128d_from_private(r_);
1825
+#endif
1826
+}
1827
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1828
+#define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b)
1829
+#endif
1830
+
1831
+SIMDE_FUNCTION_ATTRIBUTES
1832
+simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
1833
+{
1834
+#if defined(SIMDE_X86_SSE2_NATIVE)
1835
+   return _mm_cmplt_epi8(a, b);
1836
+#else
1837
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1838
+                b_ = simde__m128i_to_private(b);
1839
+
1840
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1841
+   r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8);
1842
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1843
+   r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(
1844
+       SIMDE_POWER_ALTIVEC_VECTOR(signed char),
1845
+       vec_cmplt(a_.altivec_i8, b_.altivec_i8));
1846
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1847
+   r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128);
1848
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1849
+   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8));
1850
+#else
1851
+   SIMDE_VECTORIZE
1852
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
1853
+       r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
1854
+   }
1855
+#endif
1856
+
1857
+   return simde__m128i_from_private(r_);
1858
+#endif
1859
+}
1860
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1861
+#define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b)
1862
+#endif
1863
+
1864
+SIMDE_FUNCTION_ATTRIBUTES
1865
+simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
1866
+{
1867
+#if defined(SIMDE_X86_SSE2_NATIVE)
1868
+   return _mm_cmplt_epi16(a, b);
1869
+#else
1870
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1871
+                b_ = simde__m128i_to_private(b);
1872
+
1873
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1874
+   r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16);
1875
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1876
+   r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(
1877
+       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
1878
+       vec_cmplt(a_.altivec_i16, b_.altivec_i16));
1879
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1880
+   r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128);
1881
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1882
+   r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16));
1883
+#else
1884
+   SIMDE_VECTORIZE
1885
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
1886
+       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
1887
+   }
1888
+#endif
1889
+
1890
+   return simde__m128i_from_private(r_);
1891
+#endif
1892
+}
1893
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1894
+#define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b)
1895
+#endif
1896
+
1897
+SIMDE_FUNCTION_ATTRIBUTES
1898
+simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
1899
+{
1900
+#if defined(SIMDE_X86_SSE2_NATIVE)
1901
+   return _mm_cmplt_epi32(a, b);
1902
+#else
1903
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
1904
+                b_ = simde__m128i_to_private(b);
1905
+
1906
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
1907
+   r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32);
1908
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1909
+   r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(
1910
+       SIMDE_POWER_ALTIVEC_VECTOR(signed int),
1911
+       vec_cmplt(a_.altivec_i32, b_.altivec_i32));
1912
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1913
+   r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128);
1914
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1915
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32));
1916
+#else
1917
+   SIMDE_VECTORIZE
1918
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
1919
+       r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
1920
+   }
1921
+#endif
1922
+
1923
+   return simde__m128i_from_private(r_);
1924
+#endif
1925
+}
1926
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1927
+#define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b)
1928
+#endif
1929
+
1930
+SIMDE_FUNCTION_ATTRIBUTES
1931
+simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
1932
+{
1933
+#if defined(SIMDE_X86_SSE2_NATIVE)
1934
+   return _mm_cmplt_pd(a, b);
1935
+#else
1936
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1937
+                b_ = simde__m128d_to_private(b);
1938
+
1939
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1940
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64));
1941
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1942
+   r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64);
1943
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1944
+   r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128);
1945
+#else
1946
+   SIMDE_VECTORIZE
1947
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
1948
+       r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? ~UINT64_C(0)
1949
+                           : UINT64_C(0);
1950
+   }
1951
+#endif
1952
+
1953
+   return simde__m128d_from_private(r_);
1954
+#endif
1955
+}
1956
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1957
+#define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b)
1958
+#endif
1959
+
1960
+SIMDE_FUNCTION_ATTRIBUTES
1961
+simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
1962
+{
1963
+#if defined(SIMDE_X86_SSE2_NATIVE)
1964
+   return _mm_cmplt_sd(a, b);
1965
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
1966
+   return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b));
1967
+#else
1968
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1969
+                b_ = simde__m128d_to_private(b);
1970
+
1971
+   r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1972
+   r_.u64[1] = a_.u64[1];
1973
+
1974
+   return simde__m128d_from_private(r_);
1975
+#endif
1976
+}
1977
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
1978
+#define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b)
1979
+#endif
1980
+
1981
+SIMDE_FUNCTION_ATTRIBUTES
1982
+simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
1983
+{
1984
+#if defined(SIMDE_X86_SSE2_NATIVE)
1985
+   return _mm_cmple_pd(a, b);
1986
+#else
1987
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
1988
+                b_ = simde__m128d_to_private(b);
1989
+
1990
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
1991
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64));
1992
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
1993
+   r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64);
1994
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
1995
+   r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128);
1996
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
1997
+   r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(
1998
+       SIMDE_POWER_ALTIVEC_VECTOR(double),
1999
+       vec_cmple(a_.altivec_f64, b_.altivec_f64));
2000
+#else
2001
+   SIMDE_VECTORIZE
2002
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2003
+       r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0)
2004
+                            : UINT64_C(0);
2005
+   }
2006
+#endif
2007
+
2008
+   return simde__m128d_from_private(r_);
2009
+#endif
2010
+}
2011
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2012
+#define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b)
2013
+#endif
2014
+
2015
+SIMDE_FUNCTION_ATTRIBUTES
2016
+simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
2017
+{
2018
+#if defined(SIMDE_X86_SSE2_NATIVE)
2019
+   return _mm_cmple_sd(a, b);
2020
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2021
+   return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b));
2022
+#else
2023
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2024
+                b_ = simde__m128d_to_private(b);
2025
+
2026
+   r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2027
+   r_.u64[1] = a_.u64[1];
2028
+
2029
+   return simde__m128d_from_private(r_);
2030
+#endif
2031
+}
2032
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2033
+#define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b)
2034
+#endif
2035
+
2036
+SIMDE_FUNCTION_ATTRIBUTES
2037
+simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
2038
+{
2039
+#if defined(SIMDE_X86_SSE2_NATIVE)
2040
+   return _mm_cmpgt_epi8(a, b);
2041
+#else
2042
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
2043
+                b_ = simde__m128i_to_private(b);
2044
+
2045
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2046
+   r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8);
2047
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2048
+   r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128);
2049
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2050
+   r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(
2051
+       SIMDE_POWER_ALTIVEC_VECTOR(signed char),
2052
+       vec_cmpgt(a_.altivec_i8, b_.altivec_i8));
2053
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2054
+   r_.i8 = HEDLEY_STATIC_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8));
2055
+#else
2056
+   SIMDE_VECTORIZE
2057
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
2058
+       r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0);
2059
+   }
2060
+#endif
2061
+
2062
+   return simde__m128i_from_private(r_);
2063
+#endif
2064
+}
2065
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2066
+#define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b)
2067
+#endif
2068
+
2069
+SIMDE_FUNCTION_ATTRIBUTES
2070
+simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
2071
+{
2072
+#if defined(SIMDE_X86_SSE2_NATIVE)
2073
+   return _mm_cmpgt_epi16(a, b);
2074
+#else
2075
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
2076
+                b_ = simde__m128i_to_private(b);
2077
+
2078
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2079
+   r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16);
2080
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2081
+   r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128);
2082
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2083
+   r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(
2084
+       SIMDE_POWER_ALTIVEC_VECTOR(signed short),
2085
+       vec_cmpgt(a_.altivec_i16, b_.altivec_i16));
2086
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2087
+   r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16));
2088
+#else
2089
+   SIMDE_VECTORIZE
2090
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
2091
+       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0);
2092
+   }
2093
+#endif
2094
+
2095
+   return simde__m128i_from_private(r_);
2096
+#endif
2097
+}
2098
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2099
+#define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b)
2100
+#endif
2101
+
2102
+SIMDE_FUNCTION_ATTRIBUTES
2103
+simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
2104
+{
2105
+#if defined(SIMDE_X86_SSE2_NATIVE)
2106
+   return _mm_cmpgt_epi32(a, b);
2107
+#else
2108
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
2109
+                b_ = simde__m128i_to_private(b);
2110
+
2111
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2112
+   r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32);
2113
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2114
+   r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128);
2115
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2116
+   r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(
2117
+       SIMDE_POWER_ALTIVEC_VECTOR(signed int),
2118
+       vec_cmpgt(a_.altivec_i32, b_.altivec_i32));
2119
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2120
+   r_.i32 = HEDLEY_STATIC_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32));
2121
+#else
2122
+   SIMDE_VECTORIZE
2123
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2124
+       r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0);
2125
+   }
2126
+#endif
2127
+
2128
+   return simde__m128i_from_private(r_);
2129
+#endif
2130
+}
2131
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2132
+#define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b)
2133
+#endif
2134
+
2135
+SIMDE_FUNCTION_ATTRIBUTES
2136
+simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
2137
+{
2138
+#if defined(SIMDE_X86_SSE2_NATIVE)
2139
+   return _mm_cmpgt_pd(a, b);
2140
+#else
2141
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2142
+                b_ = simde__m128d_to_private(b);
2143
+
2144
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2145
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64));
2146
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2147
+   r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64);
2148
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2149
+   r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128);
2150
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2151
+   r_.altivec_f64 =
2152
+       HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double),
2153
+                  vec_cmpgt(a_.altivec_f64, b_.altivec_f64));
2154
+#else
2155
+   SIMDE_VECTORIZE
2156
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2157
+       r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0)
2158
+                           : UINT64_C(0);
2159
+   }
2160
+#endif
2161
+
2162
+   return simde__m128d_from_private(r_);
2163
+#endif
2164
+}
2165
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2166
+#define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b)
2167
+#endif
2168
+
2169
+SIMDE_FUNCTION_ATTRIBUTES
2170
+simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
2171
+{
2172
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2173
+   return _mm_cmpgt_sd(a, b);
2174
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2175
+   return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b));
2176
+#else
2177
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2178
+                b_ = simde__m128d_to_private(b);
2179
+
2180
+   r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2181
+   r_.u64[1] = a_.u64[1];
2182
+
2183
+   return simde__m128d_from_private(r_);
2184
+#endif
2185
+}
2186
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2187
+#define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b)
2188
+#endif
2189
+
2190
+SIMDE_FUNCTION_ATTRIBUTES
2191
+simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
2192
+{
2193
+#if defined(SIMDE_X86_SSE2_NATIVE)
2194
+   return _mm_cmpge_pd(a, b);
2195
+#else
2196
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2197
+                b_ = simde__m128d_to_private(b);
2198
+
2199
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
2200
+   r_.i64 = HEDLEY_STATIC_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64));
2201
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2202
+   r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64);
2203
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2204
+   r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128);
2205
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2206
+   r_.altivec_f64 =
2207
+       HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double),
2208
+                  vec_cmpge(a_.altivec_f64, b_.altivec_f64));
2209
+#else
2210
+   SIMDE_VECTORIZE
2211
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2212
+       r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? ~UINT64_C(0)
2213
+                            : UINT64_C(0);
2214
+   }
2215
+#endif
2216
+
2217
+   return simde__m128d_from_private(r_);
2218
+#endif
2219
+}
2220
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2221
+#define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b)
2222
+#endif
2223
+
2224
+SIMDE_FUNCTION_ATTRIBUTES
2225
+simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
2226
+{
2227
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2228
+   return _mm_cmpge_sd(a, b);
2229
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2230
+   return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b));
2231
+#else
2232
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2233
+                b_ = simde__m128d_to_private(b);
2234
+
2235
+   r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
2236
+   r_.u64[1] = a_.u64[1];
2237
+
2238
+   return simde__m128d_from_private(r_);
2239
+#endif
2240
+}
2241
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2242
+#define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b)
2243
+#endif
2244
+
2245
+SIMDE_FUNCTION_ATTRIBUTES
2246
+simde__m128d simde_mm_cmpngt_pd(simde__m128d a, simde__m128d b)
2247
+{
2248
+#if defined(SIMDE_X86_SSE2_NATIVE)
2249
+   return _mm_cmpngt_pd(a, b);
2250
+#else
2251
+   return simde_mm_cmple_pd(a, b);
2252
+#endif
2253
+}
2254
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2255
+#define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b)
2256
+#endif
2257
+
2258
+SIMDE_FUNCTION_ATTRIBUTES
2259
+simde__m128d simde_mm_cmpngt_sd(simde__m128d a, simde__m128d b)
2260
+{
2261
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2262
+   return _mm_cmpngt_sd(a, b);
2263
+#else
2264
+   return simde_mm_cmple_sd(a, b);
2265
+#endif
2266
+}
2267
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2268
+#define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b)
2269
+#endif
2270
+
2271
+SIMDE_FUNCTION_ATTRIBUTES
2272
+simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
2273
+{
2274
+#if defined(SIMDE_X86_SSE2_NATIVE)
2275
+   return _mm_cmpnge_pd(a, b);
2276
+#else
2277
+   return simde_mm_cmplt_pd(a, b);
2278
+#endif
2279
+}
2280
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2281
+#define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b)
2282
+#endif
2283
+
2284
+SIMDE_FUNCTION_ATTRIBUTES
2285
+simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
2286
+{
2287
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2288
+   return _mm_cmpnge_sd(a, b);
2289
+#else
2290
+   return simde_mm_cmplt_sd(a, b);
2291
+#endif
2292
+}
2293
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2294
+#define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b)
2295
+#endif
2296
+
2297
+SIMDE_FUNCTION_ATTRIBUTES
2298
+simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
2299
+{
2300
+#if defined(SIMDE_X86_SSE2_NATIVE)
2301
+   return _mm_cmpnlt_pd(a, b);
2302
+#else
2303
+   return simde_mm_cmpge_pd(a, b);
2304
+#endif
2305
+}
2306
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2307
+#define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b)
2308
+#endif
2309
+
2310
+SIMDE_FUNCTION_ATTRIBUTES
2311
+simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
2312
+{
2313
+#if defined(SIMDE_X86_SSE2_NATIVE)
2314
+   return _mm_cmpnlt_sd(a, b);
2315
+#else
2316
+   return simde_mm_cmpge_sd(a, b);
2317
+#endif
2318
+}
2319
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2320
+#define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b)
2321
+#endif
2322
+
2323
+SIMDE_FUNCTION_ATTRIBUTES
2324
+simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
2325
+{
2326
+#if defined(SIMDE_X86_SSE2_NATIVE)
2327
+   return _mm_cmpnle_pd(a, b);
2328
+#else
2329
+   return simde_mm_cmpgt_pd(a, b);
2330
+#endif
2331
+}
2332
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2333
+#define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b)
2334
+#endif
2335
+
2336
+SIMDE_FUNCTION_ATTRIBUTES
2337
+simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
2338
+{
2339
+#if defined(SIMDE_X86_SSE2_NATIVE)
2340
+   return _mm_cmpnle_sd(a, b);
2341
+#else
2342
+   return simde_mm_cmpgt_sd(a, b);
2343
+#endif
2344
+}
2345
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2346
+#define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b)
2347
+#endif
2348
+
2349
+SIMDE_FUNCTION_ATTRIBUTES
2350
+simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
2351
+{
2352
+#if defined(SIMDE_X86_SSE2_NATIVE)
2353
+   return _mm_cmpord_pd(a, b);
2354
+#else
2355
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2356
+                b_ = simde__m128d_to_private(b);
2357
+
2358
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2359
+   /* Note: NEON has no ordered-compare builtin.
2360
+        Compare a == a and b == b to detect NaNs,
2361
+        then AND the results to get the final mask. */
2362
+   uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2363
+   uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2364
+   r_.neon_u64 = vandq_u64(ceqaa, ceqbb);
2365
+#elif defined(simde_math_isnan)
2366
+   SIMDE_VECTORIZE
2367
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2368
+       r_.u64[i] = (!simde_math_isnan(a_.f64[i]) &&
2369
+                !simde_math_isnan(b_.f64[i]))
2370
+                   ? ~UINT64_C(0)
2371
+                   : UINT64_C(0);
2372
+   }
2373
+#else
2374
+   HEDLEY_UNREACHABLE();
2375
+#endif
2376
+
2377
+   return simde__m128d_from_private(r_);
2378
+#endif
2379
+}
2380
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2381
+#define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b)
2382
+#endif
2383
+
2384
+SIMDE_FUNCTION_ATTRIBUTES
2385
+simde_float64 simde_mm_cvtsd_f64(simde__m128d a)
2386
+{
2387
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
2388
+   return _mm_cvtsd_f64(a);
2389
+#else
2390
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2391
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2392
+   return HEDLEY_STATIC_CAST(simde_float64,
2393
+                 vgetq_lane_f64(a_.neon_f64, 0));
2394
+#else
2395
+   return a_.f64[0];
2396
+#endif
2397
+#endif
2398
+}
2399
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2400
+#define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a)
2401
+#endif
2402
+
2403
+SIMDE_FUNCTION_ATTRIBUTES
2404
+simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
2405
+{
2406
+#if defined(SIMDE_X86_SSE2_NATIVE)
2407
+   return _mm_cmpord_sd(a, b);
2408
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2409
+   return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b));
2410
+#else
2411
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2412
+                b_ = simde__m128d_to_private(b);
2413
+
2414
+#if defined(simde_math_isnan)
2415
+   r_.u64[0] =
2416
+       (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0]))
2417
+           ? ~UINT64_C(0)
2418
+           : UINT64_C(0);
2419
+   r_.u64[1] = a_.u64[1];
2420
+#else
2421
+   HEDLEY_UNREACHABLE();
2422
+#endif
2423
+
2424
+   return simde__m128d_from_private(r_);
2425
+#endif
2426
+}
2427
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2428
+#define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b)
2429
+#endif
2430
+
2431
+SIMDE_FUNCTION_ATTRIBUTES
2432
+simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
2433
+{
2434
+#if defined(SIMDE_X86_SSE2_NATIVE)
2435
+   return _mm_cmpunord_pd(a, b);
2436
+#else
2437
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2438
+                b_ = simde__m128d_to_private(b);
2439
+
2440
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2441
+   uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64);
2442
+   uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64);
2443
+   r_.neon_u64 = vreinterpretq_u64_u32(
2444
+       vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb))));
2445
+#elif defined(simde_math_isnan)
2446
+   SIMDE_VECTORIZE
2447
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2448
+       r_.u64[i] = (simde_math_isnan(a_.f64[i]) ||
2449
+                simde_math_isnan(b_.f64[i]))
2450
+                   ? ~UINT64_C(0)
2451
+                   : UINT64_C(0);
2452
+   }
2453
+#else
2454
+   HEDLEY_UNREACHABLE();
2455
+#endif
2456
+
2457
+   return simde__m128d_from_private(r_);
2458
+#endif
2459
+}
2460
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2461
+#define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b)
2462
+#endif
2463
+
2464
+SIMDE_FUNCTION_ATTRIBUTES
2465
+simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
2466
+{
2467
+#if defined(SIMDE_X86_SSE2_NATIVE)
2468
+   return _mm_cmpunord_sd(a, b);
2469
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
2470
+   return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b));
2471
+#else
2472
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
2473
+                b_ = simde__m128d_to_private(b);
2474
+
2475
+#if defined(simde_math_isnan)
2476
+   r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0]))
2477
+               ? ~UINT64_C(0)
2478
+               : UINT64_C(0);
2479
+   r_.u64[1] = a_.u64[1];
2480
+#else
2481
+   HEDLEY_UNREACHABLE();
2482
+#endif
2483
+
2484
+   return simde__m128d_from_private(r_);
2485
+#endif
2486
+}
2487
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2488
+#define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b)
2489
+#endif
2490
+
2491
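Editor's sketch of the ordered/unordered compare semantics that the fallbacks above reproduce (lane_cmpord is a hypothetical name; only standard C is assumed): a lane is all-ones when neither input is NaN, and cmpunord is simply the complement.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of one 64-bit lane of the cmpord fallback above. */
    static uint64_t lane_cmpord(double a, double b)
    {
        return (!isnan(a) && !isnan(b)) ? ~UINT64_C(0) : UINT64_C(0);
    }

    int main(void)
    {
        printf("%llx\n", (unsigned long long)lane_cmpord(1.0, 2.0)); /* ffffffffffffffff */
        printf("%llx\n", (unsigned long long)lane_cmpord(1.0, NAN)); /* 0 */
        return 0;
    }
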
+SIMDE_FUNCTION_ATTRIBUTES
2492
+simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
2493
+{
2494
+#if defined(SIMDE_X86_SSE2_NATIVE)
2495
+   return _mm_cvtepi32_pd(a);
2496
+#else
2497
+   simde__m128d_private r_;
2498
+   simde__m128i_private a_ = simde__m128i_to_private(a);
2499
+
2500
+#if defined(SIMDE_CONVERT_VECTOR_)
2501
+   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32);
2502
+#else
2503
+   SIMDE_VECTORIZE
2504
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2505
+       r_.f64[i] = (simde_float64)a_.i32[i];
2506
+   }
2507
+#endif
2508
+
2509
+   return simde__m128d_from_private(r_);
2510
+#endif
2511
+}
2512
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2513
+#define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a)
2514
+#endif
2515
+
2516
+SIMDE_FUNCTION_ATTRIBUTES
2517
+simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
2518
+{
2519
+#if defined(SIMDE_X86_SSE2_NATIVE)
2520
+   return _mm_cvtepi32_ps(a);
2521
+#else
2522
+   simde__m128_private r_;
2523
+   simde__m128i_private a_ = simde__m128i_to_private(a);
2524
+
2525
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2526
+   r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32);
2527
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2528
+   r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128);
2529
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2530
+   HEDLEY_DIAGNOSTIC_PUSH
2531
+#if HEDLEY_HAS_WARNING("-Wc11-extensions")
2532
+#pragma clang diagnostic ignored "-Wc11-extensions"
2533
+#endif
2534
+   r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0);
2535
+   HEDLEY_DIAGNOSTIC_POP
2536
+#elif defined(SIMDE_CONVERT_VECTOR_)
2537
+   SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32);
2538
+#else
2539
+   SIMDE_VECTORIZE
2540
+   for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) {
2541
+       r_.f32[i] = (simde_float32)a_.i32[i];
2542
+   }
2543
+#endif
2544
+
2545
+   return simde__m128_from_private(r_);
2546
+#endif
2547
+}
2548
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2549
+#define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a)
2550
+#endif
2551
+
2552
+SIMDE_FUNCTION_ATTRIBUTES
2553
+simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
2554
+{
2555
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2556
+   return _mm_cvtpd_pi32(a);
2557
+#else
2558
+   simde__m64_private r_;
2559
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2560
+
2561
+   SIMDE_VECTORIZE
2562
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2563
+       simde_float64 v = simde_math_round(a_.f64[i]);
2564
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
2565
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2566
+#else
2567
+       r_.i32[i] =
2568
+           ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) &&
2569
+            (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX)))
2570
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
2571
+               : INT32_MIN;
2572
+#endif
2573
+   }
2574
+
2575
+   return simde__m64_from_private(r_);
2576
+#endif
2577
+}
2578
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2579
+#define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a)
2580
+#endif
2581
+
2582
+SIMDE_FUNCTION_ATTRIBUTES
2583
+simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
2584
+{
2585
+#if defined(SIMDE_X86_SSE2_NATIVE)
2586
+   return _mm_cvtpd_epi32(a);
2587
+#else
2588
+   simde__m128i_private r_;
2589
+
2590
+   r_.m64[0] = simde_mm_cvtpd_pi32(a);
2591
+   r_.m64[1] = simde_mm_setzero_si64();
2592
+
2593
+   return simde__m128i_from_private(r_);
2594
+#endif
2595
+}
2596
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2597
+#define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a)
2598
+#endif
2599
+
2600
+SIMDE_FUNCTION_ATTRIBUTES
2601
+simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
2602
+{
2603
+#if defined(SIMDE_X86_SSE2_NATIVE)
2604
+   return _mm_cvtpd_ps(a);
2605
+#else
2606
+   simde__m128_private r_;
2607
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2608
+
2609
+#if defined(SIMDE_CONVERT_VECTOR_)
2610
+   SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.f64);
2611
+   r_.m64_private[1] = simde__m64_to_private(simde_mm_setzero_si64());
2612
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2613
+   r_.neon_f32 = vreinterpretq_f32_f64(
2614
+       vcombine_f64(vreinterpret_f64_f32(vcvtx_f32_f64(a_.neon_f64)),
2615
+                vdup_n_f64(0)));
2616
+#else
2617
+   SIMDE_VECTORIZE
2618
+   for (size_t i = 0; i < (sizeof(a_.f64) / sizeof(a_.f64[0])); i++) {
2619
+       r_.f32[i] = (simde_float32)a_.f64[i];
2620
+   }
2621
+   simde_memset(&(r_.m64_private[1]), 0, sizeof(r_.m64_private[1]));
2622
+#endif
2623
+
2624
+   return simde__m128_from_private(r_);
2625
+#endif
2626
+}
2627
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2628
+#define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a)
2629
+#endif
2630
+
2631
+SIMDE_FUNCTION_ATTRIBUTES
2632
+simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
2633
+{
2634
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
2635
+   return _mm_cvtpi32_pd(a);
2636
+#else
2637
+   simde__m128d_private r_;
2638
+   simde__m64_private a_ = simde__m64_to_private(a);
2639
+
2640
+#if defined(SIMDE_CONVERT_VECTOR_)
2641
+   SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32);
2642
+#else
2643
+   SIMDE_VECTORIZE
2644
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2645
+       r_.f64[i] = (simde_float64)a_.i32[i];
2646
+   }
2647
+#endif
2648
+
2649
+   return simde__m128d_from_private(r_);
2650
+#endif
2651
+}
2652
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2653
+#define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a)
2654
+#endif
2655
+
2656
+SIMDE_FUNCTION_ATTRIBUTES
2657
+simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
2658
+{
2659
+#if defined(SIMDE_X86_SSE2_NATIVE)
2660
+   return _mm_cvtps_epi32(a);
2661
+#else
2662
+   simde__m128i_private r_;
2663
+   simde__m128_private a_ = simde__m128_to_private(a);
2664
+
2665
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
2666
+   r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2667
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
2668
+   defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
2669
+   r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32);
2670
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && \
2671
+   defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES)
2672
+   HEDLEY_DIAGNOSTIC_PUSH
2673
+   SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_
2674
+   SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
2675
+   r_.altivec_i32 = vec_cts(a_.altivec_f32, 1);
2676
+   HEDLEY_DIAGNOSTIC_POP
2677
+#else
2678
+   a_ = simde__m128_to_private(
2679
+       simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1));
2680
+   SIMDE_VECTORIZE
2681
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
2682
+       simde_float32 v = simde_math_roundf(a_.f32[i]);
2683
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
2684
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
2685
+#else
2686
+       r_.i32[i] =
2687
+           ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
2688
+            (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
2689
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
2690
+               : INT32_MIN;
2691
+#endif
2692
+   }
2693
+#endif
2694
+
2695
+   return simde__m128i_from_private(r_);
2696
+#endif
2697
+}
2698
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2699
+#define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a)
2700
+#endif
2701
+
2702
+SIMDE_FUNCTION_ATTRIBUTES
2703
+simde__m128d simde_mm_cvtps_pd(simde__m128 a)
2704
+{
2705
+#if defined(SIMDE_X86_SSE2_NATIVE)
2706
+   return _mm_cvtps_pd(a);
2707
+#else
2708
+   simde__m128d_private r_;
2709
+   simde__m128_private a_ = simde__m128_to_private(a);
2710
+
2711
+#if defined(SIMDE_CONVERT_VECTOR_)
2712
+   SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32);
2713
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2714
+   r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32));
2715
+#else
2716
+   SIMDE_VECTORIZE
2717
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
2718
+       r_.f64[i] = a_.f32[i];
2719
+   }
2720
+#endif
2721
+
2722
+   return simde__m128d_from_private(r_);
2723
+#endif
2724
+}
2725
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2726
+#define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a)
2727
+#endif
2728
+
2729
+SIMDE_FUNCTION_ATTRIBUTES
2730
+int32_t simde_mm_cvtsd_si32(simde__m128d a)
2731
+{
2732
+#if defined(SIMDE_X86_SSE2_NATIVE)
2733
+   return _mm_cvtsd_si32(a);
2734
+#else
2735
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2736
+
2737
+   simde_float64 v = simde_math_round(a_.f64[0]);
2738
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
2739
+   return SIMDE_CONVERT_FTOI(int32_t, v);
2740
+#else
2741
+   return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) &&
2742
+       (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX)))
2743
+              ? SIMDE_CONVERT_FTOI(int32_t, v)
2744
+              : INT32_MIN;
2745
+#endif
2746
+#endif
2747
+}
2748
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2749
+#define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a)
2750
+#endif
2751
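Editor's sketch of the non-FAST_CONVERSION_RANGE path above, assuming only standard C (cvtsd_si32_ref is a hypothetical name): the value is rounded and then clamped to INT32_MIN when it falls outside the 32-bit range, instead of performing an out-of-range conversion.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Reference model of the scalar fallback above. Link with -lm. */
    static int32_t cvtsd_si32_ref(double x)
    {
        double v = round(x);
        return (v > (double)INT32_MIN && v < (double)INT32_MAX)
                   ? (int32_t)v : INT32_MIN;
    }

    int main(void)
    {
        printf("%d\n", (int)cvtsd_si32_ref(1.6));  /* 2: rounded, in range */
        printf("%d\n", (int)cvtsd_si32_ref(1e12)); /* INT32_MIN: clamped */
        return 0;
    }
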
+
2752
+SIMDE_FUNCTION_ATTRIBUTES
2753
+int64_t simde_mm_cvtsd_si64(simde__m128d a)
2754
+{
2755
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2756
+#if defined(__PGI)
2757
+   return _mm_cvtsd_si64x(a);
2758
+#else
2759
+   return _mm_cvtsd_si64(a);
2760
+#endif
2761
+#else
2762
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2763
+   return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0]));
2764
+#endif
2765
+}
2766
+#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
2767
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2768
+#define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a)
2769
+#define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a)
2770
+#endif
2771
+
2772
+SIMDE_FUNCTION_ATTRIBUTES
2773
+simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
2774
+{
2775
+#if defined(SIMDE_X86_SSE2_NATIVE)
2776
+   return _mm_cvtsd_ss(a, b);
2777
+#else
2778
+   simde__m128_private r_, a_ = simde__m128_to_private(a);
2779
+   simde__m128d_private b_ = simde__m128d_to_private(b);
2780
+
2781
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2782
+   r_.neon_f32 = vsetq_lane_f32(
2783
+       vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0);
2784
+#else
2785
+   r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]);
2786
+
2787
+   SIMDE_VECTORIZE
2788
+   for (size_t i = 1; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) {
2789
+       r_.i32[i] = a_.i32[i];
2790
+   }
2791
+#endif
2792
+   return simde__m128_from_private(r_);
2793
+#endif
2794
+}
2795
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2796
+#define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b)
2797
+#endif
2798
+
2799
+SIMDE_FUNCTION_ATTRIBUTES
2800
+int16_t simde_x_mm_cvtsi128_si16(simde__m128i a)
2801
+{
2802
+   simde__m128i_private a_ = simde__m128i_to_private(a);
2803
+
2804
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2805
+   return vgetq_lane_s16(a_.neon_i16, 0);
2806
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2807
+   return HEDLEY_STATIC_CAST(int16_t,
2808
+                 wasm_i16x8_extract_lane(a_.wasm_v128, 0));
2809
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2810
+#if defined(SIMDE_BUG_GCC_95227)
2811
+   (void)a_;
2812
+#endif
2813
+   return vec_extract(a_.altivec_i16, 0);
2814
+#else
2815
+   return a_.i16[0];
2816
+#endif
2817
+}
2818
+
2819
+SIMDE_FUNCTION_ATTRIBUTES
2820
+int32_t simde_mm_cvtsi128_si32(simde__m128i a)
2821
+{
2822
+#if defined(SIMDE_X86_SSE2_NATIVE)
2823
+   return _mm_cvtsi128_si32(a);
2824
+#else
2825
+   simde__m128i_private a_ = simde__m128i_to_private(a);
2826
+
2827
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2828
+   return vgetq_lane_s32(a_.neon_i32, 0);
2829
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2830
+   return HEDLEY_STATIC_CAST(int32_t,
2831
+                 wasm_i32x4_extract_lane(a_.wasm_v128, 0));
2832
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
2833
+#if defined(SIMDE_BUG_GCC_95227)
2834
+   (void)a_;
2835
+#endif
2836
+   return vec_extract(a_.altivec_i32, 0);
2837
+#else
2838
+   return a_.i32[0];
2839
+#endif
2840
+#endif
2841
+}
2842
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2843
+#define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a)
2844
+#endif
2845
+
2846
+SIMDE_FUNCTION_ATTRIBUTES
2847
+int64_t simde_mm_cvtsi128_si64(simde__m128i a)
2848
+{
2849
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2850
+#if defined(__PGI)
2851
+   return _mm_cvtsi128_si64x(a);
2852
+#else
2853
+   return _mm_cvtsi128_si64(a);
2854
+#endif
2855
+#else
2856
+   simde__m128i_private a_ = simde__m128i_to_private(a);
2857
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION)
2858
+   return vec_extract(HEDLEY_REINTERPRET_CAST(
2859
+                  SIMDE_POWER_ALTIVEC_VECTOR(signed long long),
2860
+                  a_.i64),
2861
+              0);
2862
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2863
+   return vgetq_lane_s64(a_.neon_i64, 0);
2864
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2865
+   return HEDLEY_STATIC_CAST(int64_t,
2866
+                 wasm_i64x2_extract_lane(a_.wasm_v128, 0));
2867
+#endif
2868
+   return a_.i64[0];
2869
+#endif
2870
+}
2871
+#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
2872
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2873
+#define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a)
2874
+#define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a)
2875
+#endif
2876
+
2877
+SIMDE_FUNCTION_ATTRIBUTES
2878
+simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
2879
+{
2880
+#if defined(SIMDE_X86_SSE2_NATIVE)
2881
+   return _mm_cvtsi32_sd(a, b);
2882
+#else
2883
+   simde__m128d_private r_;
2884
+   simde__m128d_private a_ = simde__m128d_to_private(a);
2885
+
2886
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_AMD64)
2887
+   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b),
2888
+                    a_.neon_f64, 0);
2889
+#else
2890
+   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2891
+   r_.i64[1] = a_.i64[1];
2892
+#endif
2893
+
2894
+   return simde__m128d_from_private(r_);
2895
+#endif
2896
+}
2897
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2898
+#define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b)
2899
+#endif
2900
+
2901
+SIMDE_FUNCTION_ATTRIBUTES
2902
+simde__m128i simde_x_mm_cvtsi16_si128(int16_t a)
2903
+{
2904
+   simde__m128i_private r_;
2905
+
2906
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2907
+   r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0);
2908
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2909
+   r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0);
2910
+#else
2911
+   r_.i16[0] = a;
2912
+   r_.i16[1] = 0;
2913
+   r_.i16[2] = 0;
2914
+   r_.i16[3] = 0;
2915
+   r_.i16[4] = 0;
2916
+   r_.i16[5] = 0;
2917
+   r_.i16[6] = 0;
2918
+   r_.i16[7] = 0;
2919
+#endif
2920
+
2921
+   return simde__m128i_from_private(r_);
2922
+}
2923
+
2924
+SIMDE_FUNCTION_ATTRIBUTES
2925
+simde__m128i simde_mm_cvtsi32_si128(int32_t a)
2926
+{
2927
+#if defined(SIMDE_X86_SSE2_NATIVE)
2928
+   return _mm_cvtsi32_si128(a);
2929
+#else
2930
+   simde__m128i_private r_;
2931
+
2932
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2933
+   r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
2934
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2935
+   r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0);
2936
+#else
2937
+   r_.i32[0] = a;
2938
+   r_.i32[1] = 0;
2939
+   r_.i32[2] = 0;
2940
+   r_.i32[3] = 0;
2941
+#endif
2942
+
2943
+   return simde__m128i_from_private(r_);
2944
+#endif
2945
+}
2946
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2947
+#define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a)
2948
+#endif
2949
+
2950
+SIMDE_FUNCTION_ATTRIBUTES
2951
+simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int64_t b)
2952
+{
2953
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2954
+#if !defined(__PGI)
2955
+   return _mm_cvtsi64_sd(a, b);
2956
+#else
2957
+   return _mm_cvtsi64x_sd(a, b);
2958
+#endif
2959
+#else
2960
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
2961
+
2962
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
2963
+   r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b),
2964
+                    a_.neon_f64, 0);
2965
+#else
2966
+   r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b);
2967
+   r_.f64[1] = a_.f64[1];
2968
+#endif
2969
+
2970
+   return simde__m128d_from_private(r_);
2971
+#endif
2972
+}
2973
+#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2974
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
2975
+#define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b)
2976
+#define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b)
2977
+#endif
2978
+
2979
+SIMDE_FUNCTION_ATTRIBUTES
2980
+simde__m128i simde_mm_cvtsi64_si128(int64_t a)
2981
+{
2982
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
2983
+#if !defined(__PGI)
2984
+   return _mm_cvtsi64_si128(a);
2985
+#else
2986
+   return _mm_cvtsi64x_si128(a);
2987
+#endif
2988
+#else
2989
+   simde__m128i_private r_;
2990
+
2991
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
2992
+   r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0);
2993
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
2994
+   r_.wasm_v128 = wasm_i64x2_make(a, 0);
2995
+#else
2996
+   r_.i64[0] = a;
2997
+   r_.i64[1] = 0;
2998
+#endif
2999
+
3000
+   return simde__m128i_from_private(r_);
3001
+#endif
3002
+}
3003
+#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
3004
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3005
+#define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a)
3006
+#define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a)
3007
+#endif
3008
+
3009
+SIMDE_FUNCTION_ATTRIBUTES
3010
+simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
3011
+{
3012
+#if defined(SIMDE_X86_SSE2_NATIVE)
3013
+   return _mm_cvtss_sd(a, b);
3014
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3015
+   float64x2_t temp = vcvt_f64_f32(vset_lane_f32(
3016
+       vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0),
3017
+       vdup_n_f32(0), 0));
3018
+   return vsetq_lane_f64(
3019
+       vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp,
3020
+       1);
3021
+#else
3022
+   simde__m128d_private a_ = simde__m128d_to_private(a);
3023
+   simde__m128_private b_ = simde__m128_to_private(b);
3024
+
3025
+   a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]);
3026
+
3027
+   return simde__m128d_from_private(a_);
3028
+#endif
3029
+}
3030
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3031
+#define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b)
3032
+#endif
3033
+
3034
+SIMDE_FUNCTION_ATTRIBUTES
3035
+simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
3036
+{
3037
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3038
+   return _mm_cvttpd_pi32(a);
3039
+#else
3040
+   simde__m64_private r_;
3041
+   simde__m128d_private a_ = simde__m128d_to_private(a);
3042
+
3043
+#if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3044
+   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64);
3045
+#else
3046
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
3047
+       simde_float64 v = a_.f64[i];
3048
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
3049
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3050
+#else
3051
+       r_.i32[i] =
3052
+           ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) &&
3053
+            (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX)))
3054
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
3055
+               : INT32_MIN;
3056
+#endif
3057
+   }
3058
+#endif
3059
+
3060
+   return simde__m64_from_private(r_);
3061
+#endif
3062
+}
3063
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3064
+#define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a)
3065
+#endif
3066
+
3067
+SIMDE_FUNCTION_ATTRIBUTES
3068
+simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
3069
+{
3070
+#if defined(SIMDE_X86_SSE2_NATIVE)
3071
+   return _mm_cvttpd_epi32(a);
3072
+#else
3073
+   simde__m128i_private r_;
3074
+
3075
+   r_.m64[0] = simde_mm_cvttpd_pi32(a);
3076
+   r_.m64[1] = simde_mm_setzero_si64();
3077
+
3078
+   return simde__m128i_from_private(r_);
3079
+#endif
3080
+}
3081
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3082
+#define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a)
3083
+#endif
3084
+
3085
+SIMDE_FUNCTION_ATTRIBUTES
3086
+simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
3087
+{
3088
+#if defined(SIMDE_X86_SSE2_NATIVE)
3089
+   return _mm_cvttps_epi32(a);
3090
+#else
3091
+   simde__m128i_private r_;
3092
+   simde__m128_private a_ = simde__m128_to_private(a);
3093
+
3094
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE)
3095
+   r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32);
3096
+#elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
3097
+   SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32);
3098
+#else
3099
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
3100
+       simde_float32 v = a_.f32[i];
3101
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
3102
+       r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v);
3103
+#else
3104
+       r_.i32[i] =
3105
+           ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) &&
3106
+            (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX)))
3107
+               ? SIMDE_CONVERT_FTOI(int32_t, v)
3108
+               : INT32_MIN;
3109
+#endif
3110
+   }
3111
+#endif
3112
+
3113
+   return simde__m128i_from_private(r_);
3114
+#endif
3115
+}
3116
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3117
+#define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a)
3118
+#endif
3119
+
3120
+SIMDE_FUNCTION_ATTRIBUTES
3121
+int32_t simde_mm_cvttsd_si32(simde__m128d a)
3122
+{
3123
+#if defined(SIMDE_X86_SSE2_NATIVE)
3124
+   return _mm_cvttsd_si32(a);
3125
+#else
3126
+   simde__m128d_private a_ = simde__m128d_to_private(a);
3127
+   simde_float64 v = a_.f64[0];
3128
+#if defined(SIMDE_FAST_CONVERSION_RANGE)
3129
+   return SIMDE_CONVERT_FTOI(int32_t, v);
3130
+#else
3131
+   return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) &&
3132
+       (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX)))
3133
+              ? SIMDE_CONVERT_FTOI(int32_t, v)
3134
+              : INT32_MIN;
3135
+#endif
3136
+#endif
3137
+}
3138
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3139
+#define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a)
3140
+#endif
3141
+
3142
+SIMDE_FUNCTION_ATTRIBUTES
3143
+int64_t simde_mm_cvttsd_si64(simde__m128d a)
3144
+{
3145
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
3146
+#if !defined(__PGI)
3147
+   return _mm_cvttsd_si64(a);
3148
+#else
3149
+   return _mm_cvttsd_si64x(a);
3150
+#endif
3151
+#else
3152
+   simde__m128d_private a_ = simde__m128d_to_private(a);
3153
+   return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]);
3154
+#endif
3155
+}
3156
+#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
3157
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3158
+#define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a)
3159
+#define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a)
3160
+#endif
3161
+
3162
+SIMDE_FUNCTION_ATTRIBUTES
3163
+simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
3164
+{
3165
+#if defined(SIMDE_X86_SSE2_NATIVE)
3166
+   return _mm_div_pd(a, b);
3167
+#else
3168
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3169
+                b_ = simde__m128d_to_private(b);
3170
+
3171
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
3172
+   r_.f64 = a_.f64 / b_.f64;
3173
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3174
+   r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64);
3175
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3176
+   r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128);
3177
+#else
3178
+   SIMDE_VECTORIZE
3179
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
3180
+       r_.f64[i] = a_.f64[i] / b_.f64[i];
3181
+   }
3182
+#endif
3183
+
3184
+   return simde__m128d_from_private(r_);
3185
+#endif
3186
+}
3187
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3188
+#define _mm_div_pd(a, b) simde_mm_div_pd(a, b)
3189
+#endif
3190
+
3191
+SIMDE_FUNCTION_ATTRIBUTES
3192
+simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
3193
+{
3194
+#if defined(SIMDE_X86_SSE2_NATIVE)
3195
+   return _mm_div_sd(a, b);
3196
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3197
+   return simde_mm_move_sd(a, simde_mm_div_pd(a, b));
3198
+#else
3199
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3200
+                b_ = simde__m128d_to_private(b);
3201
+
3202
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3203
+   float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64);
3204
+   r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3205
+#else
3206
+   r_.f64[0] = a_.f64[0] / b_.f64[0];
3207
+   r_.f64[1] = a_.f64[1];
3208
+#endif
3209
+
3210
+   return simde__m128d_from_private(r_);
3211
+#endif
3212
+}
3213
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3214
+#define _mm_div_sd(a, b) simde_mm_div_sd(a, b)
3215
+#endif
3216
+
3217
+SIMDE_FUNCTION_ATTRIBUTES
3218
+int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
3219
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)
3220
+{
3221
+   uint16_t r;
3222
+   simde__m128i_private a_ = simde__m128i_to_private(a);
3223
+
3224
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3225
+#if defined(SIMDE_BUG_GCC_95227)
3226
+   (void)a_;
3227
+   (void)imm8;
3228
+#endif
3229
+   r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8));
3230
+#else
3231
+   r = a_.u16[imm8 & 7];
3232
+#endif
3233
+
3234
+   return HEDLEY_STATIC_CAST(int32_t, r);
3235
+}
3236
+#if defined(SIMDE_X86_SSE2_NATIVE) && \
3237
+   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
3238
+#define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8)
3239
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3240
+#define simde_mm_extract_epi16(a, imm8)                                       \
3241
+   (HEDLEY_STATIC_CAST(                                                  \
3242
+        int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, \
3243
+                    (imm8))) &                           \
3244
+    (INT32_C(0x0000ffff)))
3245
+#endif
3246
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3247
+#define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8)
3248
+#endif
3249
+
3250
+SIMDE_FUNCTION_ATTRIBUTES
3251
+simde__m128i simde_mm_insert_epi16(simde__m128i a, int16_t i, const int imm8)
3252
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7)
3253
+{
3254
+   simde__m128i_private a_ = simde__m128i_to_private(a);
3255
+   a_.i16[imm8 & 7] = i;
3256
+   return simde__m128i_from_private(a_);
3257
+}
3258
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
3259
+#define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8))
3260
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3261
+#define simde_mm_insert_epi16(a, i, imm8) \
3262
+   simde__m128i_from_neon_i16(       \
3263
+       vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8)))
3264
+#endif
3265
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3266
+#define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8)
3267
+#endif
3268
+
3269
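Editor's sketch of the lane round-trip implemented by the two helpers above (extract_epi16_ref and insert_epi16_ref are hypothetical names; only standard C is assumed): insert writes lane (imm8 & 7), and extract reads the lane back as an unsigned 16-bit value zero-extended to 32 bits.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar models of the portable fallback paths above. */
    static int32_t extract_epi16_ref(const int16_t v[8], int imm8)
    {
        return (int32_t)(uint16_t)v[imm8 & 7]; /* read as u16, zero-extend */
    }

    static void insert_epi16_ref(int16_t v[8], int16_t value, int imm8)
    {
        v[imm8 & 7] = value;
    }

    int main(void)
    {
        int16_t v[8] = {0};
        insert_epi16_ref(v, -1, 3);
        printf("%d\n", (int)extract_epi16_ref(v, 3)); /* 65535: -1 zero-extended */
        return 0;
    }
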
+SIMDE_FUNCTION_ATTRIBUTES
3270
+simde__m128d
3271
+simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
3272
+{
3273
+#if defined(SIMDE_X86_SSE2_NATIVE)
3274
+   return _mm_load_pd(mem_addr);
3275
+#else
3276
+   simde__m128d_private r_;
3277
+
3278
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3279
+   r_.neon_f64 = vld1q_f64(mem_addr);
3280
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3281
+   r_.neon_u32 =
3282
+       vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const *, mem_addr));
3283
+#else
3284
+   simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d),
3285
+            sizeof(r_));
3286
+#endif
3287
+
3288
+   return simde__m128d_from_private(r_);
3289
+#endif
3290
+}
3291
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3292
+#define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr)
3293
+#endif
3294
+
3295
+SIMDE_FUNCTION_ATTRIBUTES
3296
+simde__m128d simde_mm_load1_pd(simde_float64 const *mem_addr)
3297
+{
3298
+#if defined(SIMDE_X86_SSE2_NATIVE)
3299
+   return _mm_load1_pd(mem_addr);
3300
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3301
+   return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr));
3302
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3303
+   return simde__m128d_from_wasm_v128(wasm_v64x2_load_splat(mem_addr));
3304
+#else
3305
+   return simde_mm_set1_pd(*mem_addr);
3306
+#endif
3307
+}
3308
+#define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3309
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3310
+#define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr)
3311
+#define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr)
3312
+#endif
3313
+
3314
+SIMDE_FUNCTION_ATTRIBUTES
3315
+simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
3316
+{
3317
+#if defined(SIMDE_X86_SSE2_NATIVE)
3318
+   return _mm_load_sd(mem_addr);
3319
+#else
3320
+   simde__m128d_private r_;
3321
+
3322
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3323
+   r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0);
3324
+#else
3325
+   r_.f64[0] = *mem_addr;
3326
+   r_.u64[1] = UINT64_C(0);
3327
+#endif
3328
+
3329
+   return simde__m128d_from_private(r_);
3330
+#endif
3331
+}
3332
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3333
+#define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr)
3334
+#endif
3335
+
3336
+SIMDE_FUNCTION_ATTRIBUTES
3337
+simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
3338
+{
3339
+#if defined(SIMDE_X86_SSE2_NATIVE)
3340
+   return _mm_load_si128(
3341
+       HEDLEY_REINTERPRET_CAST(__m128i const *, mem_addr));
3342
+#else
3343
+   simde__m128i_private r_;
3344
+
3345
+#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3346
+   r_.altivec_i32 = vec_ld(
3347
+       0, HEDLEY_REINTERPRET_CAST(
3348
+              SIMDE_POWER_ALTIVEC_VECTOR(int) const *, mem_addr));
3349
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3350
+   r_.neon_i32 =
3351
+       vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const *, mem_addr));
3352
+#else
3353
+   simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i),
3354
+            sizeof(simde__m128i));
3355
+#endif
3356
+
3357
+   return simde__m128i_from_private(r_);
3358
+#endif
3359
+}
3360
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3361
+#define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr)
3362
+#endif
3363
+
3364
+SIMDE_FUNCTION_ATTRIBUTES
3365
+simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
3366
+{
3367
+#if defined(SIMDE_X86_SSE2_NATIVE)
3368
+   return _mm_loadh_pd(a, mem_addr);
3369
+#else
3370
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
3371
+
3372
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3373
+   r_.neon_f64 = vcombine_f64(
3374
+       vget_low_f64(a_.neon_f64),
3375
+       vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t *, mem_addr)));
3376
+#else
3377
+   simde_float64 t;
3378
+
3379
+   simde_memcpy(&t, mem_addr, sizeof(t));
3380
+   r_.f64[0] = a_.f64[0];
3381
+   r_.f64[1] = t;
3382
+#endif
3383
+
3384
+   return simde__m128d_from_private(r_);
3385
+#endif
3386
+}
3387
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3388
+#define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr)
3389
+#endif
3390
+
3391
+SIMDE_FUNCTION_ATTRIBUTES
3392
+simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
3393
+{
3394
+#if defined(SIMDE_X86_SSE2_NATIVE)
3395
+   return _mm_loadl_epi64(mem_addr);
3396
+#else
3397
+   simde__m128i_private r_;
3398
+
3399
+   int64_t value;
3400
+   simde_memcpy(&value, mem_addr, sizeof(value));
3401
+
3402
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3403
+   r_.neon_i64 = vcombine_s64(
3404
+       vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)),
3405
+       vdup_n_s64(0));
3406
+#else
3407
+   r_.i64[0] = value;
3408
+   r_.i64[1] = 0;
3409
+#endif
3410
+
3411
+   return simde__m128i_from_private(r_);
3412
+#endif
3413
+}
3414
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3415
+#define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr)
3416
+#endif
3417
+
3418
+SIMDE_FUNCTION_ATTRIBUTES
3419
+simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
3420
+{
3421
+#if defined(SIMDE_X86_SSE2_NATIVE)
3422
+   return _mm_loadl_pd(a, mem_addr);
3423
+#else
3424
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
3425
+
3426
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3427
+   r_.neon_f64 = vcombine_f64(
3428
+       vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t *, mem_addr)),
3429
+       vget_high_f64(a_.neon_f64));
3430
+#else
3431
+   r_.f64[0] = *mem_addr;
3432
+   r_.u64[1] = a_.u64[1];
3433
+#endif
3434
+
3435
+   return simde__m128d_from_private(r_);
3436
+#endif
3437
+}
3438
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3439
+#define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr)
3440
+#endif
3441
+
3442
+SIMDE_FUNCTION_ATTRIBUTES
3443
+simde__m128d
3444
+simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
3445
+{
3446
+#if defined(SIMDE_X86_SSE2_NATIVE)
3447
+   return _mm_loadr_pd(mem_addr);
3448
+#else
3449
+   simde__m128d_private r_;
3450
+
3451
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3452
+   r_.neon_f64 = vld1q_f64(mem_addr);
3453
+   r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1);
3454
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3455
+   r_.neon_i64 =
3456
+       vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
3457
+   r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1);
3458
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3459
+   v128_t tmp = wasm_v128_load(mem_addr);
3460
+   r_.wasm_v128 = wasm_v64x2_shuffle(tmp, tmp, 1, 0);
3461
+#else
3462
+   r_.f64[0] = mem_addr[1];
3463
+   r_.f64[1] = mem_addr[0];
3464
+#endif
3465
+
3466
+   return simde__m128d_from_private(r_);
3467
+#endif
3468
+}
3469
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3470
+#define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr)
3471
+#endif
3472
+
3473
+SIMDE_FUNCTION_ATTRIBUTES
3474
+simde__m128d
3475
+simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
3476
+{
3477
+#if defined(SIMDE_X86_SSE2_NATIVE)
3478
+   return _mm_loadu_pd(mem_addr);
3479
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3480
+   return vld1q_f64(mem_addr);
3481
+#else
3482
+   simde__m128d_private r_;
3483
+
3484
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3485
+
3486
+   return simde__m128d_from_private(r_);
3487
+#endif
3488
+}
3489
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3490
+#define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr)
3491
+#endif
3492
+
3493
+SIMDE_FUNCTION_ATTRIBUTES
3494
+simde__m128i simde_x_mm_loadu_epi8(int8_t const *mem_addr)
3495
+{
3496
+#if defined(SIMDE_X86_SSE2_NATIVE)
3497
+   return _mm_loadu_si128(
3498
+       SIMDE_ALIGN_CAST(simde__m128i const *, mem_addr));
3499
+#else
3500
+   simde__m128i_private r_;
3501
+
3502
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3503
+   r_.neon_i8 =
3504
+       vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const *, mem_addr));
3505
+#else
3506
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3507
+#endif
3508
+
3509
+   return simde__m128i_from_private(r_);
3510
+#endif
3511
+}
3512
+
3513
+SIMDE_FUNCTION_ATTRIBUTES
3514
+simde__m128i simde_x_mm_loadu_epi16(int16_t const *mem_addr)
3515
+{
3516
+#if defined(SIMDE_X86_SSE2_NATIVE)
3517
+   return _mm_loadu_si128(
3518
+       SIMDE_ALIGN_CAST(simde__m128i const *, mem_addr));
3519
+#else
3520
+   simde__m128i_private r_;
3521
+
3522
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3523
+   r_.neon_i16 =
3524
+       vld1q_s16(HEDLEY_REINTERPRET_CAST(int16_t const *, mem_addr));
3525
+#else
3526
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3527
+#endif
3528
+
3529
+   return simde__m128i_from_private(r_);
3530
+#endif
3531
+}
3532
+
3533
+SIMDE_FUNCTION_ATTRIBUTES
3534
+simde__m128i simde_x_mm_loadu_epi32(int32_t const *mem_addr)
3535
+{
3536
+#if defined(SIMDE_X86_SSE2_NATIVE)
3537
+   return _mm_loadu_si128(
3538
+       SIMDE_ALIGN_CAST(simde__m128i const *, mem_addr));
3539
+#else
3540
+   simde__m128i_private r_;
3541
+
3542
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3543
+   r_.neon_i32 =
3544
+       vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const *, mem_addr));
3545
+#else
3546
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3547
+#endif
3548
+
3549
+   return simde__m128i_from_private(r_);
3550
+#endif
3551
+}
3552
+
3553
+SIMDE_FUNCTION_ATTRIBUTES
3554
+simde__m128i simde_x_mm_loadu_epi64(int64_t const *mem_addr)
3555
+{
3556
+#if defined(SIMDE_X86_SSE2_NATIVE)
3557
+   return _mm_loadu_si128(
3558
+       SIMDE_ALIGN_CAST(simde__m128i const *, mem_addr));
3559
+#else
3560
+   simde__m128i_private r_;
3561
+
3562
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3563
+   r_.neon_i64 =
3564
+       vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr));
3565
+#else
3566
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3567
+#endif
3568
+
3569
+   return simde__m128i_from_private(r_);
3570
+#endif
3571
+}
3572
+
3573
+SIMDE_FUNCTION_ATTRIBUTES
3574
+simde__m128i simde_mm_loadu_si128(void const *mem_addr)
3575
+{
3576
+#if defined(SIMDE_X86_SSE2_NATIVE)
3577
+   return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const *, mem_addr));
3578
+#else
3579
+   simde__m128i_private r_;
3580
+
3581
+#if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias, 3, 3, 0)
3582
+   HEDLEY_DIAGNOSTIC_PUSH
3583
+   SIMDE_DIAGNOSTIC_DISABLE_PACKED_
3584
+   struct simde_mm_loadu_si128_s {
3585
+       __typeof__(r_) v;
3586
+   } __attribute__((__packed__, __may_alias__));
3587
+   r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *,
3588
+                    mem_addr)
3589
+            ->v;
3590
+   HEDLEY_DIAGNOSTIC_POP
3591
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3592
+   /* Note that this is a lower priority than the struct above since
3593
+       * clang assumes mem_addr is aligned (since it is a __m128i*). */
3594
+   r_.neon_i32 =
3595
+       vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const *, mem_addr));
3596
+#else
3597
+   simde_memcpy(&r_, mem_addr, sizeof(r_));
3598
+#endif
3599
+
3600
+   return simde__m128i_from_private(r_);
3601
+#endif
3602
+}
3603
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3604
+#define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr)
3605
+#endif
3606
+
3607
+SIMDE_FUNCTION_ATTRIBUTES
3608
+simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
3609
+{
3610
+#if defined(SIMDE_X86_SSE2_NATIVE)
3611
+   return _mm_madd_epi16(a, b);
3612
+#else
3613
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3614
+                b_ = simde__m128i_to_private(b);
3615
+
3616
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3617
+   int32x4_t pl =
3618
+       vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
3619
+   int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16);
3620
+   r_.neon_i32 = vpaddq_s32(pl, ph);
3621
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3622
+   int32x4_t pl =
3623
+       vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16));
3624
+   int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16),
3625
+                vget_high_s16(b_.neon_i16));
3626
+   int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
3627
+   int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
3628
+   r_.neon_i32 = vcombine_s32(rl, rh);
3629
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
3630
+   static const SIMDE_POWER_ALTIVEC_VECTOR(int) tz = {0, 0, 0, 0};
3631
+   r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, tz);
3632
+#else
3633
+   SIMDE_VECTORIZE
3634
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i += 2) {
3635
+       r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) +
3636
+               (a_.i16[i + 1] * b_.i16[i + 1]);
3637
+   }
3638
+#endif
3639
+
3640
+   return simde__m128i_from_private(r_);
3641
+#endif
3642
+}
3643
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3644
+#define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b)
3645
+#endif
3646
+
3647
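Editor's sketch of the pairwise multiply-accumulate performed by the scalar fallback above (madd_epi16_ref is a hypothetical name): adjacent 16-bit products are summed into one 32-bit lane.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the fallback loop above. */
    static void madd_epi16_ref(const int16_t a[8], const int16_t b[8], int32_t r[4])
    {
        for (int i = 0; i < 8; i += 2)
            r[i / 2] = (int32_t)a[i] * b[i] + (int32_t)a[i + 1] * b[i + 1];
    }

    int main(void)
    {
        const int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        const int16_t b[8] = {1, 1, 1, 1, 2, 2, 2, 2};
        int32_t r[4];
        madd_epi16_ref(a, b, r);
        printf("%d %d %d %d\n", (int)r[0], (int)r[1], (int)r[2], (int)r[3]); /* 3 7 22 30 */
        return 0;
    }
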
+SIMDE_FUNCTION_ATTRIBUTES
3648
+void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
3649
+                 int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
3650
+{
3651
+#if defined(SIMDE_X86_SSE2_NATIVE)
3652
+   _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char *, mem_addr));
3653
+#else
3654
+   simde__m128i_private a_ = simde__m128i_to_private(a),
3655
+                mask_ = simde__m128i_to_private(mask);
3656
+
3657
+   for (size_t i = 0; i < (sizeof(a_.i8) / sizeof(a_.i8[0])); i++) {
3658
+       if (mask_.u8[i] & 0x80) {
3659
+           mem_addr[i] = a_.i8[i];
3660
+       }
3661
+   }
3662
+#endif
3663
+}
3664
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3665
+#define _mm_maskmoveu_si128(a, mask, mem_addr) \
3666
+   simde_mm_maskmoveu_si128(              \
3667
+       (a), (mask),                   \
3668
+       SIMDE_CHECKED_REINTERPRET_CAST(int8_t *, char *, (mem_addr)))
3669
+#endif
3670
+
3671
+SIMDE_FUNCTION_ATTRIBUTES
3672
+int32_t simde_mm_movemask_epi8(simde__m128i a)
3673
+{
3674
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER)
3675
+   /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */
3676
+   return _mm_movemask_epi8(a);
3677
+#else
3678
+   int32_t r = 0;
3679
+   simde__m128i_private a_ = simde__m128i_to_private(a);
3680
+
3681
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3682
+   uint8x16_t input = a_.neon_u8;
3683
+   const int8_t xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0,
3684
+                  -7, -6, -5, -4, -3, -2, -1, 0};
3685
+   const uint8x16_t mask_and = vdupq_n_u8(0x80);
3686
+   const int8x16_t mask_shift = vld1q_s8(xr);
3687
+   const uint8x16_t mask_result =
3688
+       vshlq_u8(vandq_u8(input, mask_and), mask_shift);
3689
+   uint8x8_t lo = vget_low_u8(mask_result);
3690
+   uint8x8_t hi = vget_high_u8(mask_result);
3691
+   r = vaddv_u8(lo) + (vaddv_u8(hi) << 8);
3692
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3693
+   // Use increasingly wide shifts+adds to collect the sign bits
3694
+   // together.
3695
+   // Since the widening shifts would be rather confusing to follow in little endian, everything
3696
+   // will be illustrated in big endian order instead. This has a different result - the bits
3697
+   // would actually be reversed on a big endian machine.
3698
+
3699
+   // Starting input (only half the elements are shown):
3700
+   // 89 ff 1d c0 00 10 99 33
3701
+   uint8x16_t input = a_.neon_u8;
3702
+
3703
+   // Shift out everything but the sign bits with an unsigned shift right.
3704
+   //
3705
+   // Bytes of the vector::
3706
+   // 89 ff 1d c0 00 10 99 33
3707
+   // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
3708
+   //  |  |  |  |  |  |  |  |
3709
+   // 01 01 00 01 00 00 01 00
3710
+   //
3711
+   // Bits of first important lane(s):
3712
+   // 10001001 (89)
3713
+   // \______
3714
+   //        |
3715
+   // 00000001 (01)
3716
+   uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
3717
+
3718
+   // Merge the even lanes together with a 16-bit unsigned shift right + add.
3719
+   // 'xx' represents garbage data which will be ignored in the final result.
3720
+   // In the important bytes, the add functions like a binary OR.
3721
+   //
3722
+   // 01 01 00 01 00 00 01 00
3723
+   //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
3724
+   //    \|    \|    \|    \|
3725
+   // xx 03 xx 01 xx 00 xx 02
3726
+   //
3727
+   // 00000001 00000001 (01 01)
3728
+   //        \_______ |
3729
+   //                \|
3730
+   // xxxxxxxx xxxxxx11 (xx 03)
3731
+   uint32x4_t paired16 =
3732
+       vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
3733
+
3734
+   // Repeat with a wider 32-bit shift + add.
3735
+   // xx 03 xx 01 xx 00 xx 02
3736
+   //     \____ |     \____ |  paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
3737
+   //          \|          \|
3738
+   // xx xx xx 0d xx xx xx 02
3739
+   //
3740
+   // 00000011 00000001 (03 01)
3741
+   //        \\_____ ||
3742
+   //         '----.\||
3743
+   // xxxxxxxx xxxx1101 (xx 0d)
3744
+   uint64x2_t paired32 =
3745
+       vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
3746
+
3747
+   // Last, an even wider 64-bit shift + add to get our result in the low 8 bit lanes.
3748
+   // xx xx xx 0d xx xx xx 02
3749
+   //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
3750
+   //                      \|
3751
+   // xx xx xx xx xx xx xx d2
3752
+   //
3753
+   // 00001101 00000010 (0d 02)
3754
+   //     \   \___ |  |
3755
+   //      '---.  \|  |
3756
+   // xxxxxxxx 11010010 (xx d2)
3757
+   uint8x16_t paired64 =
3758
+       vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
3759
+
3760
+   // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
3761
+   // xx xx xx xx xx xx xx d2
3762
+   //                      ||  return paired64[0]
3763
+   //                      d2
3764
+   // Note: Little endian would return the correct value 4b (01001011) instead.
3765
+   r = vgetq_lane_u8(paired64, 0) |
3766
+       (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u8(paired64, 8)) << 8);
3767
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
3768
+   !defined(HEDLEY_IBM_VERSION) &&         \
3769
+   (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
3770
+   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
3771
+       perm = {120, 112, 104, 96, 88, 80, 72, 64,
3772
+           56,  48,  40,  32, 24, 16, 8,  0};
3773
+   r = HEDLEY_STATIC_CAST(
3774
+       int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1));
3775
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
3776
+   !defined(HEDLEY_IBM_VERSION) &&         \
3777
+   (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG)
3778
+   static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char)
3779
+       perm = {120, 112, 104, 96, 88, 80, 72, 64,
3780
+           56,  48,  40,  32, 24, 16, 8,  0};
3781
+   r = HEDLEY_STATIC_CAST(
3782
+       int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14));
3783
+#else
3784
+   SIMDE_VECTORIZE_REDUCTION(| : r)
3785
+   for (size_t i = 0; i < (sizeof(a_.u8) / sizeof(a_.u8[0])); i++) {
3786
+       r |= (a_.u8[15 - i] >> 7) << (15 - i);
3787
+   }
3788
+#endif
3789
+
3790
+   return r;
3791
+#endif
3792
+}
3793
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3794
+#define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a)
3795
+#endif
3796
+
3797
+SIMDE_FUNCTION_ATTRIBUTES
3798
+int32_t simde_mm_movemask_pd(simde__m128d a)
3799
+{
3800
+#if defined(SIMDE_X86_SSE2_NATIVE)
3801
+   return _mm_movemask_pd(a);
3802
+#else
3803
+   int32_t r = 0;
3804
+   simde__m128d_private a_ = simde__m128d_to_private(a);
3805
+
3806
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3807
+   static const int64_t shift_amount[] = {0, 1};
3808
+   const int64x2_t shift = vld1q_s64(shift_amount);
3809
+   uint64x2_t tmp = vshrq_n_u64(a_.neon_u64, 63);
3810
+   return HEDLEY_STATIC_CAST(int32_t, vaddvq_u64(vshlq_u64(tmp, shift)));
3811
+#else
3812
+   SIMDE_VECTORIZE_REDUCTION(| : r)
3813
+   for (size_t i = 0; i < (sizeof(a_.u64) / sizeof(a_.u64[0])); i++) {
3814
+       r |= (a_.u64[i] >> 63) << i;
3815
+   }
3816
+#endif
3817
+
3818
+   return r;
3819
+#endif
3820
+}
3821
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3822
+#define _mm_movemask_pd(a) simde_mm_movemask_pd(a)
3823
+#endif
3824
+
3825
+SIMDE_FUNCTION_ATTRIBUTES
3826
+simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
3827
+{
3828
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3829
+   return _mm_movepi64_pi64(a);
3830
+#else
3831
+   simde__m64_private r_;
3832
+   simde__m128i_private a_ = simde__m128i_to_private(a);
3833
+
3834
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3835
+   r_.neon_i64 = vget_low_s64(a_.neon_i64);
3836
+#else
3837
+   r_.i64[0] = a_.i64[0];
3838
+#endif
3839
+
3840
+   return simde__m64_from_private(r_);
3841
+#endif
3842
+}
3843
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3844
+#define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a)
3845
+#endif
3846
+
3847
+SIMDE_FUNCTION_ATTRIBUTES
3848
+simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
3849
+{
3850
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
3851
+   return _mm_movpi64_epi64(a);
3852
+#else
3853
+   simde__m128i_private r_;
3854
+   simde__m64_private a_ = simde__m64_to_private(a);
3855
+
3856
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3857
+   r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0));
3858
+#else
3859
+   r_.i64[0] = a_.i64[0];
3860
+   r_.i64[1] = 0;
3861
+#endif
3862
+
3863
+   return simde__m128i_from_private(r_);
3864
+#endif
3865
+}
3866
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3867
+#define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a)
3868
+#endif
3869
+
3870
+SIMDE_FUNCTION_ATTRIBUTES
3871
+simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
3872
+{
3873
+#if defined(SIMDE_X86_SSE2_NATIVE)
3874
+   return _mm_min_epi16(a, b);
3875
+#else
3876
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3877
+                b_ = simde__m128i_to_private(b);
3878
+
3879
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3880
+   r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16);
3881
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3882
+   r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128);
3883
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3884
+   r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16);
3885
+#else
3886
+   SIMDE_VECTORIZE
3887
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
3888
+       r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i];
3889
+   }
3890
+#endif
3891
+
3892
+   return simde__m128i_from_private(r_);
3893
+#endif
3894
+}
3895
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3896
+#define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b)
3897
+#endif
3898
+
3899
+SIMDE_FUNCTION_ATTRIBUTES
3900
+simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
3901
+{
3902
+#if defined(SIMDE_X86_SSE2_NATIVE)
3903
+   return _mm_min_epu8(a, b);
3904
+#else
3905
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3906
+                b_ = simde__m128i_to_private(b);
3907
+
3908
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3909
+   r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8);
3910
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3911
+   r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128);
3912
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3913
+   r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8);
3914
+#else
3915
+   SIMDE_VECTORIZE
3916
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
3917
+       r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i];
3918
+   }
3919
+#endif
3920
+
3921
+   return simde__m128i_from_private(r_);
3922
+#endif
3923
+}
3924
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3925
+#define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b)
3926
+#endif
3927
+
3928
+SIMDE_FUNCTION_ATTRIBUTES
3929
+simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
3930
+{
3931
+#if defined(SIMDE_X86_SSE2_NATIVE)
3932
+   return _mm_min_pd(a, b);
3933
+#else
3934
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3935
+                b_ = simde__m128d_to_private(b);
3936
+
3937
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
3938
+   r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64);
3939
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3940
+   r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64);
3941
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3942
+   r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128);
3943
+#else
3944
+   SIMDE_VECTORIZE
3945
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
3946
+       r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i];
3947
+   }
3948
+#endif
3949
+
3950
+   return simde__m128d_from_private(r_);
3951
+#endif
3952
+}
3953
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3954
+#define _mm_min_pd(a, b) simde_mm_min_pd(a, b)
3955
+#endif
3956
+
3957
+SIMDE_FUNCTION_ATTRIBUTES
3958
+simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
3959
+{
3960
+#if defined(SIMDE_X86_SSE2_NATIVE)
3961
+   return _mm_min_sd(a, b);
3962
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
3963
+   return simde_mm_move_sd(a, simde_mm_min_pd(a, b));
3964
+#else
3965
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
3966
+                b_ = simde__m128d_to_private(b);
3967
+
3968
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
3969
+   float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64);
3970
+   r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
3971
+#else
3972
+   r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0];
3973
+   r_.f64[1] = a_.f64[1];
3974
+#endif
3975
+
3976
+   return simde__m128d_from_private(r_);
3977
+#endif
3978
+}
3979
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
3980
+#define _mm_min_sd(a, b) simde_mm_min_sd(a, b)
3981
+#endif
3982
+
3983
+SIMDE_FUNCTION_ATTRIBUTES
3984
+simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
3985
+{
3986
+#if defined(SIMDE_X86_SSE2_NATIVE)
3987
+   return _mm_max_epi16(a, b);
3988
+#else
3989
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
3990
+                b_ = simde__m128i_to_private(b);
3991
+
3992
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
3993
+   r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16);
3994
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
3995
+   r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128);
3996
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
3997
+   r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16);
3998
+#else
3999
+   SIMDE_VECTORIZE
4000
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4001
+       r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i];
4002
+   }
4003
+#endif
4004
+
4005
+   return simde__m128i_from_private(r_);
4006
+#endif
4007
+}
4008
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4009
+#define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b)
4010
+#endif
4011
+
4012
+SIMDE_FUNCTION_ATTRIBUTES
4013
+simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
4014
+{
4015
+#if defined(SIMDE_X86_SSE2_NATIVE)
4016
+   return _mm_max_epu8(a, b);
4017
+#else
4018
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4019
+                b_ = simde__m128i_to_private(b);
4020
+
4021
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4022
+   r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8);
4023
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4024
+   r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128);
4025
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4026
+   r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8);
4027
+#else
4028
+   SIMDE_VECTORIZE
4029
+   for (size_t i = 0; i < (sizeof(r_.u8) / sizeof(r_.u8[0])); i++) {
4030
+       r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i];
4031
+   }
4032
+#endif
4033
+
4034
+   return simde__m128i_from_private(r_);
4035
+#endif
4036
+}
4037
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4038
+#define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b)
4039
+#endif
4040
+
4041
+SIMDE_FUNCTION_ATTRIBUTES
4042
+simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
4043
+{
4044
+#if defined(SIMDE_X86_SSE2_NATIVE)
4045
+   return _mm_max_pd(a, b);
4046
+#else
4047
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4048
+                b_ = simde__m128d_to_private(b);
4049
+
4050
+#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
4051
+   r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64);
4052
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4053
+   r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128);
4054
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4055
+   r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64);
4056
+#else
4057
+   SIMDE_VECTORIZE
4058
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
4059
+       r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i];
4060
+   }
4061
+#endif
4062
+
4063
+   return simde__m128d_from_private(r_);
4064
+#endif
4065
+}
4066
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4067
+#define _mm_max_pd(a, b) simde_mm_max_pd(a, b)
4068
+#endif
4069
+
4070
+SIMDE_FUNCTION_ATTRIBUTES
4071
+simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
4072
+{
4073
+#if defined(SIMDE_X86_SSE2_NATIVE)
4074
+   return _mm_max_sd(a, b);
4075
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4076
+   return simde_mm_move_sd(a, simde_mm_max_pd(a, b));
4077
+#else
4078
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4079
+                b_ = simde__m128d_to_private(b);
4080
+
4081
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4082
+   float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64);
4083
+   r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
4084
+#else
4085
+   r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? a_.f64[0] : b_.f64[0];
4086
+   r_.f64[1] = a_.f64[1];
4087
+#endif
4088
+
4089
+   return simde__m128d_from_private(r_);
4090
+#endif
4091
+}
4092
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4093
+#define _mm_max_sd(a, b) simde_mm_max_sd(a, b)
4094
+#endif
4095
+
4096
+SIMDE_FUNCTION_ATTRIBUTES
4097
+simde__m128i simde_mm_move_epi64(simde__m128i a)
4098
+{
4099
+#if defined(SIMDE_X86_SSE2_NATIVE)
4100
+   return _mm_move_epi64(a);
4101
+#else
4102
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
4103
+
4104
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4105
+   r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1);
4106
+#else
4107
+   r_.i64[0] = a_.i64[0];
4108
+   r_.i64[1] = 0;
4109
+#endif
4110
+
4111
+   return simde__m128i_from_private(r_);
4112
+#endif
4113
+}
4114
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4115
+#define _mm_move_epi64(a) simde_mm_move_epi64(a)
4116
+#endif
4117
+
4118
+SIMDE_FUNCTION_ATTRIBUTES
4119
+simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
4120
+{
4121
+#if defined(SIMDE_X86_SSE2_NATIVE)
4122
+   return _mm_mul_epu32(a, b);
4123
+#else
4124
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4125
+                b_ = simde__m128i_to_private(b);
4126
+
4127
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4128
+   uint32x2_t a_lo = vmovn_u64(a_.neon_u64);
4129
+   uint32x2_t b_lo = vmovn_u64(b_.neon_u64);
4130
+   r_.neon_u64 = vmull_u32(a_lo, b_lo);
4131
+#elif defined(SIMDE_SHUFFLE_VECTOR_) && \
4132
+   (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE)
4133
+   __typeof__(a_.u32) z = {
4134
+       0,
4135
+   };
4136
+   a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6);
4137
+   b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6);
4138
+   r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) *
4139
+        HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32);
4140
+#else
4141
+   SIMDE_VECTORIZE
4142
+   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
4143
+       r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) *
4144
+               HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]);
4145
+   }
4146
+#endif
4147
+
4148
+   return simde__m128i_from_private(r_);
4149
+#endif
4150
+}
4151
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4152
+#define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b)
4153
+#endif
4154
+
4155
+SIMDE_FUNCTION_ATTRIBUTES
4156
+simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
4157
+{
4158
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4159
+                b_ = simde__m128i_to_private(b);
4160
+
4161
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4162
+   r_.i64 = a_.i64 * b_.i64;
4163
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4164
+   /* NEON has no 64-bit integer multiply intrinsic; build the result per lane. */
+   r_.neon_i64 = vsetq_lane_s64(a_.i64[1] * b_.i64[1],
+                    vdupq_n_s64(a_.i64[0] * b_.i64[0]), 1);
4165
+#else
4166
+   SIMDE_VECTORIZE
4167
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4168
+       r_.i64[i] = a_.i64[i] * b_.i64[i];
4169
+   }
4170
+#endif
4171
+
4172
+   return simde__m128i_from_private(r_);
4173
+}
4174
+
4175
+SIMDE_FUNCTION_ATTRIBUTES
4176
+simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
4177
+{
4178
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4179
+                b_ = simde__m128i_to_private(b);
4180
+
4181
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4182
+   r_.i64 = a_.i64 % b_.i64;
4183
+#else
4184
+   SIMDE_VECTORIZE
4185
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4186
+       r_.i64[i] = a_.i64[i] % b_.i64[i];
4187
+   }
4188
+#endif
4189
+
4190
+   return simde__m128i_from_private(r_);
4191
+}
4192
+
4193
+SIMDE_FUNCTION_ATTRIBUTES
4194
+simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
4195
+{
4196
+#if defined(SIMDE_X86_SSE2_NATIVE)
4197
+   return _mm_mul_pd(a, b);
4198
+#else
4199
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4200
+                b_ = simde__m128d_to_private(b);
4201
+
4202
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4203
+   r_.f64 = a_.f64 * b_.f64;
4204
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4205
+   r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64);
4206
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4207
+   r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128);
4208
+#else
4209
+   SIMDE_VECTORIZE
4210
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
4211
+       r_.f64[i] = a_.f64[i] * b_.f64[i];
4212
+   }
4213
+#endif
4214
+
4215
+   return simde__m128d_from_private(r_);
4216
+#endif
4217
+}
4218
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4219
+#define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b)
4220
+#endif
4221
+
4222
+SIMDE_FUNCTION_ATTRIBUTES
4223
+simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
4224
+{
4225
+#if defined(SIMDE_X86_SSE2_NATIVE)
4226
+   return _mm_mul_sd(a, b);
4227
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
4228
+   return simde_mm_move_sd(a, simde_mm_mul_pd(a, b));
4229
+#else
4230
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4231
+                b_ = simde__m128d_to_private(b);
4232
+
4233
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4234
+   float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64);
4235
+   r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(a_.neon_f64, 1), temp, 1);
4236
+#else
4237
+   r_.f64[0] = a_.f64[0] * b_.f64[0];
4238
+   r_.f64[1] = a_.f64[1];
4239
+#endif
4240
+
4241
+   return simde__m128d_from_private(r_);
4242
+#endif
4243
+}
4244
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4245
+#define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b)
4246
+#endif
4247
+
4248
+SIMDE_FUNCTION_ATTRIBUTES
4249
+simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
4250
+{
4251
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && \
4252
+   !defined(__PGI)
4253
+   return _mm_mul_su32(a, b);
4254
+#else
4255
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
4256
+                  b_ = simde__m64_to_private(b);
4257
+
4258
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4259
+   r_.u64[0] = vget_lane_u64(
4260
+       vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64),
4261
+                      vreinterpret_u32_s64(b_.neon_i64))),
4262
+       0);
4263
+#else
4264
+   r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) *
4265
+           HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]);
4266
+#endif
4267
+
4268
+   return simde__m64_from_private(r_);
4269
+#endif
4270
+}
4271
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4272
+#define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b)
4273
+#endif
4274
+
4275
+SIMDE_FUNCTION_ATTRIBUTES
4276
+simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
4277
+{
4278
+#if defined(SIMDE_X86_SSE2_NATIVE)
4279
+   return _mm_mulhi_epi16(a, b);
4280
+#else
4281
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4282
+                b_ = simde__m128i_to_private(b);
4283
+
4284
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4285
+   int16x4_t a3210 = vget_low_s16(a_.neon_i16);
4286
+   int16x4_t b3210 = vget_low_s16(b_.neon_i16);
4287
+   int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4288
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4289
+   int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16);
4290
+   r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210),
4291
+                vreinterpretq_s16_s32(ab7654));
4292
+#else
4293
+   int16x4_t a7654 = vget_high_s16(a_.neon_i16);
4294
+   int16x4_t b7654 = vget_high_s16(b_.neon_i16);
4295
+   int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4296
+   uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
4297
+                   vreinterpretq_u16_s32(ab7654));
4298
+   r_.neon_u16 = rv.val[1];
4299
+#endif
4300
+#else
4301
+   SIMDE_VECTORIZE
4302
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4303
+       r_.u16[i] = HEDLEY_STATIC_CAST(
4304
+           uint16_t,
4305
+           (HEDLEY_STATIC_CAST(
4306
+                uint32_t,
4307
+                HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) *
4308
+                    HEDLEY_STATIC_CAST(int32_t,
4309
+                               b_.i16[i])) >>
4310
+            16));
4311
+   }
4312
+#endif
4313
+
4314
+   return simde__m128i_from_private(r_);
4315
+#endif
4316
+}
4317
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4318
+#define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b)
4319
+#endif
4320
+
4321
+SIMDE_FUNCTION_ATTRIBUTES
4322
+simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
4323
+{
4324
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
4325
+   return _mm_mulhi_epu16(a, b);
4326
+#else
4327
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4328
+                b_ = simde__m128i_to_private(b);
4329
+
4330
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4331
+   uint16x4_t a3210 = vget_low_u16(a_.neon_u16);
4332
+   uint16x4_t b3210 = vget_low_u16(b_.neon_u16);
4333
+   uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */
4334
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4335
+   uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16);
4336
+   r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4337
+                vreinterpretq_u16_u32(ab7654));
4338
+#else
4339
+   uint16x4_t a7654 = vget_high_u16(a_.neon_u16);
4340
+   uint16x4_t b7654 = vget_high_u16(b_.neon_u16);
4341
+   uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */
4342
+   uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210),
4343
+                   vreinterpretq_u16_u32(ab7654));
4344
+   r_.neon_u16 = neon_r.val[1];
4345
+#endif
4346
+#else
4347
+   SIMDE_VECTORIZE
4348
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
4349
+       r_.u16[i] = HEDLEY_STATIC_CAST(
4350
+           uint16_t,
4351
+           HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
4352
+                   HEDLEY_STATIC_CAST(uint32_t,
4353
+                              b_.u16[i]) >>
4354
+               16);
4355
+   }
4356
+#endif
4357
+
4358
+   return simde__m128i_from_private(r_);
4359
+#endif
4360
+}
4361
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4362
+#define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b)
4363
+#endif
4364
+
4365
+SIMDE_FUNCTION_ATTRIBUTES
4366
+simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
4367
+{
4368
+#if defined(SIMDE_X86_SSE2_NATIVE)
4369
+   return _mm_mullo_epi16(a, b);
4370
+#else
4371
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4372
+                b_ = simde__m128i_to_private(b);
4373
+
4374
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4375
+   r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16);
4376
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4377
+   (void)a_;
4378
+   (void)b_;
4379
+   r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16);
4380
+#else
4381
+   SIMDE_VECTORIZE
4382
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4383
+       r_.u16[i] = HEDLEY_STATIC_CAST(
4384
+           uint16_t,
4385
+           HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) *
4386
+               HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]));
4387
+   }
4388
+#endif
4389
+
4390
+   return simde__m128i_from_private(r_);
4391
+#endif
4392
+}
4393
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4394
+#define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b)
4395
+#endif
4396
+
4397
+SIMDE_FUNCTION_ATTRIBUTES
4398
+simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
4399
+{
4400
+#if defined(SIMDE_X86_SSE2_NATIVE)
4401
+   return _mm_or_pd(a, b);
4402
+#else
4403
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
4404
+                b_ = simde__m128d_to_private(b);
4405
+
4406
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4407
+   r_.i32f = a_.i32f | b_.i32f;
4408
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4409
+   r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128);
4410
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4411
+   r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64);
4412
+#else
4413
+   SIMDE_VECTORIZE
4414
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
4415
+       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4416
+   }
4417
+#endif
4418
+
4419
+   return simde__m128d_from_private(r_);
4420
+#endif
4421
+}
4422
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4423
+#define _mm_or_pd(a, b) simde_mm_or_pd(a, b)
4424
+#endif
4425
+
4426
+SIMDE_FUNCTION_ATTRIBUTES
4427
+simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
4428
+{
4429
+#if defined(SIMDE_X86_SSE2_NATIVE)
4430
+   return _mm_or_si128(a, b);
4431
+#else
4432
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4433
+                b_ = simde__m128i_to_private(b);
4434
+
4435
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4436
+   r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32);
4437
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4438
+   r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32);
4439
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
4440
+   r_.i32f = a_.i32f | b_.i32f;
4441
+#else
4442
+   SIMDE_VECTORIZE
4443
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
4444
+       r_.i32f[i] = a_.i32f[i] | b_.i32f[i];
4445
+   }
4446
+#endif
4447
+
4448
+   return simde__m128i_from_private(r_);
4449
+#endif
4450
+}
4451
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4452
+#define _mm_or_si128(a, b) simde_mm_or_si128(a, b)
4453
+#endif
4454
+
4455
+SIMDE_FUNCTION_ATTRIBUTES
4456
+simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
4457
+{
4458
+#if defined(SIMDE_X86_SSE2_NATIVE)
4459
+   return _mm_packs_epi16(a, b);
4460
+#else
4461
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4462
+                b_ = simde__m128i_to_private(b);
4463
+
4464
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4465
+   r_.neon_i8 =
4466
+       vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16));
4467
+#else
4468
+   SIMDE_VECTORIZE
4469
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4470
+       r_.i8[i] = (a_.i16[i] > INT8_MAX)
4471
+                  ? INT8_MAX
4472
+                  : ((a_.i16[i] < INT8_MIN)
4473
+                         ? INT8_MIN
4474
+                         : HEDLEY_STATIC_CAST(int8_t,
4475
+                                  a_.i16[i]));
4476
+       r_.i8[i + 8] = (b_.i16[i] > INT8_MAX)
4477
+                      ? INT8_MAX
4478
+                      : ((b_.i16[i] < INT8_MIN)
4479
+                         ? INT8_MIN
4480
+                         : HEDLEY_STATIC_CAST(
4481
+                               int8_t, b_.i16[i]));
4482
+   }
4483
+#endif
4484
+
4485
+   return simde__m128i_from_private(r_);
4486
+#endif
4487
+}
4488
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4489
+#define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b)
4490
+#endif
4491
+
4492
+SIMDE_FUNCTION_ATTRIBUTES
4493
+simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
4494
+{
4495
+#if defined(SIMDE_X86_SSE2_NATIVE)
4496
+   return _mm_packs_epi32(a, b);
4497
+#else
4498
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4499
+                b_ = simde__m128i_to_private(b);
4500
+
4501
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4502
+   r_.neon_i16 =
4503
+       vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32));
4504
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4505
+   r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32);
4506
+#else
4507
+   SIMDE_VECTORIZE
4508
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
4509
+       r_.i16[i] = (a_.i32[i] > INT16_MAX)
4510
+                   ? INT16_MAX
4511
+                   : ((a_.i32[i] < INT16_MIN)
4512
+                          ? INT16_MIN
4513
+                          : HEDLEY_STATIC_CAST(int16_t,
4514
+                                   a_.i32[i]));
4515
+       r_.i16[i + 4] =
4516
+           (b_.i32[i] > INT16_MAX)
4517
+               ? INT16_MAX
4518
+               : ((b_.i32[i] < INT16_MIN)
4519
+                      ? INT16_MIN
4520
+                      : HEDLEY_STATIC_CAST(int16_t,
4521
+                               b_.i32[i]));
4522
+   }
4523
+#endif
4524
+
4525
+   return simde__m128i_from_private(r_);
4526
+#endif
4527
+}
4528
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4529
+#define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b)
4530
+#endif
4531
+
4532
+SIMDE_FUNCTION_ATTRIBUTES
4533
+simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
4534
+{
4535
+#if defined(SIMDE_X86_SSE2_NATIVE)
4536
+   return _mm_packus_epi16(a, b);
4537
+#else
4538
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4539
+                b_ = simde__m128i_to_private(b);
4540
+
4541
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4542
+   r_.neon_u8 =
4543
+       vcombine_u8(vqmovun_s16(a_.neon_i16), vqmovun_s16(b_.neon_i16));
4544
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
4545
+   r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16);
4546
+#else
4547
+   SIMDE_VECTORIZE
4548
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
4549
+       r_.u8[i] = (a_.i16[i] > UINT8_MAX)
4550
+                  ? UINT8_MAX
4551
+                  : ((a_.i16[i] < 0)
4552
+                         ? UINT8_C(0)
4553
+                         : HEDLEY_STATIC_CAST(uint8_t,
4554
+                                  a_.i16[i]));
4555
+       r_.u8[i + 8] =
4556
+           (b_.i16[i] > UINT8_MAX)
4557
+               ? UINT8_MAX
4558
+               : ((b_.i16[i] < 0)
4559
+                      ? UINT8_C(0)
4560
+                      : HEDLEY_STATIC_CAST(uint8_t,
4561
+                               b_.i16[i]));
4562
+   }
4563
+#endif
4564
+
4565
+   return simde__m128i_from_private(r_);
4566
+#endif
4567
+}
4568
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4569
+#define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b)
4570
+#endif
4571
+
4572
+SIMDE_FUNCTION_ATTRIBUTES
4573
+void simde_mm_pause(void)
4574
+{
4575
+#if defined(SIMDE_X86_SSE2_NATIVE)
4576
+   _mm_pause();
4577
+#endif
4578
+}
4579
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4580
+#define _mm_pause() (simde_mm_pause())
4581
+#endif
4582
+
4583
+SIMDE_FUNCTION_ATTRIBUTES
4584
+simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
4585
+{
4586
+#if defined(SIMDE_X86_SSE2_NATIVE)
4587
+   return _mm_sad_epu8(a, b);
4588
+#else
4589
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
4590
+                b_ = simde__m128i_to_private(b);
4591
+
4592
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4593
+   const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8));
4594
+   r_.neon_u64 = vcombine_u64(vpaddl_u32(vpaddl_u16(vget_low_u16(t))),
4595
+                  vpaddl_u32(vpaddl_u16(vget_high_u16(t))));
4596
+#else
4597
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
4598
+       uint16_t tmp = 0;
4599
+       SIMDE_VECTORIZE_REDUCTION(+ : tmp)
4600
+       for (size_t j = 0; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2);
4601
+            j++) {
4602
+           const size_t e = j + (i * 8);
4603
+           tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e])
4604
+                            : (b_.u8[e] - a_.u8[e]);
4605
+       }
4606
+       r_.i64[i] = tmp;
4607
+   }
4608
+#endif
4609
+
4610
+   return simde__m128i_from_private(r_);
4611
+#endif
4612
+}
4613
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4614
+#define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b)
4615
+#endif
4616
+
4617
+SIMDE_FUNCTION_ATTRIBUTES
4618
+simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
4619
+                  int8_t e11, int8_t e10, int8_t e9, int8_t e8,
4620
+                  int8_t e7, int8_t e6, int8_t e5, int8_t e4,
4621
+                  int8_t e3, int8_t e2, int8_t e1, int8_t e0)
4622
+{
4623
+
4624
+#if defined(SIMDE_X86_SSE2_NATIVE)
4625
+   return _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
4626
+               e4, e3, e2, e1, e0);
4627
+#else
4628
+   simde__m128i_private r_;
4629
+
4630
+#if defined(SIMDE_WASM_SIMD128_NATIVE)
4631
+   r_.wasm_v128 = wasm_i8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9,
4632
+                      e10, e11, e12, e13, e14, e15);
4633
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4634
+   SIMDE_ALIGN_LIKE_16(int8x16_t)
4635
+   int8_t data[16] = {e0, e1, e2,  e3,  e4,  e5,  e6,  e7,
4636
+              e8, e9, e10, e11, e12, e13, e14, e15};
4637
+   r_.neon_i8 = vld1q_s8(data);
4638
+#else
4639
+   r_.i8[0] = e0;
4640
+   r_.i8[1] = e1;
4641
+   r_.i8[2] = e2;
4642
+   r_.i8[3] = e3;
4643
+   r_.i8[4] = e4;
4644
+   r_.i8[5] = e5;
4645
+   r_.i8[6] = e6;
4646
+   r_.i8[7] = e7;
4647
+   r_.i8[8] = e8;
4648
+   r_.i8[9] = e9;
4649
+   r_.i8[10] = e10;
4650
+   r_.i8[11] = e11;
4651
+   r_.i8[12] = e12;
4652
+   r_.i8[13] = e13;
4653
+   r_.i8[14] = e14;
4654
+   r_.i8[15] = e15;
4655
+#endif
4656
+
4657
+   return simde__m128i_from_private(r_);
4658
+#endif
4659
+}
4660
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4661
+#define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, \
4662
+            e2, e1, e0)                                               \
4663
+   simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,    \
4664
+             e4, e3, e2, e1, e0)
4665
+#endif
4666
+
4667
+SIMDE_FUNCTION_ATTRIBUTES
4668
+simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
4669
+               int16_t e3, int16_t e2, int16_t e1, int16_t e0)
4670
+{
4671
+#if defined(SIMDE_X86_SSE2_NATIVE)
4672
+   return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
4673
+#else
4674
+   simde__m128i_private r_;
4675
+
4676
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4677
+   SIMDE_ALIGN_LIKE_16(int16x8_t)
4678
+   int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
4679
+   r_.neon_i16 = vld1q_s16(data);
4680
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4681
+   r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7);
4682
+#else
4683
+   r_.i16[0] = e0;
4684
+   r_.i16[1] = e1;
4685
+   r_.i16[2] = e2;
4686
+   r_.i16[3] = e3;
4687
+   r_.i16[4] = e4;
4688
+   r_.i16[5] = e5;
4689
+   r_.i16[6] = e6;
4690
+   r_.i16[7] = e7;
4691
+#endif
4692
+
4693
+   return simde__m128i_from_private(r_);
4694
+#endif
4695
+}
4696
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4697
+#define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
4698
+   simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
4699
+#endif
4700
+
4701
+SIMDE_FUNCTION_ATTRIBUTES
4702
+simde__m128i simde_mm_loadu_si16(void const *mem_addr)
4703
+{
4704
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
4705
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
4706
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
4707
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
4708
+   return _mm_loadu_si16(mem_addr);
4709
+#else
4710
+   int16_t val;
4711
+   simde_memcpy(&val, mem_addr, sizeof(val));
4712
+   return simde_x_mm_cvtsi16_si128(val);
4713
+#endif
4714
+}
4715
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4716
+#define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr)
4717
+#endif
4718
+
4719
+SIMDE_FUNCTION_ATTRIBUTES
4720
+simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
4721
+{
4722
+#if defined(SIMDE_X86_SSE2_NATIVE)
4723
+   return _mm_set_epi32(e3, e2, e1, e0);
4724
+#else
4725
+   simde__m128i_private r_;
4726
+
4727
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4728
+   SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = {e0, e1, e2, e3};
4729
+   r_.neon_i32 = vld1q_s32(data);
4730
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4731
+   r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3);
4732
+#else
4733
+   r_.i32[0] = e0;
4734
+   r_.i32[1] = e1;
4735
+   r_.i32[2] = e2;
4736
+   r_.i32[3] = e3;
4737
+#endif
4738
+
4739
+   return simde__m128i_from_private(r_);
4740
+#endif
4741
+}
4742
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4743
+#define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0)
4744
+#endif
4745
+
4746
+SIMDE_FUNCTION_ATTRIBUTES
4747
+simde__m128i simde_mm_loadu_si32(void const *mem_addr)
4748
+{
4749
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
4750
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
4751
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
4752
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
4753
+   return _mm_loadu_si32(mem_addr);
4754
+#else
4755
+   int32_t val;
4756
+   simde_memcpy(&val, mem_addr, sizeof(val));
4757
+   return simde_mm_cvtsi32_si128(val);
4758
+#endif
4759
+}
4760
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4761
+#define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr)
4762
+#endif
4763
+
4764
+SIMDE_FUNCTION_ATTRIBUTES
4765
+simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
4766
+{
4767
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
4768
+   return _mm_set_epi64(e1, e0);
4769
+#else
4770
+   simde__m128i_private r_;
4771
+
4772
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4773
+   r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0),
4774
+                  simde__m64_to_neon_i64(e1));
4775
+#else
4776
+   r_.m64[0] = e0;
4777
+   r_.m64[1] = e1;
4778
+#endif
4779
+
4780
+   return simde__m128i_from_private(r_);
4781
+#endif
4782
+}
4783
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4784
+#define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0)))
4785
+#endif
4786
+
4787
+SIMDE_FUNCTION_ATTRIBUTES
4788
+simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
4789
+{
4790
+#if defined(SIMDE_X86_SSE2_NATIVE) && \
4791
+   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
4792
+   return _mm_set_epi64x(e1, e0);
4793
+#else
4794
+   simde__m128i_private r_;
4795
+
4796
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4797
+   SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1};
4798
+   r_.neon_i64 = vld1q_s64(data);
4799
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4800
+   r_.wasm_v128 = wasm_i64x2_make(e0, e1);
4801
+#else
4802
+   r_.i64[0] = e0;
4803
+   r_.i64[1] = e1;
4804
+#endif
4805
+
4806
+   return simde__m128i_from_private(r_);
4807
+#endif
4808
+}
4809
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4810
+#define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0)
4811
+#endif
4812
+
4813
+SIMDE_FUNCTION_ATTRIBUTES
4814
+simde__m128i simde_mm_loadu_si64(void const *mem_addr)
4815
+{
4816
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
4817
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
4818
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
4819
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
4820
+   return _mm_loadu_si64(mem_addr);
4821
+#else
4822
+   int64_t val;
4823
+   simde_memcpy(&val, mem_addr, sizeof(val));
4824
+   return simde_mm_cvtsi64_si128(val);
4825
+#endif
4826
+}
4827
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4828
+#define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr)
4829
+#endif
4830
+
4831
+SIMDE_FUNCTION_ATTRIBUTES
4832
+simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
4833
+                uint8_t e12, uint8_t e11, uint8_t e10,
4834
+                uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
4835
+                uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
4836
+                uint8_t e1, uint8_t e0)
4837
+{
4838
+#if defined(SIMDE_X86_SSE2_NATIVE)
4839
+   return _mm_set_epi8(
4840
+       HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14),
4841
+       HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12),
4842
+       HEDLEY_STATIC_CAST(char, e11), HEDLEY_STATIC_CAST(char, e10),
4843
+       HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8),
4844
+       HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6),
4845
+       HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4),
4846
+       HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2),
4847
+       HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0));
4848
+#else
4849
+   simde__m128i_private r_;
4850
+
4851
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4852
+   SIMDE_ALIGN_LIKE_16(uint8x16_t)
4853
+   uint8_t data[16] = {e0, e1, e2,  e3,  e4,  e5,  e6,  e7,
4854
+               e8, e9, e10, e11, e12, e13, e14, e15};
4855
+   r_.neon_u8 = vld1q_u8(data);
4856
+#else
4857
+   r_.u8[0] = e0;
4858
+   r_.u8[1] = e1;
4859
+   r_.u8[2] = e2;
4860
+   r_.u8[3] = e3;
4861
+   r_.u8[4] = e4;
4862
+   r_.u8[5] = e5;
4863
+   r_.u8[6] = e6;
4864
+   r_.u8[7] = e7;
4865
+   r_.u8[8] = e8;
4866
+   r_.u8[9] = e9;
4867
+   r_.u8[10] = e10;
4868
+   r_.u8[11] = e11;
4869
+   r_.u8[12] = e12;
4870
+   r_.u8[13] = e13;
4871
+   r_.u8[14] = e14;
4872
+   r_.u8[15] = e15;
4873
+#endif
4874
+
4875
+   return simde__m128i_from_private(r_);
4876
+#endif
4877
+}
4878
+
4879
+SIMDE_FUNCTION_ATTRIBUTES
4880
+simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
4881
+                 uint16_t e4, uint16_t e3, uint16_t e2,
4882
+                 uint16_t e1, uint16_t e0)
4883
+{
4884
+#if defined(SIMDE_X86_SSE2_NATIVE)
4885
+   return _mm_set_epi16(
4886
+       HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6),
4887
+       HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4),
4888
+       HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2),
4889
+       HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0));
4890
+#else
4891
+   simde__m128i_private r_;
4892
+
4893
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4894
+   SIMDE_ALIGN_LIKE_16(uint16x8_t)
4895
+   uint16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
4896
+   r_.neon_u16 = vld1q_u16(data);
4897
+#else
4898
+   r_.u16[0] = e0;
4899
+   r_.u16[1] = e1;
4900
+   r_.u16[2] = e2;
4901
+   r_.u16[3] = e3;
4902
+   r_.u16[4] = e4;
4903
+   r_.u16[5] = e5;
4904
+   r_.u16[6] = e6;
4905
+   r_.u16[7] = e7;
4906
+#endif
4907
+
4908
+   return simde__m128i_from_private(r_);
4909
+#endif
4910
+}
4911
+
4912
+SIMDE_FUNCTION_ATTRIBUTES
4913
+simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
4914
+                 uint32_t e0)
4915
+{
4916
+#if defined(SIMDE_X86_SSE2_NATIVE)
4917
+   return _mm_set_epi32(HEDLEY_STATIC_CAST(int, e3),
4918
+                HEDLEY_STATIC_CAST(int, e2),
4919
+                HEDLEY_STATIC_CAST(int, e1),
4920
+                HEDLEY_STATIC_CAST(int, e0));
4921
+#else
4922
+   simde__m128i_private r_;
4923
+
4924
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4925
+   SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = {e0, e1, e2, e3};
4926
+   r_.neon_u32 = vld1q_u32(data);
4927
+#else
4928
+   r_.u32[0] = e0;
4929
+   r_.u32[1] = e1;
4930
+   r_.u32[2] = e2;
4931
+   r_.u32[3] = e3;
4932
+#endif
4933
+
4934
+   return simde__m128i_from_private(r_);
4935
+#endif
4936
+}
4937
+
4938
+SIMDE_FUNCTION_ATTRIBUTES
4939
+simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
4940
+{
4941
+#if defined(SIMDE_X86_SSE2_NATIVE) && \
4942
+   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
4943
+   return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1),
4944
+                 HEDLEY_STATIC_CAST(int64_t, e0));
4945
+#else
4946
+   simde__m128i_private r_;
4947
+
4948
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4949
+   SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1};
4950
+   r_.neon_u64 = vld1q_u64(data);
4951
+#else
4952
+   r_.u64[0] = e0;
4953
+   r_.u64[1] = e1;
4954
+#endif
4955
+
4956
+   return simde__m128i_from_private(r_);
4957
+#endif
4958
+}
4959
+
4960
+SIMDE_FUNCTION_ATTRIBUTES
4961
+simde__m128d simde_mm_set_sd(simde_float64 a)
4962
+{
4963
+#if defined(SIMDE_X86_SSE2_NATIVE)
4964
+   return _mm_set_sd(a);
4965
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
4966
+   return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0);
4967
+#else
4968
+   return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a);
4969
+#endif
4970
+}
4971
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
4972
+#define _mm_set_sd(a) simde_mm_set_sd(a)
4973
+#endif
4974
+
4975
+SIMDE_FUNCTION_ATTRIBUTES
4976
+simde__m128i simde_mm_set1_epi8(int8_t a)
4977
+{
4978
+#if defined(SIMDE_X86_SSE2_NATIVE)
4979
+   return _mm_set1_epi8(a);
4980
+#else
4981
+   simde__m128i_private r_;
4982
+
4983
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
4984
+   r_.neon_i8 = vdupq_n_s8(a);
4985
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
4986
+   r_.wasm_v128 = wasm_i8x16_splat(a);
4987
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
4988
+   r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a));
4989
+#else
4990
+   SIMDE_VECTORIZE
4991
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
4992
+       r_.i8[i] = a;
4993
+   }
4994
+#endif
4995
+
4996
+   return simde__m128i_from_private(r_);
4997
+#endif
4998
+}
4999
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5000
+#define _mm_set1_epi8(a) simde_mm_set1_epi8(a)
5001
+#endif
5002
+
5003
+SIMDE_FUNCTION_ATTRIBUTES
5004
+simde__m128i simde_mm_set1_epi16(int16_t a)
5005
+{
5006
+#if defined(SIMDE_X86_SSE2_NATIVE)
5007
+   return _mm_set1_epi16(a);
5008
+#else
5009
+   simde__m128i_private r_;
5010
+
5011
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5012
+   r_.neon_i16 = vdupq_n_s16(a);
5013
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5014
+   r_.wasm_v128 = wasm_i16x8_splat(a);
5015
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5016
+   r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a));
5017
+#else
5018
+   SIMDE_VECTORIZE
5019
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5020
+       r_.i16[i] = a;
5021
+   }
5022
+#endif
5023
+
5024
+   return simde__m128i_from_private(r_);
5025
+#endif
5026
+}
5027
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5028
+#define _mm_set1_epi16(a) simde_mm_set1_epi16(a)
5029
+#endif
5030
+
5031
+SIMDE_FUNCTION_ATTRIBUTES
5032
+simde__m128i simde_mm_set1_epi32(int32_t a)
5033
+{
5034
+#if defined(SIMDE_X86_SSE2_NATIVE)
5035
+   return _mm_set1_epi32(a);
5036
+#else
5037
+   simde__m128i_private r_;
5038
+
5039
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5040
+   r_.neon_i32 = vdupq_n_s32(a);
5041
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5042
+   r_.wasm_v128 = wasm_i32x4_splat(a);
5043
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5044
+   r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a));
5045
+#else
5046
+   SIMDE_VECTORIZE
5047
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
5048
+       r_.i32[i] = a;
5049
+   }
5050
+#endif
5051
+
5052
+   return simde__m128i_from_private(r_);
5053
+#endif
5054
+}
5055
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5056
+#define _mm_set1_epi32(a) simde_mm_set1_epi32(a)
5057
+#endif
5058
+
5059
+SIMDE_FUNCTION_ATTRIBUTES
5060
+simde__m128i simde_mm_set1_epi64x(int64_t a)
5061
+{
5062
+#if defined(SIMDE_X86_SSE2_NATIVE) && \
5063
+   (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19, 0, 0))
5064
+   return _mm_set1_epi64x(a);
5065
+#else
5066
+   simde__m128i_private r_;
5067
+
5068
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5069
+   r_.neon_i64 = vdupq_n_s64(a);
5070
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5071
+   r_.wasm_v128 = wasm_i64x2_splat(a);
5072
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5073
+   r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a));
5074
+#else
5075
+   SIMDE_VECTORIZE
5076
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
5077
+       r_.i64[i] = a;
5078
+   }
5079
+#endif
5080
+
5081
+   return simde__m128i_from_private(r_);
5082
+#endif
5083
+}
5084
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5085
+#define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a)
5086
+#endif
5087
+
5088
+SIMDE_FUNCTION_ATTRIBUTES
5089
+simde__m128i simde_mm_set1_epi64(simde__m64 a)
5090
+{
5091
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5092
+   return _mm_set1_epi64(a);
5093
+#else
5094
+   simde__m64_private a_ = simde__m64_to_private(a);
5095
+   return simde_mm_set1_epi64x(a_.i64[0]);
5096
+#endif
5097
+}
5098
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5099
+#define _mm_set1_epi64(a) simde_mm_set1_epi64(a)
5100
+#endif
5101
+
5102
+SIMDE_FUNCTION_ATTRIBUTES
5103
+simde__m128i simde_x_mm_set1_epu8(uint8_t value)
5104
+{
5105
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5106
+   return simde__m128i_from_altivec_u8(
5107
+       vec_splats(HEDLEY_STATIC_CAST(unsigned char, value)));
5108
+#else
5109
+   return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value));
5110
+#endif
5111
+}
5112
+
5113
+SIMDE_FUNCTION_ATTRIBUTES
5114
+simde__m128i simde_x_mm_set1_epu16(uint16_t value)
5115
+{
5116
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5117
+   return simde__m128i_from_altivec_u16(
5118
+       vec_splats(HEDLEY_STATIC_CAST(unsigned short, value)));
5119
+#else
5120
+   return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value));
5121
+#endif
5122
+}
5123
+
5124
+SIMDE_FUNCTION_ATTRIBUTES
5125
+simde__m128i simde_x_mm_set1_epu32(uint32_t value)
5126
+{
5127
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5128
+   return simde__m128i_from_altivec_u32(
5129
+       vec_splats(HEDLEY_STATIC_CAST(unsigned int, value)));
5130
+#else
5131
+   return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value));
5132
+#endif
5133
+}
5134
+
5135
+SIMDE_FUNCTION_ATTRIBUTES
5136
+simde__m128i simde_x_mm_set1_epu64(uint64_t value)
5137
+{
5138
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5139
+   return simde__m128i_from_altivec_u64(
5140
+       vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value)));
5141
+#else
5142
+   return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value));
5143
+#endif
5144
+}
5145
+
5146
+SIMDE_FUNCTION_ATTRIBUTES
5147
+simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
5148
+               int8_t e11, int8_t e10, int8_t e9, int8_t e8,
5149
+               int8_t e7, int8_t e6, int8_t e5, int8_t e4,
5150
+               int8_t e3, int8_t e2, int8_t e1, int8_t e0)
5151
+{
5152
+#if defined(SIMDE_X86_SSE2_NATIVE)
5153
+   return _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
5154
+                e4, e3, e2, e1, e0);
5155
+#else
5156
+   return simde_mm_set_epi8(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10,
5157
+                e11, e12, e13, e14, e15);
5158
+#endif
5159
+}
5160
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5161
+#define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,  \
5162
+             e3, e2, e1, e0)                                        \
5163
+   simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, \
5164
+              e4, e3, e2, e1, e0)
5165
+#endif
5166
+
5167
+SIMDE_FUNCTION_ATTRIBUTES
5168
+simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
5169
+                int16_t e3, int16_t e2, int16_t e1, int16_t e0)
5170
+{
5171
+#if defined(SIMDE_X86_SSE2_NATIVE)
5172
+   return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
5173
+#else
5174
+   return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7);
5175
+#endif
5176
+}
5177
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5178
+#define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) \
5179
+   simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0)
5180
+#endif
5181
+
5182
+SIMDE_FUNCTION_ATTRIBUTES
5183
+simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
5184
+{
5185
+#if defined(SIMDE_X86_SSE2_NATIVE)
5186
+   return _mm_setr_epi32(e3, e2, e1, e0);
5187
+#else
5188
+   return simde_mm_set_epi32(e0, e1, e2, e3);
5189
+#endif
5190
+}
5191
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5192
+#define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0)
5193
+#endif
5194
+
5195
+SIMDE_FUNCTION_ATTRIBUTES
5196
+simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
5197
+{
5198
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
5199
+   return _mm_setr_epi64(e1, e0);
5200
+#else
5201
+   return simde_mm_set_epi64(e0, e1);
5202
+#endif
5203
+}
5204
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5205
+#define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0)))
5206
+#endif
5207
+
5208
+SIMDE_FUNCTION_ATTRIBUTES
5209
+simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
5210
+{
5211
+#if defined(SIMDE_X86_SSE2_NATIVE)
5212
+   return _mm_setr_pd(e1, e0);
5213
+#else
5214
+   return simde_mm_set_pd(e0, e1);
5215
+#endif
5216
+}
5217
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5218
+#define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0)
5219
+#endif
5220
+
5221
+SIMDE_FUNCTION_ATTRIBUTES
5222
+simde__m128d simde_mm_setzero_pd(void)
5223
+{
5224
+#if defined(SIMDE_X86_SSE2_NATIVE)
5225
+   return _mm_setzero_pd();
5226
+#else
5227
+   return simde_mm_castsi128_pd(simde_mm_setzero_si128());
5228
+#endif
5229
+}
5230
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5231
+#define _mm_setzero_pd() simde_mm_setzero_pd()
5232
+#endif
5233
+
5234
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5235
+HEDLEY_DIAGNOSTIC_PUSH
5236
+SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
5237
+#endif
5238
+
5239
+SIMDE_FUNCTION_ATTRIBUTES
5240
+simde__m128d simde_mm_undefined_pd(void)
5241
+{
5242
+   simde__m128d_private r_;
5243
+
5244
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5245
+   r_.n = _mm_undefined_pd();
5246
+#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5247
+   r_ = simde__m128d_to_private(simde_mm_setzero_pd());
5248
+#endif
5249
+
5250
+   return simde__m128d_from_private(r_);
5251
+}
5252
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5253
+#define _mm_undefined_pd() simde_mm_undefined_pd()
5254
+#endif
5255
+
5256
+SIMDE_FUNCTION_ATTRIBUTES
5257
+simde__m128i simde_mm_undefined_si128(void)
5258
+{
5259
+   simde__m128i_private r_;
5260
+
5261
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
5262
+   r_.n = _mm_undefined_si128();
5263
+#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5264
+   r_ = simde__m128i_to_private(simde_mm_setzero_si128());
5265
+#endif
5266
+
5267
+   return simde__m128i_from_private(r_);
5268
+}
5269
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5270
+#define _mm_undefined_si128() (simde_mm_undefined_si128())
5271
+#endif
5272
+
5273
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
5274
+HEDLEY_DIAGNOSTIC_POP
5275
+#endif
5276
+
5277
+SIMDE_FUNCTION_ATTRIBUTES
5278
+simde__m128d simde_x_mm_setone_pd(void)
5279
+{
5280
+   return simde_mm_castps_pd(simde_x_mm_setone_ps());
5281
+}
5282
+
5283
+SIMDE_FUNCTION_ATTRIBUTES
5284
+simde__m128i simde_x_mm_setone_si128(void)
5285
+{
5286
+   return simde_mm_castps_si128(simde_x_mm_setone_ps());
5287
+}
5288
+
5289
+SIMDE_FUNCTION_ATTRIBUTES
5290
+simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
5291
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5292
+{
5293
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5294
+
5295
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
5296
+       r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3];
5297
+   }
5298
+
5299
+   return simde__m128i_from_private(r_);
5300
+}
5301
+#if defined(SIMDE_X86_SSE2_NATIVE)
5302
+#define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8))
5303
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5304
+#define simde_mm_shuffle_epi32(a, imm8)                                       \
5305
+   __extension__({                                                       \
5306
+       int32x4_t ret;                                                \
5307
+       ret = vmovq_n_s32(vgetq_lane_s32(vreinterpretq_s32_s64(a),    \
5308
+                        (imm8) & (0x3)));            \
5309
+       ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_s64(a), \
5310
+                           ((imm8) >> 2) & 0x3),     \
5311
+                    ret, 1);                                 \
5312
+       ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_s64(a), \
5313
+                           ((imm8) >> 4) & 0x3),     \
5314
+                    ret, 2);                                 \
5315
+       ret = vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_s64(a), \
5316
+                           ((imm8) >> 6) & 0x3),     \
5317
+                    ret, 3);                                 \
5318
+       vreinterpretq_s64_s32(ret);                                   \
5319
+   })
5320
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
5321
+#define simde_mm_shuffle_epi32(a, imm8)                               \
5322
+   (__extension__({                                              \
5323
+       const simde__m128i_private simde__tmp_a_ =            \
5324
+           simde__m128i_to_private(a);                   \
5325
+       simde__m128i_from_private((simde__m128i_private){     \
5326
+           .i32 = SIMDE_SHUFFLE_VECTOR_(                 \
5327
+               32, 16, (simde__tmp_a_).i32,          \
5328
+               (simde__tmp_a_).i32, ((imm8)) & 3,    \
5329
+               ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
5330
+               ((imm8) >> 6) & 3)});                 \
5331
+   }))
5332
+#endif
5333
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5334
+#define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8)
5335
+#endif
5336
+
5337
+SIMDE_FUNCTION_ATTRIBUTES
5338
+simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
5339
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3)
5340
+{
5341
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
5342
+                b_ = simde__m128d_to_private(b);
5343
+
5344
+   r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1];
5345
+   r_.f64[1] = ((imm8 & 2) == 0) ? b_.f64[0] : b_.f64[1];
5346
+
5347
+   return simde__m128d_from_private(r_);
5348
+}
5349
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI)
5350
+#define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8))
5351
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
5352
+#define simde_mm_shuffle_pd(a, b, imm8)                                     \
5353
+   (__extension__({                                                    \
5354
+       simde__m128d_from_private((simde__m128d_private){           \
5355
+           .f64 = SIMDE_SHUFFLE_VECTOR_(                       \
5356
+               64, 16, simde__m128d_to_private(a).f64,     \
5357
+               simde__m128d_to_private(b).f64,             \
5358
+               (((imm8)) & 1), (((imm8) >> 1) & 1) + 2)}); \
5359
+   }))
5360
+#endif
5361
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5362
+#define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8)
5363
+#endif
5364
+
5365
+SIMDE_FUNCTION_ATTRIBUTES
5366
+simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
5367
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5368
+{
5369
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5370
+
5371
+   SIMDE_VECTORIZE
5372
+   for (size_t i = 0; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
5373
+        i++) {
5374
+       r_.i16[i] = a_.i16[i];
5375
+   }
5376
+   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
5377
+        i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5378
+       r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
5379
+   }
5380
+
5381
+   return simde__m128i_from_private(r_);
5382
+}
5383
+#if defined(SIMDE_X86_SSE2_NATIVE)
5384
+#define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8))
5385
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5386
+#define simde_mm_shufflehi_epi16(a, imm8)                                      \
5387
+   __extension__({                                                        \
5388
+       int16x8_t ret = vreinterpretq_s16_s64(a);                      \
5389
+       int16x4_t highBits = vget_high_s16(ret);                       \
5390
+       ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm8) & (0x3)),  \
5391
+                    ret, 4);                                  \
5392
+       ret = vsetq_lane_s16(                                          \
5393
+           vget_lane_s16(highBits, ((imm8) >> 2) & 0x3), ret, 5); \
5394
+       ret = vsetq_lane_s16(                                          \
5395
+           vget_lane_s16(highBits, ((imm8) >> 4) & 0x3), ret, 6); \
5396
+       ret = vsetq_lane_s16(                                          \
5397
+           vget_lane_s16(highBits, ((imm8) >> 6) & 0x3), ret, 7); \
5398
+       vreinterpretq_s64_s16(ret);                                    \
5399
+   })
5400
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
5401
+#define simde_mm_shufflehi_epi16(a, imm8)                                    \
5402
+   (__extension__({                                                     \
5403
+       const simde__m128i_private simde__tmp_a_ =                   \
5404
+           simde__m128i_to_private(a);                          \
5405
+       simde__m128i_from_private((simde__m128i_private){            \
5406
+           .i16 = SIMDE_SHUFFLE_VECTOR_(                        \
5407
+               16, 16, (simde__tmp_a_).i16,                 \
5408
+               (simde__tmp_a_).i16, 0, 1, 2, 3,             \
5409
+               (((imm8)) & 3) + 4, (((imm8) >> 2) & 3) + 4, \
5410
+               (((imm8) >> 4) & 3) + 4,                     \
5411
+               (((imm8) >> 6) & 3) + 4)});                  \
5412
+   }))
5413
+#endif
5414
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5415
+#define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8)
5416
+#endif
5417
+
5418
+SIMDE_FUNCTION_ATTRIBUTES
5419
+simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
5420
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5421
+{
5422
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5423
+
5424
+   for (size_t i = 0; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2);
5425
+        i++) {
5426
+       r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)];
5427
+   }
5428
+   SIMDE_VECTORIZE
5429
+   for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2);
5430
+        i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5431
+       r_.i16[i] = a_.i16[i];
5432
+   }
5433
+
5434
+   return simde__m128i_from_private(r_);
5435
+}
5436
+#if defined(SIMDE_X86_SSE2_NATIVE)
5437
+#define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8))
5438
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5439
+#define simde_mm_shufflelo_epi16(a, imm8)                                     \
5440
+   __extension__({                                                       \
5441
+       int16x8_t ret = vreinterpretq_s16_s64(a);                     \
5442
+       int16x4_t lowBits = vget_low_s16(ret);                        \
5443
+       ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm8) & (0x3)),  \
5444
+                    ret, 0);                                 \
5445
+       ret = vsetq_lane_s16(                                         \
5446
+           vget_lane_s16(lowBits, ((imm8) >> 2) & 0x3), ret, 1); \
5447
+       ret = vsetq_lane_s16(                                         \
5448
+           vget_lane_s16(lowBits, ((imm8) >> 4) & 0x3), ret, 2); \
5449
+       ret = vsetq_lane_s16(                                         \
5450
+           vget_lane_s16(lowBits, ((imm8) >> 6) & 0x3), ret, 3); \
5451
+       vreinterpretq_s64_s16(ret);                                   \
5452
+   })
5453
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
5454
+#define simde_mm_shufflelo_epi16(a, imm8)                                 \
5455
+   (__extension__({                                                  \
5456
+       const simde__m128i_private simde__tmp_a_ =                \
5457
+           simde__m128i_to_private(a);                       \
5458
+       simde__m128i_from_private((simde__m128i_private){         \
5459
+           .i16 = SIMDE_SHUFFLE_VECTOR_(                     \
5460
+               16, 16, (simde__tmp_a_).i16,              \
5461
+               (simde__tmp_a_).i16, (((imm8)) & 3),      \
5462
+               (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
5463
+               (((imm8) >> 6) & 3), 4, 5, 6, 7)});       \
5464
+   }))
5465
+#endif
5466
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5467
+#define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8)
5468
+#endif
5469
+
5470
+SIMDE_FUNCTION_ATTRIBUTES
5471
+simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
5472
+{
5473
+#if defined(SIMDE_X86_SSE2_NATIVE)
5474
+   return _mm_sll_epi16(a, count);
5475
+#else
5476
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5477
+                count_ = simde__m128i_to_private(count);
5478
+
5479
+   if (count_.u64[0] > 15)
5480
+       return simde_mm_setzero_si128();
5481
+
5482
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5483
+   r_.u16 = (a_.u16 << count_.u64[0]);
5484
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5485
+   r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(
5486
+                            int16_t, count_.u64[0])));
5487
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5488
+   r_.wasm_v128 =
5489
+       ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16)
5490
+            ? wasm_i16x8_shl(a_.wasm_v128,
5491
+                     HEDLEY_STATIC_CAST(
5492
+                         int32_t,
5493
+                         wasm_i64x2_extract_lane(
5494
+                             count_.wasm_v128, 0)))
5495
+            : wasm_i16x8_const(0, 0, 0, 0, 0, 0, 0, 0));
5496
+#else
5497
+   SIMDE_VECTORIZE
5498
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
5499
+       r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t,
5500
+                          (a_.u16[i] << count_.u64[0]));
5501
+   }
5502
+#endif
5503
+
5504
+   return simde__m128i_from_private(r_);
5505
+#endif
5506
+}
5507
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5508
+#define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count))
5509
+#endif
5510
+
5511
+SIMDE_FUNCTION_ATTRIBUTES
5512
+simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
5513
+{
5514
+#if defined(SIMDE_X86_SSE2_NATIVE)
5515
+   return _mm_sll_epi32(a, count);
5516
+#else
5517
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5518
+                count_ = simde__m128i_to_private(count);
5519
+
5520
+   if (count_.u64[0] > 31)
5521
+       return simde_mm_setzero_si128();
5522
+
5523
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5524
+   r_.u32 = (a_.u32 << count_.u64[0]);
5525
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5526
+   r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(
5527
+                            int32_t, count_.u64[0])));
5528
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5529
+   r_.wasm_v128 =
5530
+       ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32)
5531
+            ? wasm_i32x4_shl(a_.wasm_v128,
5532
+                     HEDLEY_STATIC_CAST(
5533
+                         int32_t,
5534
+                         wasm_i64x2_extract_lane(
5535
+                             count_.wasm_v128, 0)))
5536
+            : wasm_i32x4_const(0, 0, 0, 0));
5537
+#else
5538
+   SIMDE_VECTORIZE
5539
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
5540
+       r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t,
5541
+                          (a_.u32[i] << count_.u64[0]));
5542
+   }
5543
+#endif
5544
+
5545
+   return simde__m128i_from_private(r_);
5546
+#endif
5547
+}
5548
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5549
+#define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count)))
5550
+#endif
5551
+
5552
+SIMDE_FUNCTION_ATTRIBUTES
5553
+simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
5554
+{
5555
+#if defined(SIMDE_X86_SSE2_NATIVE)
5556
+   return _mm_sll_epi64(a, count);
5557
+#else
5558
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5559
+                count_ = simde__m128i_to_private(count);
5560
+
5561
+   if (count_.u64[0] > 63)
5562
+       return simde_mm_setzero_si128();
5563
+
5564
+   const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]);
5565
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5566
+   r_.neon_u64 = vshlq_u64(a_.neon_u64,
5567
+               vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s)));
5568
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5569
+   r_.wasm_v128 = (s < 64) ? wasm_i64x2_shl(a_.wasm_v128, s)
5570
+               : wasm_i64x2_const(0, 0);
5571
+#else
5572
+#if !defined(SIMDE_BUG_GCC_94488)
5573
+   SIMDE_VECTORIZE
5574
+#endif
5575
+   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
5576
+       r_.u64[i] = a_.u64[i] << s;
5577
+   }
5578
+#endif
5579
+
5580
+   return simde__m128i_from_private(r_);
5581
+#endif
5582
+}
5583
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5584
+#define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count)))
5585
+#endif
5586
+
5587
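/*
 * Illustrative sketch of the shift-by-register helpers above
 * (simde_mm_sll_epi16/32/64): the shift amount is read from the low 64 bits
 * of `count`, and any amount at or beyond the lane width yields an all-zero
 * result, matching the native _mm_sll_* intrinsics.
 *
 *     simde__m128i ones = simde_mm_set1_epi16(1);
 *     simde__m128i by3  = simde_mm_cvtsi32_si128(3);
 *     simde__m128i r1   = simde_mm_sll_epi16(ones, by3);                        // every lane == 8
 *     simde__m128i r2   = simde_mm_sll_epi16(ones, simde_mm_cvtsi32_si128(16)); // count > 15 -> all zero
 */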
+SIMDE_FUNCTION_ATTRIBUTES
5588
+simde__m128d simde_mm_sqrt_pd(simde__m128d a)
5589
+{
5590
+#if defined(SIMDE_X86_SSE2_NATIVE)
5591
+   return _mm_sqrt_pd(a);
5592
+#else
5593
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
5594
+
5595
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
5596
+   r_.neon_f64 = vsqrtq_f64(a_.neon_f64);
5597
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5598
+   r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128);
5599
+#elif defined(simde_math_sqrt)
5600
+   SIMDE_VECTORIZE
5601
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
5602
+       r_.f64[i] = simde_math_sqrt(a_.f64[i]);
5603
+   }
5604
+#else
5605
+   HEDLEY_UNREACHABLE();
5606
+#endif
5607
+
5608
+   return simde__m128d_from_private(r_);
5609
+#endif
5610
+}
5611
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5612
+#define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a)
5613
+#endif
5614
+
5615
+SIMDE_FUNCTION_ATTRIBUTES
5616
+simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
5617
+{
5618
+#if defined(SIMDE_X86_SSE2_NATIVE)
5619
+   return _mm_sqrt_sd(a, b);
5620
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
5621
+   return simde_mm_move_sd(a, simde_mm_sqrt_pd(b));
5622
+#else
5623
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
5624
+                b_ = simde__m128d_to_private(b);
5625
+
5626
+#if defined(simde_math_sqrt)
5627
+   r_.f64[0] = simde_math_sqrt(b_.f64[0]);
5628
+   r_.f64[1] = a_.f64[1];
5629
+#else
5630
+   HEDLEY_UNREACHABLE();
5631
+#endif
5632
+
5633
+   return simde__m128d_from_private(r_);
5634
+#endif
5635
+}
5636
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5637
+#define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b)
5638
+#endif
5639
+
5640
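/*
 * Illustrative sketch: as implemented above, simde_mm_sqrt_sd(a, b) takes the
 * square root of the low lane of `b` and carries the high lane of `a` through,
 * the same contract as the native _mm_sqrt_sd.
 *
 *     simde__m128d a = simde_mm_set_pd(9.0, 1.0);   // { hi = 9.0, lo = 1.0 }
 *     simde__m128d b = simde_mm_set_pd(2.0, 16.0);  // { hi = 2.0, lo = 16.0 }
 *     simde__m128d r = simde_mm_sqrt_sd(a, b);      // { hi = 9.0, lo = 4.0 }
 */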
+SIMDE_FUNCTION_ATTRIBUTES
5641
+simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
5642
+{
5643
+#if defined(SIMDE_X86_SSE2_NATIVE)
5644
+   return _mm_srl_epi16(a, count);
5645
+#else
5646
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5647
+                count_ = simde__m128i_to_private(count);
5648
+
5649
+   const int cnt = HEDLEY_STATIC_CAST(
5650
+       int, (count_.i64[0] > 16 ? 16 : count_.i64[0]));
5651
+
5652
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5653
+   r_.neon_u16 = vshlq_u16(a_.neon_u16,
5654
+               vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5655
+#else
5656
+   SIMDE_VECTORIZE
5657
+   for (size_t i = 0; i < (sizeof(r_.u16) / sizeof(r_.u16[0])); i++) {
5658
+       r_.u16[i] = a_.u16[i] >> cnt;
5659
+   }
5660
+#endif
5661
+
5662
+   return simde__m128i_from_private(r_);
5663
+#endif
5664
+}
5665
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5666
+#define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count)))
5667
+#endif
5668
+
5669
+SIMDE_FUNCTION_ATTRIBUTES
5670
+simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
5671
+{
5672
+#if defined(SIMDE_X86_SSE2_NATIVE)
5673
+   return _mm_srl_epi32(a, count);
5674
+#else
5675
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5676
+                count_ = simde__m128i_to_private(count);
5677
+
5678
+   const int cnt = HEDLEY_STATIC_CAST(
5679
+       int, (count_.i64[0] > 32 ? 32 : count_.i64[0]));
5680
+
5681
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5682
+   r_.neon_u32 = vshlq_u32(a_.neon_u32,
5683
+               vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5684
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5685
+   r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, cnt);
5686
+#else
5687
+   SIMDE_VECTORIZE
5688
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
5689
+       r_.u32[i] = a_.u32[i] >> cnt;
5690
+   }
5691
+#endif
5692
+
5693
+   return simde__m128i_from_private(r_);
5694
+#endif
5695
+}
5696
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5697
+#define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count)))
5698
+#endif
5699
+
5700
+SIMDE_FUNCTION_ATTRIBUTES
5701
+simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
5702
+{
5703
+#if defined(SIMDE_X86_SSE2_NATIVE)
5704
+   return _mm_srl_epi64(a, count);
5705
+#else
5706
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5707
+                count_ = simde__m128i_to_private(count);
5708
+
5709
+   const int cnt = HEDLEY_STATIC_CAST(
5710
+       int, (count_.i64[0] > 64 ? 64 : count_.i64[0]));
5711
+
5712
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5713
+   r_.neon_u64 = vshlq_u64(a_.neon_u64,
5714
+               vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt)));
5715
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5716
+   r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, cnt);
5717
+#else
5718
+#if !defined(SIMDE_BUG_GCC_94488)
5719
+   SIMDE_VECTORIZE
5720
+#endif
5721
+   for (size_t i = 0; i < (sizeof(r_.u64) / sizeof(r_.u64[0])); i++) {
5722
+       r_.u64[i] = a_.u64[i] >> cnt;
5723
+   }
5724
+#endif
5725
+
5726
+   return simde__m128i_from_private(r_);
5727
+#endif
5728
+}
5729
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5730
+#define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count)))
5731
+#endif
5732
+
5733
+SIMDE_FUNCTION_ATTRIBUTES
5734
+simde__m128i simde_mm_srai_epi16(simde__m128i a, const int imm8)
5735
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5736
+{
5737
+   /* MSVC requires a range of (0, 255). */
5738
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5739
+
5740
+   const int cnt = (imm8 & ~15) ? 15 : imm8;
5741
+
5742
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5743
+   r_.neon_i16 = vshlq_s16(a_.neon_i16,
5744
+               vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5745
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5746
+   r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5747
+#else
5748
+   SIMDE_VECTORIZE
5749
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
5750
+       r_.i16[i] = a_.i16[i] >> cnt;
5751
+   }
5752
+#endif
5753
+
5754
+   return simde__m128i_from_private(r_);
5755
+}
5756
+#if defined(SIMDE_X86_SSE2_NATIVE)
5757
+#define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8))
5758
+#endif
5759
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5760
+#define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8)
5761
+#endif
5762
+
5763
+SIMDE_FUNCTION_ATTRIBUTES
5764
+simde__m128i simde_mm_srai_epi32(simde__m128i a, const int imm8)
5765
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5766
+{
5767
+   /* MSVC requires a range of (0, 255). */
5768
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5769
+
5770
+   const int cnt = (imm8 & ~31) ? 31 : imm8;
5771
+
5772
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5773
+   r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt));
5774
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5775
+   r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5776
+#else
5777
+   SIMDE_VECTORIZE
5778
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i32[0])); i++) {
5779
+       r_.i32[i] = a_.i32[i] >> cnt;
5780
+   }
5781
+#endif
5782
+
5783
+   return simde__m128i_from_private(r_);
5784
+}
5785
+#if defined(SIMDE_X86_SSE2_NATIVE)
5786
+#define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8))
5787
+#endif
5788
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5789
+#define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8)
5790
+#endif
5791
+
5792
+SIMDE_FUNCTION_ATTRIBUTES
5793
+simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
5794
+{
5795
+#if defined(SIMDE_X86_SSE2_NATIVE)
5796
+   return _mm_sra_epi16(a, count);
5797
+#else
5798
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5799
+                count_ = simde__m128i_to_private(count);
5800
+
5801
+   const int cnt = HEDLEY_STATIC_CAST(
5802
+       int, (count_.i64[0] > 15 ? 15 : count_.i64[0]));
5803
+
5804
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5805
+   r_.neon_i16 = vshlq_s16(a_.neon_i16,
5806
+               vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt)));
5807
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5808
+   r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, cnt);
5809
+#else
5810
+   SIMDE_VECTORIZE
5811
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5812
+       r_.i16[i] = a_.i16[i] >> cnt;
5813
+   }
5814
+#endif
5815
+
5816
+   return simde__m128i_from_private(r_);
5817
+#endif
5818
+}
5819
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5820
+#define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count))
5821
+#endif
5822
+
5823
+SIMDE_FUNCTION_ATTRIBUTES
5824
+simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
5825
+{
5826
+#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
5827
+   return _mm_sra_epi32(a, count);
5828
+#else
5829
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
5830
+                count_ = simde__m128i_to_private(count);
5831
+
5832
+   const int cnt = count_.u64[0] > 31
5833
+               ? 31
5834
+               : HEDLEY_STATIC_CAST(int, count_.u64[0]);
5835
+
5836
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5837
+   r_.neon_i32 = vshlq_s32(a_.neon_i32,
5838
+               vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt)));
5839
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5840
+   r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, cnt);
5841
+#else
5842
+   SIMDE_VECTORIZE
5843
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
5844
+       r_.i32[i] = a_.i32[i] >> cnt;
5845
+   }
5846
+#endif
5847
+
5848
+   return simde__m128i_from_private(r_);
5849
+#endif
5850
+}
5851
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5852
+#define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count)))
5853
+#endif
5854
+
5855
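/*
 * Illustrative sketch of the arithmetic shifts above (simde_mm_srai_epi16/32
 * and simde_mm_sra_epi16/32): over-large counts are clamped to
 * (lane width - 1) rather than zeroing, so each lane fills with its sign bit,
 * as the native _mm_srai_* / _mm_sra_* intrinsics do.
 *
 *     simde__m128i v = simde_mm_set1_epi32(-8);
 *     simde__m128i r = simde_mm_srai_epi32(v, 40);   // clamped to 31 -> every lane == -1
 */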
+SIMDE_FUNCTION_ATTRIBUTES
5856
+simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
5857
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5858
+{
5859
+   if (HEDLEY_UNLIKELY((imm8 > 15))) {
5860
+       return simde_mm_setzero_si128();
5861
+   }
5862
+
5863
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5864
+
5865
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5866
+   r_.i16 = a_.i16 << (imm8 & 0xff);
5867
+#else
5868
+   const int s =
5869
+       (imm8 >
5870
+        HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1)
5871
+           ? 0
5872
+           : imm8;
5873
+   SIMDE_VECTORIZE
5874
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
5875
+       r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s);
5876
+   }
5877
+#endif
5878
+
5879
+   return simde__m128i_from_private(r_);
5880
+}
5881
+#if defined(SIMDE_X86_SSE2_NATIVE)
5882
+#define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8)
5883
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5884
+#define simde_mm_slli_epi16(a, imm8)                                        \
5885
+   (__extension__({                                                    \
5886
+       simde__m128i ret;                                           \
5887
+       if ((imm8) <= 0) {                                          \
5888
+           ret = a;                                            \
5889
+       } else if ((imm8) > 15) {                                   \
5890
+           ret = simde_mm_setzero_si128();                     \
5891
+       } else {                                                    \
5892
+           ret = simde__m128i_from_neon_i16(vshlq_n_s16(       \
5893
+               simde__m128i_to_neon_i16(a), ((imm8)&15))); \
5894
+       }                                                           \
5895
+       ret;                                                        \
5896
+   }))
5897
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5898
+#define simde_mm_slli_epi16(a, imm8)                                          \
5899
+   ((imm8 < 16)                                                          \
5900
+        ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) \
5901
+        : wasm_i16x8_const(0, 0, 0, 0, 0, 0, 0, 0))
5902
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5903
+#define simde_mm_slli_epi16(a, imm8)                                     \
5904
+   ((imm8 & ~15) ? simde_mm_setzero_si128()                         \
5905
+             : simde__m128i_from_altivec_i16(                   \
5906
+               vec_sl(simde__m128i_to_altivec_i16(a),   \
5907
+                      vec_splat_u16(HEDLEY_STATIC_CAST( \
5908
+                          unsigned short, imm8)))))
5909
+#endif
5910
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5911
+#define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8)
5912
+#endif
5913
+
5914
+SIMDE_FUNCTION_ATTRIBUTES
5915
+simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
5916
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5917
+{
5918
+   if (HEDLEY_UNLIKELY((imm8 > 31))) {
5919
+       return simde_mm_setzero_si128();
5920
+   }
5921
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5922
+
5923
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5924
+   r_.i32 = a_.i32 << imm8;
5925
+#else
5926
+   SIMDE_VECTORIZE
5927
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
5928
+       r_.i32[i] = a_.i32[i] << (imm8 & 0xff);
5929
+   }
5930
+#endif
5931
+
5932
+   return simde__m128i_from_private(r_);
5933
+}
5934
+#if defined(SIMDE_X86_SSE2_NATIVE)
5935
+#define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8)
5936
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5937
+#define simde_mm_slli_epi32(a, imm8)                                        \
5938
+   (__extension__({                                                    \
5939
+       simde__m128i ret;                                           \
5940
+       if ((imm8) <= 0) {                                          \
5941
+           ret = a;                                            \
5942
+       } else if ((imm8) > 31) {                                   \
5943
+           ret = simde_mm_setzero_si128();                     \
5944
+       } else {                                                    \
5945
+           ret = simde__m128i_from_neon_i32(vshlq_n_s32(       \
5946
+               simde__m128i_to_neon_i32(a), ((imm8)&31))); \
5947
+       }                                                           \
5948
+       ret;                                                        \
5949
+   }))
5950
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
5951
+#define simde_mm_slli_epi32(a, imm8)                                          \
5952
+   ((imm8 < 32)                                                          \
5953
+        ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) \
5954
+        : wasm_i32x4_const(0, 0, 0, 0))
5955
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
5956
+#define simde_mm_slli_epi32(a, imm8)                                        \
5957
+   (__extension__({                                                    \
5958
+       simde__m128i ret;                                           \
5959
+       if ((imm8) <= 0) {                                          \
5960
+           ret = a;                                            \
5961
+       } else if ((imm8) > 31) {                                   \
5962
+           ret = simde_mm_setzero_si128();                     \
5963
+       } else {                                                    \
5964
+           ret = simde__m128i_from_altivec_i32(                \
5965
+               vec_sl(simde__m128i_to_altivec_i32(a),      \
5966
+                      vec_splats(HEDLEY_STATIC_CAST(       \
5967
+                          unsigned int, (imm8)&31)))); \
5968
+       }                                                           \
5969
+       ret;                                                        \
5970
+   }))
5971
+#endif
5972
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
5973
+#define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8)
5974
+#endif
5975
+
5976
+SIMDE_FUNCTION_ATTRIBUTES
5977
+simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
5978
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
5979
+{
5980
+   if (HEDLEY_UNLIKELY((imm8 > 63))) {
5981
+       return simde_mm_setzero_si128();
5982
+   }
5983
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
5984
+
5985
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
5986
+   r_.i64 = a_.i64 << imm8;
5987
+#else
5988
+   SIMDE_VECTORIZE
5989
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
5990
+       r_.i64[i] = a_.i64[i] << (imm8 & 0xff);
5991
+   }
5992
+#endif
5993
+
5994
+   return simde__m128i_from_private(r_);
5995
+}
5996
+#if defined(SIMDE_X86_SSE2_NATIVE)
5997
+#define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8)
5998
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
5999
+#define simde_mm_slli_epi64(a, imm8)                                        \
6000
+   (__extension__({                                                    \
6001
+       simde__m128i ret;                                           \
6002
+       if ((imm8) <= 0) {                                          \
6003
+           ret = a;                                            \
6004
+       } else if ((imm8) > 63) {                                   \
6005
+           ret = simde_mm_setzero_si128();                     \
6006
+       } else {                                                    \
6007
+           ret = simde__m128i_from_neon_i64(vshlq_n_s64(       \
6008
+               simde__m128i_to_neon_i64(a), ((imm8)&63))); \
6009
+       }                                                           \
6010
+       ret;                                                        \
6011
+   }))
6012
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6013
+#define simde_mm_slli_epi64(a, imm8)                                          \
6014
+   ((imm8 < 64)                                                          \
6015
+        ? wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) \
6016
+        : wasm_i64x2_const(0, 0))
6017
+#endif
6018
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6019
+#define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8)
6020
+#endif
6021
+
6022
+SIMDE_FUNCTION_ATTRIBUTES
6023
+simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
6024
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
6025
+{
6026
+   if (HEDLEY_UNLIKELY((imm8 > 15))) {
6027
+       return simde_mm_setzero_si128();
6028
+   }
6029
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
6030
+
6031
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
6032
+   r_.u16 = a_.u16 >> imm8;
6033
+#else
6034
+   SIMDE_VECTORIZE
6035
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
6036
+       r_.u16[i] = a_.u16[i] >> (imm8 & 0xff);
6037
+   }
6038
+#endif
6039
+
6040
+   return simde__m128i_from_private(r_);
6041
+}
6042
+#if defined(SIMDE_X86_SSE2_NATIVE)
6043
+#define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8)
6044
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6045
+#define simde_mm_srli_epi16(a, imm8)                                  \
6046
+   (__extension__({                                              \
6047
+       simde__m128i ret;                                     \
6048
+       if ((imm8) <= 0) {                                    \
6049
+           ret = a;                                      \
6050
+       } else if ((imm8) > 15) {                             \
6051
+           ret = simde_mm_setzero_si128();               \
6052
+       } else {                                              \
6053
+           ret = simde__m128i_from_neon_u16(vshrq_n_u16( \
6054
+               simde__m128i_to_neon_u16(a),          \
6055
+               (((imm8)&15) | (((imm8)&15) == 0)))); \
6056
+       }                                                     \
6057
+       ret;                                                  \
6058
+   }))
6059
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6060
+#define simde_mm_srli_epi16(a, imm8)                                          \
6061
+   ((imm8 < 16)                                                          \
6062
+        ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) \
6063
+        : wasm_i16x8_const(0, 0, 0, 0, 0, 0, 0, 0))
6064
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
6065
+#define simde_mm_srli_epi16(a, imm8)                                     \
6066
+   ((imm8 & ~15) ? simde_mm_setzero_si128()                         \
6067
+             : simde__m128i_from_altivec_i16(                   \
6068
+               vec_sr(simde__m128i_to_altivec_i16(a),   \
6069
+                      vec_splat_u16(HEDLEY_STATIC_CAST( \
6070
+                          unsigned short, imm8)))))
6071
+#endif
6072
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6073
+#define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8)
6074
+#endif
6075
+
6076
+SIMDE_FUNCTION_ATTRIBUTES
6077
+simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
6078
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
6079
+{
6080
+   if (HEDLEY_UNLIKELY((imm8 > 31))) {
6081
+       return simde_mm_setzero_si128();
6082
+   }
6083
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
6084
+
6085
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR)
6086
+   r_.u32 = a_.u32 >> (imm8 & 0xff);
6087
+#else
6088
+   SIMDE_VECTORIZE
6089
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
6090
+       r_.u32[i] = a_.u32[i] >> (imm8 & 0xff);
6091
+   }
6092
+#endif
6093
+
6094
+   return simde__m128i_from_private(r_);
6095
+}
6096
+#if defined(SIMDE_X86_SSE2_NATIVE)
6097
+#define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8)
6098
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6099
+#define simde_mm_srli_epi32(a, imm8)                                  \
6100
+   (__extension__({                                              \
6101
+       simde__m128i ret;                                     \
6102
+       if ((imm8) <= 0) {                                    \
6103
+           ret = a;                                      \
6104
+       } else if ((imm8) > 31) {                             \
6105
+           ret = simde_mm_setzero_si128();               \
6106
+       } else {                                              \
6107
+           ret = simde__m128i_from_neon_u32(vshrq_n_u32( \
6108
+               simde__m128i_to_neon_u32(a),          \
6109
+               (((imm8)&31) | (((imm8)&31) == 0)))); \
6110
+       }                                                     \
6111
+       ret;                                                  \
6112
+   }))
6113
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6114
+#define simde_mm_srli_epi32(a, imm8)                                          \
6115
+   ((imm8 < 32)                                                          \
6116
+        ? wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) \
6117
+        : wasm_i32x4_const(0, 0, 0, 0))
6118
+#elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE)
6119
+#define simde_mm_srli_epi32(a, imm8)                                        \
6120
+   (__extension__({                                                    \
6121
+       simde__m128i ret;                                           \
6122
+       if ((imm8) <= 0) {                                          \
6123
+           ret = a;                                            \
6124
+       } else if ((imm8) > 31) {                                   \
6125
+           ret = simde_mm_setzero_si128();                     \
6126
+       } else {                                                    \
6127
+           ret = simde__m128i_from_altivec_i32(                \
6128
+               vec_sr(simde__m128i_to_altivec_i32(a),      \
6129
+                      vec_splats(HEDLEY_STATIC_CAST(       \
6130
+                          unsigned int, (imm8)&31)))); \
6131
+       }                                                           \
6132
+       ret;                                                        \
6133
+   }))
6134
+#endif
6135
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6136
+#define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8)
6137
+#endif
6138
+
6139
+SIMDE_FUNCTION_ATTRIBUTES
6140
+simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
6141
+   SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255)
6142
+{
6143
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
6144
+
6145
+   if (HEDLEY_UNLIKELY((imm8 & 63) != imm8))
6146
+       return simde_mm_setzero_si128();
6147
+
6148
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6149
+   r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8));
6150
+#else
6151
+#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488)
6152
+   r_.u64 = a_.u64 >> imm8;
6153
+#else
6154
+   SIMDE_VECTORIZE
6155
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
6156
+       r_.u64[i] = a_.u64[i] >> imm8;
6157
+   }
6158
+#endif
6159
+#endif
6160
+
6161
+   return simde__m128i_from_private(r_);
6162
+}
6163
+#if defined(SIMDE_X86_SSE2_NATIVE)
6164
+#define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8)
6165
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6166
+#define simde_mm_srli_epi64(a, imm8)                                  \
6167
+   (__extension__({                                              \
6168
+       simde__m128i ret;                                     \
6169
+       if ((imm8) <= 0) {                                    \
6170
+           ret = a;                                      \
6171
+       } else if ((imm8) > 63) {                             \
6172
+           ret = simde_mm_setzero_si128();               \
6173
+       } else {                                              \
6174
+           ret = simde__m128i_from_neon_u64(vshrq_n_u64( \
6175
+               simde__m128i_to_neon_u64(a),          \
6176
+               (((imm8)&63) | (((imm8)&63) == 0)))); \
6177
+       }                                                     \
6178
+       ret;                                                  \
6179
+   }))
6180
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6181
+#define simde_mm_srli_epi64(a, imm8)                                          \
6182
+   ((imm8 < 64)                                                          \
6183
+        ? wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) \
6184
+        : wasm_i64x2_const(0, 0))
6185
+#endif
6186
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6187
+#define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8)
6188
+#endif
6189
+
6190
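/*
 * Illustrative sketch of the immediate-count logical shifts above
 * (simde_mm_slli_* / simde_mm_srli_*): once imm8 exceeds the lane width the
 * result is all zero; the count is not reduced modulo the width.
 *
 *     simde__m128i v  = simde_mm_set1_epi32(0xF0);
 *     simde__m128i r1 = simde_mm_srli_epi32(v, 4);    // every lane == 0x0F
 *     simde__m128i r2 = simde_mm_srli_epi32(v, 33);   // imm8 > 31 -> all zero
 */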
+SIMDE_FUNCTION_ATTRIBUTES
6191
+void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
6192
+              simde__m128d a)
6193
+{
6194
+#if defined(SIMDE_X86_SSE2_NATIVE)
6195
+   _mm_store_pd(mem_addr, a);
6196
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6197
+   vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6198
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6199
+   vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr),
6200
+         simde__m128d_to_private(a).neon_i64);
6201
+#else
6202
+   simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a,
6203
+            sizeof(a));
6204
+#endif
6205
+}
6206
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6207
+#define _mm_store_pd(mem_addr, a) \
6208
+   simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6209
+#endif
6210
+
6211
+SIMDE_FUNCTION_ATTRIBUTES
6212
+void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
6213
+           simde__m128d a)
6214
+{
6215
+#if defined(SIMDE_X86_SSE2_NATIVE)
6216
+   _mm_store1_pd(mem_addr, a);
6217
+#else
6218
+   simde__m128d_private a_ = simde__m128d_to_private(a);
6219
+
6220
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6221
+   vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0));
6222
+#else
6223
+   mem_addr[0] = a_.f64[0];
6224
+   mem_addr[1] = a_.f64[0];
6225
+#endif
6226
+#endif
6227
+}
6228
+#define simde_mm_store_pd1(mem_addr, a) \
6229
+   simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6230
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6231
+#define _mm_store1_pd(mem_addr, a) \
6232
+   simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6233
+#define _mm_store_pd1(mem_addr, a) \
6234
+   simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6235
+#endif
6236
+
6237
+SIMDE_FUNCTION_ATTRIBUTES
6238
+void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
6239
+{
6240
+#if defined(SIMDE_X86_SSE2_NATIVE)
6241
+   _mm_store_sd(mem_addr, a);
6242
+#else
6243
+   simde__m128d_private a_ = simde__m128d_to_private(a);
6244
+
6245
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6246
+   const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0);
6247
+   simde_memcpy(mem_addr, &v, sizeof(v));
6248
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6249
+   const int64_t v = vgetq_lane_s64(a_.neon_i64, 0);
6250
+   simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), &v,
6251
+            sizeof(v));
6252
+#else
6253
+   simde_float64 v = a_.f64[0];
6254
+   simde_memcpy(mem_addr, &v, sizeof(simde_float64));
6255
+#endif
6256
+#endif
6257
+}
6258
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6259
+#define _mm_store_sd(mem_addr, a) \
6260
+   simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6261
+#endif
6262
+
6263
+SIMDE_FUNCTION_ATTRIBUTES
6264
+void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
6265
+{
6266
+#if defined(SIMDE_X86_SSE2_NATIVE)
6267
+   _mm_store_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
6268
+#else
6269
+   simde__m128i_private a_ = simde__m128i_to_private(a);
6270
+
6271
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6272
+   vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), a_.neon_i32);
6273
+#else
6274
+   simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_,
6275
+            sizeof(a_));
6276
+#endif
6277
+#endif
6278
+}
6279
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6280
+#define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a)
6281
+#endif
6282
+
6283
+SIMDE_FUNCTION_ATTRIBUTES
6284
+void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
6285
+{
6286
+#if defined(SIMDE_X86_SSE2_NATIVE)
6287
+   _mm_storeh_pd(mem_addr, a);
6288
+#else
6289
+   simde__m128d_private a_ = simde__m128d_to_private(a);
6290
+
6291
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6292
+   *mem_addr = vgetq_lane_f64(a_.neon_f64, 1);
6293
+#else
6294
+   *mem_addr = a_.f64[1];
6295
+#endif
6296
+#endif
6297
+}
6298
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6299
+#define _mm_storeh_pd(mem_addr, a) \
6300
+   simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6301
+#endif
6302
+
6303
+SIMDE_FUNCTION_ATTRIBUTES
6304
+void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
6305
+{
6306
+#if defined(SIMDE_X86_SSE2_NATIVE)
6307
+   _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
6308
+#else
6309
+   simde__m128i_private a_ = simde__m128i_to_private(a);
6310
+   int64_t tmp;
6311
+
6312
+   /* memcpy to prevent aliasing, tmp because we can't take the
6313
+     * address of a vector element. */
6314
+
6315
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6316
+   tmp = vgetq_lane_s64(a_.neon_i64, 0);
6317
+#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE)
6318
+#if defined(SIMDE_BUG_GCC_95227)
6319
+   (void)a_;
6320
+#endif
6321
+   tmp = vec_extract(a_.altivec_i64, 0);
6322
+#else
6323
+   tmp = a_.i64[0];
6324
+#endif
6325
+
6326
+   simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6327
+#endif
6328
+}
6329
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6330
+#define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a)
6331
+#endif
6332
+
6333
+SIMDE_FUNCTION_ATTRIBUTES
6334
+void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
6335
+{
6336
+#if defined(SIMDE_X86_SSE2_NATIVE)
6337
+   _mm_storel_pd(mem_addr, a);
6338
+#else
6339
+   simde__m128d_private a_ = simde__m128d_to_private(a);
6340
+
6341
+   simde_float64 tmp;
6342
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6343
+   tmp = vgetq_lane_f64(a_.neon_f64, 0);
6344
+#else
6345
+   tmp = a_.f64[0];
6346
+#endif
6347
+   simde_memcpy(mem_addr, &tmp, sizeof(tmp));
6348
+#endif
6349
+}
6350
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6351
+#define _mm_storel_pd(mem_addr, a) \
6352
+   simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6353
+#endif
6354
+
6355
+SIMDE_FUNCTION_ATTRIBUTES
6356
+void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
6357
+{
6358
+#if defined(SIMDE_X86_SSE2_NATIVE)
6359
+   _mm_storer_pd(mem_addr, a);
6360
+#else
6361
+   simde__m128d_private a_ = simde__m128d_to_private(a);
6362
+
6363
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6364
+   vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr),
6365
+         vextq_s64(a_.neon_i64, a_.neon_i64, 1));
6366
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
6367
+   a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0);
6368
+   simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_));
6369
+#else
6370
+   mem_addr[0] = a_.f64[1];
6371
+   mem_addr[1] = a_.f64[0];
6372
+#endif
6373
+#endif
6374
+}
6375
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6376
+#define _mm_storer_pd(mem_addr, a) \
6377
+   simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6378
+#endif
6379
+
6380
+SIMDE_FUNCTION_ATTRIBUTES
6381
+void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
6382
+{
6383
+#if defined(SIMDE_X86_SSE2_NATIVE)
6384
+   _mm_storeu_pd(mem_addr, a);
6385
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6386
+   vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64);
6387
+#else
6388
+   simde_memcpy(mem_addr, &a, sizeof(a));
6389
+#endif
6390
+}
6391
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6392
+#define _mm_storeu_pd(mem_addr, a) \
6393
+   simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6394
+#endif
6395
+
6396
+SIMDE_FUNCTION_ATTRIBUTES
6397
+void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
6398
+{
6399
+#if defined(SIMDE_X86_SSE2_NATIVE)
6400
+   _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
6401
+#else
6402
+   simde_memcpy(mem_addr, &a, sizeof(a));
6403
+#endif
6404
+}
6405
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6406
+#define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a)
6407
+#endif
6408
+
6409
+SIMDE_FUNCTION_ATTRIBUTES
6410
+void simde_mm_storeu_si16(void *mem_addr, simde__m128i a)
6411
+{
6412
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
6413
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
6414
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
6415
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
6416
+   _mm_storeu_si16(mem_addr, a);
6417
+#else
6418
+   int16_t val = simde_x_mm_cvtsi128_si16(a);
6419
+   simde_memcpy(mem_addr, &val, sizeof(val));
6420
+#endif
6421
+}
6422
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6423
+#define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a)
6424
+#endif
6425
+
6426
+SIMDE_FUNCTION_ATTRIBUTES
6427
+void simde_mm_storeu_si32(void *mem_addr, simde__m128i a)
6428
+{
6429
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
6430
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
6431
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
6432
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
6433
+   _mm_storeu_si32(mem_addr, a);
6434
+#else
6435
+   int32_t val = simde_mm_cvtsi128_si32(a);
6436
+   simde_memcpy(mem_addr, &val, sizeof(val));
6437
+#endif
6438
+}
6439
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6440
+#define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a)
6441
+#endif
6442
+
6443
+SIMDE_FUNCTION_ATTRIBUTES
6444
+void simde_mm_storeu_si64(void *mem_addr, simde__m128i a)
6445
+{
6446
+#if defined(SIMDE_X86_SSE2_NATIVE) &&                 \
6447
+   (SIMDE_DETECT_CLANG_VERSION_CHECK(8, 0, 0) || \
6448
+    HEDLEY_GCC_VERSION_CHECK(11, 0, 0) ||        \
6449
+    HEDLEY_INTEL_VERSION_CHECK(20, 21, 1))
6450
+   _mm_storeu_si64(mem_addr, a);
6451
+#else
6452
+   int64_t val = simde_mm_cvtsi128_si64(a);
6453
+   simde_memcpy(mem_addr, &val, sizeof(val));
6454
+#endif
6455
+}
6456
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6457
+#define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a)
6458
+#endif
6459
+
6460
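/*
 * Illustrative sketch of the unaligned stores above: on targets without the
 * native intrinsics they fall back to simde_memcpy, so mem_addr needs no
 * particular alignment. A minimal use of the widths defined in this diff:
 *
 *     unsigned char buf[16];
 *     simde__m128i v = simde_mm_set1_epi8(0x42);
 *     simde_mm_storeu_si128(HEDLEY_REINTERPRET_CAST(simde__m128i *, buf), v);
 *     simde_mm_storeu_si32(buf + 1, v);   // unaligned 4-byte store is fine
 */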
+SIMDE_FUNCTION_ATTRIBUTES
6461
+void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
6462
+           simde__m128d a)
6463
+{
6464
+#if defined(SIMDE_X86_SSE2_NATIVE)
6465
+   _mm_stream_pd(mem_addr, a);
6466
+#else
6467
+   simde_memcpy(mem_addr, &a, sizeof(a));
6468
+#endif
6469
+}
6470
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6471
+#define _mm_stream_pd(mem_addr, a) \
6472
+   simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double *, mem_addr), a)
6473
+#endif
6474
+
6475
+SIMDE_FUNCTION_ATTRIBUTES
6476
+void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
6477
+{
6478
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
6479
+   _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i *, mem_addr), a);
6480
+#else
6481
+   simde_memcpy(mem_addr, &a, sizeof(a));
6482
+#endif
6483
+}
6484
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6485
+#define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a)
6486
+#endif
6487
+
6488
+SIMDE_FUNCTION_ATTRIBUTES
6489
+void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
6490
+{
6491
+#if defined(SIMDE_X86_SSE2_NATIVE)
6492
+   _mm_stream_si32(mem_addr, a);
6493
+#else
6494
+   *mem_addr = a;
6495
+#endif
6496
+}
6497
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6498
+#define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a)
6499
+#endif
6500
+
6501
+SIMDE_FUNCTION_ATTRIBUTES
6502
+void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
6503
+{
6504
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && \
6505
+   !defined(HEDLEY_MSVC_VERSION)
6506
+   _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int *,
6507
+                              int64_t *, mem_addr),
6508
+           a);
6509
+#else
6510
+   *mem_addr = a;
6511
+#endif
6512
+}
6513
+#define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a)
6514
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6515
+#define _mm_stream_si64(mem_addr, a)                                  \
6516
+   simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(          \
6517
+                    int64_t *, __int64 *, mem_addr), \
6518
+                a)
6519
+#define _mm_stream_si64x(mem_addr, a)                                 \
6520
+   simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(          \
6521
+                    int64_t *, __int64 *, mem_addr), \
6522
+                a)
6523
+#endif
6524
+
6525
+SIMDE_FUNCTION_ATTRIBUTES
6526
+simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
6527
+{
6528
+#if defined(SIMDE_X86_SSE2_NATIVE)
6529
+   return _mm_sub_epi8(a, b);
6530
+#else
6531
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6532
+                b_ = simde__m128i_to_private(b);
6533
+
6534
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6535
+   r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8);
6536
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6537
+   r_.i8 = a_.i8 - b_.i8;
6538
+#else
6539
+   SIMDE_VECTORIZE
6540
+   for (size_t i = 0; i < (sizeof(r_.i8) / sizeof(r_.i8[0])); i++) {
6541
+       r_.i8[i] = a_.i8[i] - b_.i8[i];
6542
+   }
6543
+#endif
6544
+
6545
+   return simde__m128i_from_private(r_);
6546
+#endif
6547
+}
6548
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6549
+#define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b)
6550
+#endif
6551
+
6552
+SIMDE_FUNCTION_ATTRIBUTES
6553
+simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
6554
+{
6555
+#if defined(SIMDE_X86_SSE2_NATIVE)
6556
+   return _mm_sub_epi16(a, b);
6557
+#else
6558
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6559
+                b_ = simde__m128i_to_private(b);
6560
+
6561
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6562
+   r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16);
6563
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6564
+   r_.i16 = a_.i16 - b_.i16;
6565
+#else
6566
+   SIMDE_VECTORIZE
6567
+   for (size_t i = 0; i < (sizeof(r_.i16) / sizeof(r_.i16[0])); i++) {
6568
+       r_.i16[i] = a_.i16[i] - b_.i16[i];
6569
+   }
6570
+#endif
6571
+
6572
+   return simde__m128i_from_private(r_);
6573
+#endif
6574
+}
6575
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6576
+#define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b)
6577
+#endif
6578
+
6579
+SIMDE_FUNCTION_ATTRIBUTES
6580
+simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
6581
+{
6582
+#if defined(SIMDE_X86_SSE2_NATIVE)
6583
+   return _mm_sub_epi32(a, b);
6584
+#else
6585
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6586
+                b_ = simde__m128i_to_private(b);
6587
+
6588
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6589
+   r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32);
6590
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6591
+   r_.i32 = a_.i32 - b_.i32;
6592
+#else
6593
+   SIMDE_VECTORIZE
6594
+   for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) {
6595
+       r_.i32[i] = a_.i32[i] - b_.i32[i];
6596
+   }
6597
+#endif
6598
+
6599
+   return simde__m128i_from_private(r_);
6600
+#endif
6601
+}
6602
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6603
+#define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b)
6604
+#endif
6605
+
6606
+SIMDE_FUNCTION_ATTRIBUTES
6607
+simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
6608
+{
6609
+#if defined(SIMDE_X86_SSE2_NATIVE)
6610
+   return _mm_sub_epi64(a, b);
6611
+#else
6612
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6613
+                b_ = simde__m128i_to_private(b);
6614
+
6615
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6616
+   r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64);
6617
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6618
+   r_.i64 = a_.i64 - b_.i64;
6619
+#else
6620
+   SIMDE_VECTORIZE
6621
+   for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) {
6622
+       r_.i64[i] = a_.i64[i] - b_.i64[i];
6623
+   }
6624
+#endif
6625
+
6626
+   return simde__m128i_from_private(r_);
6627
+#endif
6628
+}
6629
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6630
+#define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b)
6631
+#endif
6632
+
6633
+SIMDE_FUNCTION_ATTRIBUTES
6634
+simde__m128i simde_x_mm_sub_epu32(simde__m128i a, simde__m128i b)
6635
+{
6636
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6637
+                b_ = simde__m128i_to_private(b);
6638
+
6639
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6640
+   r_.u32 = a_.u32 - b_.u32;
6641
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6642
+   r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32);
6643
+#else
6644
+   SIMDE_VECTORIZE
6645
+   for (size_t i = 0; i < (sizeof(r_.u32) / sizeof(r_.u32[0])); i++) {
6646
+       r_.u32[i] = a_.u32[i] - b_.u32[i];
6647
+   }
6648
+#endif
6649
+
6650
+   return simde__m128i_from_private(r_);
6651
+}
6652
+
6653
+SIMDE_FUNCTION_ATTRIBUTES
6654
+simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
6655
+{
6656
+#if defined(SIMDE_X86_SSE2_NATIVE)
6657
+   return _mm_sub_pd(a, b);
6658
+#else
6659
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
6660
+                b_ = simde__m128d_to_private(b);
6661
+
6662
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6663
+   r_.f64 = a_.f64 - b_.f64;
6664
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6665
+   r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64);
6666
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6667
+   r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128);
6668
+#else
6669
+   SIMDE_VECTORIZE
6670
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
6671
+       r_.f64[i] = a_.f64[i] - b_.f64[i];
6672
+   }
6673
+#endif
6674
+
6675
+   return simde__m128d_from_private(r_);
6676
+#endif
6677
+}
6678
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6679
+#define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b)
6680
+#endif
6681
+
6682
+SIMDE_FUNCTION_ATTRIBUTES
6683
+simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
6684
+{
6685
+#if defined(SIMDE_X86_SSE2_NATIVE)
6686
+   return _mm_sub_sd(a, b);
6687
+#elif (SIMDE_NATURAL_VECTOR_SIZE > 0)
6688
+   return simde_mm_move_sd(a, simde_mm_sub_pd(a, b));
6689
+#else
6690
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
6691
+                b_ = simde__m128d_to_private(b);
6692
+
6693
+   r_.f64[0] = a_.f64[0] - b_.f64[0];
6694
+   r_.f64[1] = a_.f64[1];
6695
+
6696
+   return simde__m128d_from_private(r_);
6697
+#endif
6698
+}
6699
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6700
+#define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b)
6701
+#endif
6702
+
6703
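/*
 * Illustrative sketch: simde_mm_sub_sd follows the usual *_sd convention seen
 * above: only lane 0 is operated on, and lane 1 is carried over from `a`.
 * When SIMDE_NATURAL_VECTOR_SIZE > 0 it is composed from the packed op as
 * simde_mm_move_sd(a, simde_mm_sub_pd(a, b)).
 *
 *     simde__m128d a = simde_mm_set_pd(10.0, 5.0);
 *     simde__m128d b = simde_mm_set_pd(1.0, 2.0);
 *     simde__m128d r = simde_mm_sub_sd(a, b);   // { hi = 10.0, lo = 3.0 }
 */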
+SIMDE_FUNCTION_ATTRIBUTES
6704
+simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
6705
+{
6706
+#if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE)
6707
+   return _mm_sub_si64(a, b);
6708
+#else
6709
+   simde__m64_private r_, a_ = simde__m64_to_private(a),
6710
+                  b_ = simde__m64_to_private(b);
6711
+
6712
+#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
6713
+   r_.i64 = a_.i64 - b_.i64;
6714
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6715
+   r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64);
6716
+#else
6717
+   r_.i64[0] = a_.i64[0] - b_.i64[0];
6718
+#endif
6719
+
6720
+   return simde__m64_from_private(r_);
6721
+#endif
6722
+}
6723
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6724
+#define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b)
6725
+#endif
6726
+
6727
+SIMDE_FUNCTION_ATTRIBUTES
6728
+simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
6729
+{
6730
+#if defined(SIMDE_X86_SSE2_NATIVE)
6731
+   return _mm_subs_epi8(a, b);
6732
+#else
6733
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6734
+                b_ = simde__m128i_to_private(b);
6735
+
6736
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6737
+   r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8);
6738
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6739
+   r_.wasm_v128 = wasm_i8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6740
+#else
6741
+   SIMDE_VECTORIZE
6742
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) {
6743
+       if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) {
6744
+           r_.i8[i] = INT8_MIN;
6745
+       } else if ((b_.i8[i]) < 0 &&
6746
+              (a_.i8[i]) > INT8_MAX + (b_.i8[i])) {
6747
+           r_.i8[i] = INT8_MAX;
6748
+       } else {
6749
+           r_.i8[i] = (a_.i8[i]) - (b_.i8[i]);
6750
+       }
6751
+   }
6752
+#endif
6753
+
6754
+   return simde__m128i_from_private(r_);
6755
+#endif
6756
+}
6757
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6758
+#define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b)
6759
+#endif
6760
+
6761
+SIMDE_FUNCTION_ATTRIBUTES
6762
+simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
6763
+{
6764
+#if defined(SIMDE_X86_SSE2_NATIVE)
6765
+   return _mm_subs_epi16(a, b);
6766
+#else
6767
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6768
+                b_ = simde__m128i_to_private(b);
6769
+
6770
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6771
+   r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16);
6772
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6773
+   r_.wasm_v128 = wasm_i16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6774
+#else
6775
+   SIMDE_VECTORIZE
6776
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
6777
+       if (((b_.i16[i]) > 0 &&
6778
+            (a_.i16[i]) < INT16_MIN + (b_.i16[i]))) {
6779
+           r_.i16[i] = INT16_MIN;
6780
+       } else if ((b_.i16[i]) < 0 &&
6781
+              (a_.i16[i]) > INT16_MAX + (b_.i16[i])) {
6782
+           r_.i16[i] = INT16_MAX;
6783
+       } else {
6784
+           r_.i16[i] = (a_.i16[i]) - (b_.i16[i]);
6785
+       }
6786
+   }
6787
+#endif
6788
+
6789
+   return simde__m128i_from_private(r_);
6790
+#endif
6791
+}
6792
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6793
+#define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b)
6794
+#endif
6795
+
6796
+SIMDE_FUNCTION_ATTRIBUTES
6797
+simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
6798
+{
6799
+#if defined(SIMDE_X86_SSE2_NATIVE)
6800
+   return _mm_subs_epu8(a, b);
6801
+#else
6802
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6803
+                b_ = simde__m128i_to_private(b);
6804
+
6805
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6806
+   r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8);
6807
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6808
+   r_.wasm_v128 = wasm_u8x16_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6809
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6810
+   r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8);
6811
+#else
6812
+   SIMDE_VECTORIZE
6813
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i8[0])); i++) {
6814
+       const int32_t x = a_.u8[i] - b_.u8[i];
6815
+       if (x < 0) {
6816
+           r_.u8[i] = 0;
6817
+       } else if (x > UINT8_MAX) {
6818
+           r_.u8[i] = UINT8_MAX;
6819
+       } else {
6820
+           r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x);
6821
+       }
6822
+   }
6823
+#endif
6824
+
6825
+   return simde__m128i_from_private(r_);
6826
+#endif
6827
+}
6828
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6829
+#define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b)
6830
+#endif
6831
+
6832
+SIMDE_FUNCTION_ATTRIBUTES
6833
+simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
6834
+{
6835
+#if defined(SIMDE_X86_SSE2_NATIVE)
6836
+   return _mm_subs_epu16(a, b);
6837
+#else
6838
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
6839
+                b_ = simde__m128i_to_private(b);
6840
+
6841
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
6842
+   r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16);
6843
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6844
+   r_.wasm_v128 = wasm_u16x8_sub_saturate(a_.wasm_v128, b_.wasm_v128);
6845
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
6846
+   r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16);
6847
+#else
6848
+   SIMDE_VECTORIZE
6849
+   for (size_t i = 0; i < (sizeof(r_) / sizeof(r_.i16[0])); i++) {
6850
+       const int32_t x = a_.u16[i] - b_.u16[i];
6851
+       if (x < 0) {
6852
+           r_.u16[i] = 0;
6853
+       } else if (x > UINT16_MAX) {
6854
+           r_.u16[i] = UINT16_MAX;
6855
+       } else {
6856
+           r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x);
6857
+       }
6858
+   }
6859
+#endif
6860
+
6861
+   return simde__m128i_from_private(r_);
6862
+#endif
6863
+}
6864
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6865
+#define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b)
6866
+#endif
6867
+
6868
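/*
 * Illustrative sketch of the saturating subtractions above (simde_mm_subs_*):
 * results are pinned to the representable range of the lane type instead of
 * wrapping, matching the native _mm_subs_* intrinsics.
 *
 *     simde__m128i u = simde_mm_subs_epu8(simde_mm_set1_epi8(5),
 *                                         simde_mm_set1_epi8(9));     // all lanes 0
 *     simde__m128i s = simde_mm_subs_epi8(simde_mm_set1_epi8(-120),
 *                                         simde_mm_set1_epi8(100));   // all lanes -128
 */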
+SIMDE_FUNCTION_ATTRIBUTES
6869
+int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
6870
+{
6871
+#if defined(SIMDE_X86_SSE2_NATIVE)
6872
+   return _mm_ucomieq_sd(a, b);
6873
+#else
6874
+   simde__m128d_private a_ = simde__m128d_to_private(a),
6875
+                b_ = simde__m128d_to_private(b);
6876
+   int r;
6877
+
6878
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6879
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6880
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6881
+   uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(
6882
+       vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6883
+   uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64);
6884
+   r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0);
6885
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6886
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) ==
6887
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6888
+#elif defined(SIMDE_HAVE_FENV_H)
6889
+   fenv_t envp;
6890
+   int x = feholdexcept(&envp);
6891
+   r = a_.f64[0] == b_.f64[0];
6892
+   if (HEDLEY_LIKELY(x == 0))
6893
+       fesetenv(&envp);
6894
+#else
6895
+   r = a_.f64[0] == b_.f64[0];
6896
+#endif
6897
+
6898
+   return r;
6899
+#endif
6900
+}
6901
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6902
+#define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b)
6903
+#endif
6904
+
6905
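/*
 * Note on the portable path above: when <fenv.h> is available the ucomi*_sd
 * helpers guard the scalar comparison with feholdexcept()/fesetenv(), so a NaN
 * operand does not leave floating-point exception flags set, approximating the
 * quiet "unordered" behaviour of the native ucomisd-based intrinsics.
 *
 *     int eq = simde_mm_ucomieq_sd(simde_mm_set1_pd(1.0),
 *                                  simde_mm_set1_pd(1.0));   // eq == 1
 */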
+SIMDE_FUNCTION_ATTRIBUTES
6906
+int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
6907
+{
6908
+#if defined(SIMDE_X86_SSE2_NATIVE)
6909
+   return _mm_ucomige_sd(a, b);
6910
+#else
6911
+   simde__m128d_private a_ = simde__m128d_to_private(a),
6912
+                b_ = simde__m128d_to_private(b);
6913
+   int r;
6914
+
6915
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6916
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6917
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6918
+   uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6919
+   uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64);
6920
+   r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0);
6921
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6922
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >=
6923
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6924
+#elif defined(SIMDE_HAVE_FENV_H)
6925
+   fenv_t envp;
6926
+   int x = feholdexcept(&envp);
6927
+   r = a_.f64[0] >= b_.f64[0];
6928
+   if (HEDLEY_LIKELY(x == 0))
6929
+       fesetenv(&envp);
6930
+#else
6931
+   r = a_.f64[0] >= b_.f64[0];
6932
+#endif
6933
+
6934
+   return r;
6935
+#endif
6936
+}
6937
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6938
+#define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b)
6939
+#endif
6940
+
6941
+SIMDE_FUNCTION_ATTRIBUTES
6942
+int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
6943
+{
6944
+#if defined(SIMDE_X86_SSE2_NATIVE)
6945
+   return _mm_ucomigt_sd(a, b);
6946
+#else
6947
+   simde__m128d_private a_ = simde__m128d_to_private(a),
6948
+                b_ = simde__m128d_to_private(b);
6949
+   int r;
6950
+
6951
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6952
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6953
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6954
+   uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
6955
+   uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64);
6956
+   r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0);
6957
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6958
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >
6959
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6960
+#elif defined(SIMDE_HAVE_FENV_H)
6961
+   fenv_t envp;
6962
+   int x = feholdexcept(&envp);
6963
+   r = a_.f64[0] > b_.f64[0];
6964
+   if (HEDLEY_LIKELY(x == 0))
6965
+       fesetenv(&envp);
6966
+#else
6967
+   r = a_.f64[0] > b_.f64[0];
6968
+#endif
6969
+
6970
+   return r;
6971
+#endif
6972
+}
6973
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
6974
+#define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b)
6975
+#endif
6976
+
6977
+SIMDE_FUNCTION_ATTRIBUTES
6978
+int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
6979
+{
6980
+#if defined(SIMDE_X86_SSE2_NATIVE)
6981
+   return _mm_ucomile_sd(a, b);
6982
+#else
6983
+   simde__m128d_private a_ = simde__m128d_to_private(a),
6984
+                b_ = simde__m128d_to_private(b);
6985
+   int r;
6986
+
6987
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
6988
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
6989
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
6990
+   uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(
6991
+       vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
6992
+   uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64);
6993
+   r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0);
6994
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
6995
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <=
6996
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
6997
+#elif defined(SIMDE_HAVE_FENV_H)
6998
+   fenv_t envp;
6999
+   int x = feholdexcept(&envp);
7000
+   r = a_.f64[0] <= b_.f64[0];
7001
+   if (HEDLEY_LIKELY(x == 0))
7002
+       fesetenv(&envp);
7003
+#else
7004
+   r = a_.f64[0] <= b_.f64[0];
7005
+#endif
7006
+
7007
+   return r;
7008
+#endif
7009
+}
7010
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7011
+#define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b)
7012
+#endif
7013
+
7014
+SIMDE_FUNCTION_ATTRIBUTES
7015
+int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
7016
+{
7017
+#if defined(SIMDE_X86_SSE2_NATIVE)
7018
+   return _mm_ucomilt_sd(a, b);
7019
+#else
7020
+   simde__m128d_private a_ = simde__m128d_to_private(a),
7021
+                b_ = simde__m128d_to_private(b);
7022
+   int r;
7023
+
7024
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7025
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
7026
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
7027
+   uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(
7028
+       vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan))));
7029
+   uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64);
7030
+   r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0);
7031
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
7032
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <
7033
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
7034
+#elif defined(SIMDE_HAVE_FENV_H)
7035
+   fenv_t envp;
7036
+   int x = feholdexcept(&envp);
7037
+   r = a_.f64[0] < b_.f64[0];
7038
+   if (HEDLEY_LIKELY(x == 0))
7039
+       fesetenv(&envp);
7040
+#else
7041
+   r = a_.f64[0] < b_.f64[0];
7042
+#endif
7043
+
7044
+   return r;
7045
+#endif
7046
+}
7047
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7048
+#define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b)
7049
+#endif
7050
+
7051
+SIMDE_FUNCTION_ATTRIBUTES
7052
+int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
7053
+{
7054
+#if defined(SIMDE_X86_SSE2_NATIVE)
7055
+   return _mm_ucomineq_sd(a, b);
7056
+#else
7057
+   simde__m128d_private a_ = simde__m128d_to_private(a),
7058
+                b_ = simde__m128d_to_private(b);
7059
+   int r;
7060
+
7061
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7062
+   uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64);
7063
+   uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64);
7064
+   uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan);
7065
+   uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(
7066
+       vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64))));
7067
+   r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0);
7068
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
7069
+   return wasm_f64x2_extract_lane(a_.wasm_v128, 0) !=
7070
+          wasm_f64x2_extract_lane(b_.wasm_v128, 0);
7071
+#elif defined(SIMDE_HAVE_FENV_H)
7072
+   fenv_t envp;
7073
+   int x = feholdexcept(&envp);
7074
+   r = a_.f64[0] != b_.f64[0];
7075
+   if (HEDLEY_LIKELY(x == 0))
7076
+       fesetenv(&envp);
7077
+#else
7078
+   r = a_.f64[0] != b_.f64[0];
7079
+#endif
7080
+
7081
+   return r;
7082
+#endif
7083
+}
7084
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7085
+#define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b)
7086
+#endif
7087
+
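Note: the simde_mm_ucomi*_sd fallbacks above model x86's unordered scalar compares, where a NaN operand makes the pair "unordered". Reading the NEON branches: eq/le/lt report 1 when either input is NaN, while ge/gt/neq report 0. A scalar sketch of the two ends of that spectrum (hypothetical helpers, not part of the header):

    #include <math.h>

    /* NaN => "unordered": ucomieq reports 1, ucomineq reports 0. */
    static int ucomieq_scalar(double a, double b)
    {
        return (isnan(a) || isnan(b)) ? 1 : (a == b);
    }

    static int ucomineq_scalar(double a, double b)
    {
        return (isnan(a) || isnan(b)) ? 0 : (a != b);
    }

The fenv-based branch only suppresses floating-point exceptions around the plain comparison; it does not add the NaN special-casing itself.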
7088
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
7089
+HEDLEY_DIAGNOSTIC_PUSH
7090
+SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
7091
+#endif
7092
+
7093
+#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_)
7094
+HEDLEY_DIAGNOSTIC_POP
7095
+#endif
7096
+
7097
+SIMDE_FUNCTION_ATTRIBUTES
7098
+void simde_mm_lfence(void)
7099
+{
7100
+#if defined(SIMDE_X86_SSE2_NATIVE)
7101
+   _mm_lfence();
7102
+#else
7103
+   simde_mm_sfence();
7104
+#endif
7105
+}
7106
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7107
+#define _mm_lfence() simde_mm_lfence()
7108
+#endif
7109
+
7110
+SIMDE_FUNCTION_ATTRIBUTES
7111
+void simde_mm_mfence(void)
7112
+{
7113
+#if defined(SIMDE_X86_SSE2_NATIVE)
7114
+   _mm_mfence();
7115
+#else
7116
+   simde_mm_sfence();
7117
+#endif
7118
+}
7119
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7120
+#define _mm_mfence() simde_mm_mfence()
7121
+#endif
7122
+
7123
+SIMDE_FUNCTION_ATTRIBUTES
7124
+simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
7125
+{
7126
+#if defined(SIMDE_X86_SSE2_NATIVE)
7127
+   return _mm_unpackhi_epi8(a, b);
7128
+#else
7129
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7130
+                b_ = simde__m128i_to_private(b);
7131
+
7132
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7133
+   r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8);
7134
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7135
+   int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16));
7136
+   int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16));
7137
+   int8x8x2_t result = vzip_s8(a1, b1);
7138
+   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
7139
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7140
+   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26,
7141
+                     11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
7142
+#else
7143
+   SIMDE_VECTORIZE
7144
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
7145
+       r_.i8[(i * 2)] =
7146
+           a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
7147
+       r_.i8[(i * 2) + 1] =
7148
+           b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)];
7149
+   }
7150
+#endif
7151
+
7152
+   return simde__m128i_from_private(r_);
7153
+#endif
7154
+}
7155
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7156
+#define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b)
7157
+#endif
7158
+
7159
+SIMDE_FUNCTION_ATTRIBUTES
7160
+simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
7161
+{
7162
+#if defined(SIMDE_X86_SSE2_NATIVE)
7163
+   return _mm_unpackhi_epi16(a, b);
7164
+#else
7165
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7166
+                b_ = simde__m128i_to_private(b);
7167
+
7168
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7169
+   r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16);
7170
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7171
+   int16x4_t a1 = vget_high_s16(a_.neon_i16);
7172
+   int16x4_t b1 = vget_high_s16(b_.neon_i16);
7173
+   int16x4x2_t result = vzip_s16(a1, b1);
7174
+   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7175
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7176
+   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6,
7177
+                      14, 7, 15);
7178
+#else
7179
+   SIMDE_VECTORIZE
7180
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
7181
+       r_.i16[(i * 2)] =
7182
+           a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7183
+       r_.i16[(i * 2) + 1] =
7184
+           b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)];
7185
+   }
7186
+#endif
7187
+
7188
+   return simde__m128i_from_private(r_);
7189
+#endif
7190
+}
7191
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7192
+#define _mm_unpackhi_epi16(a, b) simde_mm_unpackhi_epi16(a, b)
7193
+#endif
7194
+
7195
+SIMDE_FUNCTION_ATTRIBUTES
7196
+simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
7197
+{
7198
+#if defined(SIMDE_X86_SSE2_NATIVE)
7199
+   return _mm_unpackhi_epi32(a, b);
7200
+#else
7201
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7202
+                b_ = simde__m128i_to_private(b);
7203
+
7204
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7205
+   r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32);
7206
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7207
+   int32x2_t a1 = vget_high_s32(a_.neon_i32);
7208
+   int32x2_t b1 = vget_high_s32(b_.neon_i32);
7209
+   int32x2x2_t result = vzip_s32(a1, b1);
7210
+   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7211
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7212
+   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7);
7213
+#else
7214
+   SIMDE_VECTORIZE
7215
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
7216
+       r_.i32[(i * 2)] =
7217
+           a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7218
+       r_.i32[(i * 2) + 1] =
7219
+           b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)];
7220
+   }
7221
+#endif
7222
+
7223
+   return simde__m128i_from_private(r_);
7224
+#endif
7225
+}
7226
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7227
+#define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b)
7228
+#endif
7229
+
7230
+SIMDE_FUNCTION_ATTRIBUTES
7231
+simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
7232
+{
7233
+#if defined(SIMDE_X86_SSE2_NATIVE)
7234
+   return _mm_unpackhi_epi64(a, b);
7235
+#else
7236
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7237
+                b_ = simde__m128i_to_private(b);
7238
+
7239
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7240
+   int64x1_t a_h = vget_high_s64(a_.neon_i64);
7241
+   int64x1_t b_h = vget_high_s64(b_.neon_i64);
7242
+   r_.neon_i64 = vcombine_s64(a_h, b_h);
7243
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7244
+   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3);
7245
+#else
7246
+   SIMDE_VECTORIZE
7247
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
7248
+       r_.i64[(i * 2)] =
7249
+           a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7250
+       r_.i64[(i * 2) + 1] =
7251
+           b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)];
7252
+   }
7253
+#endif
7254
+
7255
+   return simde__m128i_from_private(r_);
7256
+#endif
7257
+}
7258
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7259
+#define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b)
7260
+#endif
7261
+
7262
+SIMDE_FUNCTION_ATTRIBUTES
7263
+simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
7264
+{
7265
+#if defined(SIMDE_X86_SSE2_NATIVE)
7266
+   return _mm_unpackhi_pd(a, b);
7267
+#else
7268
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
7269
+                b_ = simde__m128d_to_private(b);
7270
+
7271
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7272
+   float64x1_t a_l = vget_high_f64(a_.f64);
7273
+   float64x1_t b_l = vget_high_f64(b_.f64);
7274
+   r_.neon_f64 = vcombine_f64(a_l, b_l);
7275
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
7276
+   r_.wasm_v128 = wasm_v64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3);
7277
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7278
+   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3);
7279
+#else
7280
+   SIMDE_VECTORIZE
7281
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
7282
+       r_.f64[(i * 2)] =
7283
+           a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7284
+       r_.f64[(i * 2) + 1] =
7285
+           b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)];
7286
+   }
7287
+#endif
7288
+
7289
+   return simde__m128d_from_private(r_);
7290
+#endif
7291
+}
7292
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7293
+#define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b)
7294
+#endif
7295
+
7296
+SIMDE_FUNCTION_ATTRIBUTES
7297
+simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
7298
+{
7299
+#if defined(SIMDE_X86_SSE2_NATIVE)
7300
+   return _mm_unpacklo_epi8(a, b);
7301
+#else
7302
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7303
+                b_ = simde__m128i_to_private(b);
7304
+
7305
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7306
+   r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8);
7307
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7308
+   int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16));
7309
+   int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16));
7310
+   int8x8x2_t result = vzip_s8(a1, b1);
7311
+   r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]);
7312
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7313
+   r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18,
7314
+                     3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
7315
+#else
7316
+   SIMDE_VECTORIZE
7317
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2); i++) {
7318
+       r_.i8[(i * 2)] = a_.i8[i];
7319
+       r_.i8[(i * 2) + 1] = b_.i8[i];
7320
+   }
7321
+#endif
7322
+
7323
+   return simde__m128i_from_private(r_);
7324
+#endif
7325
+}
7326
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7327
+#define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b)
7328
+#endif
7329
+
7330
+SIMDE_FUNCTION_ATTRIBUTES
7331
+simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
7332
+{
7333
+#if defined(SIMDE_X86_SSE2_NATIVE)
7334
+   return _mm_unpacklo_epi16(a, b);
7335
+#else
7336
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7337
+                b_ = simde__m128i_to_private(b);
7338
+
7339
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7340
+   r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16);
7341
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7342
+   int16x4_t a1 = vget_low_s16(a_.neon_i16);
7343
+   int16x4_t b1 = vget_low_s16(b_.neon_i16);
7344
+   int16x4x2_t result = vzip_s16(a1, b1);
7345
+   r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]);
7346
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7347
+   r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2,
7348
+                      10, 3, 11);
7349
+#else
7350
+   SIMDE_VECTORIZE
7351
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2); i++) {
7352
+       r_.i16[(i * 2)] = a_.i16[i];
7353
+       r_.i16[(i * 2) + 1] = b_.i16[i];
7354
+   }
7355
+#endif
7356
+
7357
+   return simde__m128i_from_private(r_);
7358
+#endif
7359
+}
7360
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7361
+#define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b)
7362
+#endif
7363
+
7364
+SIMDE_FUNCTION_ATTRIBUTES
7365
+simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
7366
+{
7367
+#if defined(SIMDE_X86_SSE2_NATIVE)
7368
+   return _mm_unpacklo_epi32(a, b);
7369
+#else
7370
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7371
+                b_ = simde__m128i_to_private(b);
7372
+
7373
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7374
+   r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32);
7375
+#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7376
+   int32x2_t a1 = vget_low_s32(a_.neon_i32);
7377
+   int32x2_t b1 = vget_low_s32(b_.neon_i32);
7378
+   int32x2x2_t result = vzip_s32(a1, b1);
7379
+   r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]);
7380
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7381
+   r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5);
7382
+#else
7383
+   SIMDE_VECTORIZE
7384
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2); i++) {
7385
+       r_.i32[(i * 2)] = a_.i32[i];
7386
+       r_.i32[(i * 2) + 1] = b_.i32[i];
7387
+   }
7388
+#endif
7389
+
7390
+   return simde__m128i_from_private(r_);
7391
+#endif
7392
+}
7393
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7394
+#define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b)
7395
+#endif
7396
+
7397
+SIMDE_FUNCTION_ATTRIBUTES
7398
+simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
7399
+{
7400
+#if defined(SIMDE_X86_SSE2_NATIVE)
7401
+   return _mm_unpacklo_epi64(a, b);
7402
+#else
7403
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7404
+                b_ = simde__m128i_to_private(b);
7405
+
7406
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7407
+   int64x1_t a_l = vget_low_s64(a_.i64);
7408
+   int64x1_t b_l = vget_low_s64(b_.i64);
7409
+   r_.neon_i64 = vcombine_s64(a_l, b_l);
7410
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7411
+   r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2);
7412
+#else
7413
+   SIMDE_VECTORIZE
7414
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2); i++) {
7415
+       r_.i64[(i * 2)] = a_.i64[i];
7416
+       r_.i64[(i * 2) + 1] = b_.i64[i];
7417
+   }
7418
+#endif
7419
+
7420
+   return simde__m128i_from_private(r_);
7421
+#endif
7422
+}
7423
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7424
+#define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b)
7425
+#endif
7426
+
7427
+SIMDE_FUNCTION_ATTRIBUTES
7428
+simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
7429
+{
7430
+#if defined(SIMDE_X86_SSE2_NATIVE)
7431
+   return _mm_unpacklo_pd(a, b);
7432
+#else
7433
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a),
7434
+                b_ = simde__m128d_to_private(b);
7435
+
7436
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7437
+   float64x1_t a_l = vget_low_f64(a_.f64);
7438
+   float64x1_t b_l = vget_low_f64(b_.f64);
7439
+   r_.neon_f64 = vcombine_f64(a_l, b_l);
7440
+#elif defined(SIMDE_SHUFFLE_VECTOR_)
7441
+   r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2);
7442
+#else
7443
+   SIMDE_VECTORIZE
7444
+   for (size_t i = 0; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2); i++) {
7445
+       r_.f64[(i * 2)] = a_.f64[i];
7446
+       r_.f64[(i * 2) + 1] = b_.f64[i];
7447
+   }
7448
+#endif
7449
+
7450
+   return simde__m128d_from_private(r_);
7451
+#endif
7452
+}
7453
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7454
+#define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b)
7455
+#endif
7456
+
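Note: the unpackhi/unpacklo family above interleaves lanes from the high or low halves of the two operands. As a reference for the lane order, a scalar model of the 32-bit low-half case (hypothetical helper):

    #include <stdint.h>
    #include <stddef.h>

    /* _mm_unpacklo_epi32 model: out = { a[0], b[0], a[1], b[1] }.
     * The unpackhi variant does the same with lanes 2 and 3. */
    static void unpacklo_u32(const uint32_t a[4], const uint32_t b[4],
                             uint32_t out[4])
    {
        for (size_t i = 0; i < 2; i++) {
            out[2 * i]     = a[i];
            out[2 * i + 1] = b[i];
        }
    }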
7457
+SIMDE_FUNCTION_ATTRIBUTES
7458
+simde__m128d simde_x_mm_negate_pd(simde__m128d a)
7459
+{
7460
+#if defined(SIMDE_X86_SSE_NATIVE)
7461
+   return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0)));
7462
+#else
7463
+   simde__m128d_private r_, a_ = simde__m128d_to_private(a);
7464
+
7465
+#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \
7466
+   (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8, 1, 0))
7467
+   r_.altivec_f64 = vec_neg(a_.altivec_f64);
7468
+#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
7469
+   r_.neon_f64 = vnegq_f64(a_.neon_f64);
7470
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
7471
+   r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128);
7472
+#elif defined(SIMDE_VECTOR_NEGATE)
7473
+   r_.f64 = -a_.f64;
7474
+#else
7475
+   SIMDE_VECTORIZE
7476
+   for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) {
7477
+       r_.f64[i] = -a_.f64[i];
7478
+   }
7479
+#endif
7480
+
7481
+   return simde__m128d_from_private(r_);
7482
+#endif
7483
+}
7484
+
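Note: simde_x_mm_negate_pd above negates by XOR-ing with -0.0, which flips only the IEEE-754 sign bit and never touches exponent or mantissa. A scalar illustration of the same trick (assumes 64-bit IEEE doubles; the helper name is made up):

    #include <stdint.h>
    #include <string.h>

    /* Negate a double by flipping its sign bit, as _mm_xor_pd(a, set1(-0.0))
     * does lane-wise in the SSE branch above. */
    static double negate_via_signbit(double x)
    {
        uint64_t bits;
        memcpy(&bits, &x, sizeof(bits));
        bits ^= UINT64_C(1) << 63;
        memcpy(&x, &bits, sizeof(x));
        return x;
    }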
7485
+SIMDE_FUNCTION_ATTRIBUTES
7486
+simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
7487
+{
7488
+#if defined(SIMDE_X86_SSE2_NATIVE)
7489
+   return _mm_xor_si128(a, b);
7490
+#else
7491
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a),
7492
+                b_ = simde__m128i_to_private(b);
7493
+
7494
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7495
+   r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32);
7496
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7497
+   r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32);
7498
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7499
+   r_.i32f = a_.i32f ^ b_.i32f;
7500
+#else
7501
+   SIMDE_VECTORIZE
7502
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
7503
+       r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i];
7504
+   }
7505
+#endif
7506
+
7507
+   return simde__m128i_from_private(r_);
7508
+#endif
7509
+}
7510
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7511
+#define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b)
7512
+#endif
7513
+
7514
+SIMDE_FUNCTION_ATTRIBUTES
7515
+simde__m128i simde_x_mm_not_si128(simde__m128i a)
7516
+{
7517
+#if defined(SIMDE_X86_AVX512VL_NATIVE)
7518
+   return _mm_ternarylogic_epi32(a, a, a, 0x55);
7519
+#else
7520
+   simde__m128i_private r_, a_ = simde__m128i_to_private(a);
7521
+
7522
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
7523
+   r_.neon_i32 = vmvnq_s32(a_.neon_i32);
7524
+#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
7525
+   r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32);
7526
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
7527
+   r_.wasm_v128 = wasm_v128_not(a_.wasm_v128);
7528
+#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
7529
+   r_.i32f = ~a_.i32f;
7530
+#else
7531
+   SIMDE_VECTORIZE
7532
+   for (size_t i = 0; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])); i++) {
7533
+       r_.i32f[i] = ~(a_.i32f[i]);
7534
+   }
7535
+#endif
7536
+
7537
+   return simde__m128i_from_private(r_);
7538
+#endif
7539
+}
7540
+
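Note: on AVX-512VL the simde_x_mm_not_si128 helper above uses _mm_ternarylogic_epi32(a, a, a, 0x55); with all three operands equal, the 0x55 truth table evaluates to ~a in a single instruction. Every other branch is the same bitwise NOT, down to the scalar loop, which behaves like this sketch (hypothetical helper over four 32-bit words):

    #include <stdint.h>
    #include <stddef.h>

    /* Bitwise NOT of a 128-bit value stored as four 32-bit words. */
    static void not_128(const uint32_t a[4], uint32_t out[4])
    {
        for (size_t i = 0; i < 4; i++)
            out[i] = ~a[i];
    }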
7541
+#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y))
7542
+#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES)
7543
+#define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y)
7544
+#endif
7545
+
7546
+SIMDE_END_DECLS_
7547
+
7548
+HEDLEY_DIAGNOSTIC_POP
7549
+
7550
+#endif /* !defined(SIMDE_X86_SSE2_H) */
7551
obs-studio-26.1.0.tar.xz/libobs/util/sse-intrin.h -> obs-studio-26.1.1.tar.xz/libobs/util/sse-intrin.h Changed
61
 
1
@@ -17,55 +17,9 @@
2
 
3
 #pragma once
4
 
5
-#if NEEDS_SIMDE
6
-
7
-#include "simde/sse2.h"
8
-
9
-#define __m128 simde__m128
10
-#define _mm_setzero_ps simde_mm_setzero_ps
11
-#define _mm_set_ps simde_mm_set_ps
12
-#define _mm_add_ps simde_mm_add_ps
13
-#define _mm_sub_ps simde_mm_sub_ps
14
-#define _mm_mul_ps simde_mm_mul_ps
15
-#define _mm_div_ps simde_mm_div_ps
16
-#define _mm_set1_ps simde_mm_set1_ps
17
-#define _mm_movehl_ps simde_mm_movehl_ps
18
-#define _mm_shuffle_ps simde_mm_shuffle_ps
19
-#define _mm_min_ps simde_mm_min_ps
20
-#define _mm_max_ps simde_mm_max_ps
21
-#define _mm_movelh_ps simde_mm_movelh_ps
22
-#define _mm_unpacklo_ps simde_mm_unpacklo_ps
23
-#define _mm_unpackhi_ps simde_mm_unpackhi_ps
24
-#define _mm_load_ps simde_mm_load_ps
25
-#define _mm_andnot_ps simde_mm_andnot_ps
26
-#define _mm_storeu_ps simde_mm_storeu_ps
27
-#define _mm_loadu_ps simde_mm_loadu_ps
28
-
29
-#define __m128i simde__m128i
30
-#define _mm_set1_epi32 simde_mm_set1_epi32
31
-#define _mm_set1_epi16 simde_mm_set1_epi16
32
-#define _mm_load_si128 simde_mm_load_si128
33
-#define _mm_packs_epi32 simde_mm_packs_epi32
34
-#define _mm_srli_si128 simde_mm_srli_si128
35
-#define _mm_and_si128 simde_mm_and_si128
36
-#define _mm_packus_epi16 simde_mm_packus_epi16
37
-#define _mm_add_epi64 simde_mm_add_epi64
38
-#define _mm_shuffle_epi32 simde_mm_shuffle_epi32
39
-#define _mm_srai_epi16 simde_mm_srai_epi16
40
-#define _mm_shufflelo_epi16 simde_mm_shufflelo_epi16
41
-#define _mm_storeu_si128 simde_mm_storeu_si128
42
-
43
-#define _MM_SHUFFLE SIMDE_MM_SHUFFLE
44
-#define _MM_TRANSPOSE4_PS SIMDE_MM_TRANSPOSE4_PS
45
-
46
-#else
47
-
48
-#if defined(__aarch64__) || defined(__arm__)
49
-#include <arm_neon.h>
50
-#include "sse2neon.h"
51
-#else
52
-#include <xmmintrin.h>
53
+#if defined(_MSC_VER)
54
 #include <emmintrin.h>
55
-#endif
56
-
57
+#else
58
+#define SIMDE_ENABLE_NATIVE_ALIASES
59
+#include "simde/x86/sse2.h"
60
 #endif
61
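Note: the sse-intrin.h rewrite above replaces the long list of manual #define aliases with SIMDE_ENABLE_NATIVE_ALIASES: non-MSVC builds include simde/x86/sse2.h and keep using the plain _mm_* names, which SIMDe routes to SSE2, NEON, AltiVec or scalar code as available, while MSVC still includes <emmintrin.h> directly. A hedged consumer-side sketch (the function and include path are illustrative, not from the tree):

    #include "util/sse-intrin.h"

    /* Compiles unchanged on x86 and, through SIMDe's native aliases,
     * on ARM and other targets. */
    static __m128 add4(__m128 a, __m128 b)
    {
        return _mm_add_ps(a, b);
    }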
obs-studio-26.1.0.tar.xz/plugins/coreaudio-encoder/CMakeLists.txt -> obs-studio-26.1.1.tar.xz/plugins/coreaudio-encoder/CMakeLists.txt Changed
25
 
1
@@ -4,17 +4,18 @@
2
    encoder.cpp)
3
 
4
 if (WIN32)
5
+   # Set compiler flag before adding resource file
6
+   if (MINGW)
7
+       set_source_files_properties(${coreaudio-encoder_SOURCES}
8
+           PROPERTIES COMPILE_FLAGS "-Wno-multichar")
9
+   endif()
10
+
11
    set(MODULE_DESCRIPTION "OBS Core Audio encoder")
12
    configure_file(${CMAKE_SOURCE_DIR}/cmake/winrc/obs-module.rc.in coreaudio-encoder.rc)
13
    list(APPEND coreaudio-encoder_SOURCES
14
        coreaudio-encoder.rc)
15
    set(coreaudio-encoder_HEADERS windows-imports.h)
16
    set(coreaudio-encoder_LIBS )
17
-
18
-   if (MINGW)
19
-       set_source_files_properties(${coreaudio-encoder_SOURCES}
20
-           PROPERTIES COMPILE_FLAGS "-Wno-multichar")
21
-   endif()
22
 else()
23
    find_library(COREFOUNDATION CoreFoundation)
24
    find_library(COREAUDIO CoreAudio)
25
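Note: the reordering above runs set_source_files_properties() before the generated coreaudio-encoder.rc is appended to the source list, so -Wno-multichar is presumably kept off the resource file and applies only to the C++ sources. The flag itself exists because Core Audio identifies formats and properties with four-character codes, which GCC warns about by default, for example (illustrative constant only):

    /* Multi-character constants like this trigger GCC's -Wmultichar. */
    typedef unsigned int FourCharCode;
    static const FourCharCode kIllustrativeFormatID = 'aac ';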
obs-studio-26.1.0.tar.xz/plugins/decklink/DecklinkInput.cpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/DecklinkInput.cpp Changed
10
 
1
@@ -100,7 +100,7 @@
2
        return false;
3
    }
4
 
5
-   if (!instance->StartCapture(mode, bmdVideoConnection,
6
+   if (!instance->StartCapture(mode, allow10Bit, bmdVideoConnection,
7
                    bmdAudioConnection)) {
8
        instance = nullptr;
9
        return false;
10
obs-studio-26.1.0.tar.xz/plugins/decklink/DecklinkInput.hpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/DecklinkInput.hpp Changed
9
 
1
@@ -50,6 +50,7 @@
2
    std::string hash;
3
    long long id;
4
    bool swap = false;
5
+   bool allow10Bit = false;
6
    BMDVideoConnection videoConnection;
7
    BMDAudioConnection audioConnection;
8
 };
9
obs-studio-26.1.0.tar.xz/plugins/decklink/OBSVideoFrame.cpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/OBSVideoFrame.cpp Changed
19
 
1
@@ -1,11 +1,14 @@
2
 #include "OBSVideoFrame.h"
3
 
4
-OBSVideoFrame::OBSVideoFrame(long width, long height)
5
+OBSVideoFrame::OBSVideoFrame(long width, long height,
6
+                BMDPixelFormat pixelFormat)
7
 {
8
+   int bpp = 2;
9
    this->width = width;
10
    this->height = height;
11
-   this->rowBytes = width * 2;
12
-   this->data = new unsigned char[width * height * 2 + 1];
13
+   this->rowBytes = width * bpp;
14
+   this->data = new unsigned char[width * height * bpp + 1];
15
+   this->pixelFormat = pixelFormat;
16
 }
17
 
18
 HRESULT OBSVideoFrame::SetFlags(BMDFrameFlags newFlags)
19
obs-studio-26.1.0.tar.xz/plugins/decklink/OBSVideoFrame.h -> obs-studio-26.1.1.tar.xz/plugins/decklink/OBSVideoFrame.h Changed
10
 
1
@@ -15,7 +15,7 @@
2
    unsigned char *data;
3
 
4
 public:
5
-   OBSVideoFrame(long width, long height);
6
+   OBSVideoFrame(long width, long height, BMDPixelFormat pixelFormat);
7
 
8
    HRESULT STDMETHODCALLTYPE SetFlags(BMDFrameFlags newFlags) override;
9
 
10
obs-studio-26.1.0.tar.xz/plugins/decklink/const.h -> obs-studio-26.1.1.tar.xz/plugins/decklink/const.h Changed
14
 
1
@@ -13,6 +13,7 @@
2
 #define AUTO_START "auto_start"
3
 #define KEYER "keyer"
4
 #define SWAP "swap"
5
+#define ALLOW_10_BIT "allow_10_bit"
6
 
7
 #define TEXT_DEVICE obs_module_text("Device")
8
 #define TEXT_VIDEO_CONNECTION obs_module_text("VideoConnection")
9
@@ -39,3 +40,4 @@
10
 #define TEXT_ENABLE_KEYER obs_module_text("Keyer")
11
 #define TEXT_SWAP obs_module_text("SwapFC-LFE")
12
 #define TEXT_SWAP_TOOLTIP obs_module_text("SwapFC-LFE.Tooltip")
13
+#define TEXT_ALLOW_10_BIT obs_module_text("Allow10Bit")
14
obs-studio-26.1.0.tar.xz/plugins/decklink/data/locale/en-US.ini -> obs-studio-26.1.1.tar.xz/plugins/decklink/data/locale/en-US.ini Changed
7
 
1
@@ -23,3 +23,4 @@
2
 SwapFC-LFE.Tooltip="Swap Front Center Channel and LFE Channel"
3
 VideoConnection="Video Connection"
4
 AudioConnection="Audio Connection"
5
+Allow10Bit="Allow 10 Bit (Required for SDI captions, may cause performance overhead)"
6
\ No newline at end of file
7
obs-studio-26.1.0.tar.xz/plugins/decklink/decklink-device-instance.cpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/decklink-device-instance.cpp Changed
138
 
1
@@ -24,10 +24,10 @@
2
        return VIDEO_FORMAT_BGRX;
3
 
4
    default:
5
-   case bmdFormat8BitYUV:;
6
+   case bmdFormat8BitYUV:
7
+   case bmdFormat10BitYUV:;
8
+       return VIDEO_FORMAT_UYVY;
9
    }
10
-
11
-   return VIDEO_FORMAT_UYVY;
12
 }
13
 
14
 static inline int ConvertChannelFormat(speaker_layout format)
15
@@ -168,21 +168,28 @@
16
        packets->Release();
17
    }
18
 
19
-   IDeckLinkVideoConversion *frameConverter =
20
-       CreateVideoConversionInstance();
21
+   IDeckLinkVideoFrame *frame;
22
+   if (videoFrame->GetPixelFormat() != convertFrame->GetPixelFormat()) {
23
+       IDeckLinkVideoConversion *frameConverter =
24
+           CreateVideoConversionInstance();
25
+
26
+       frameConverter->ConvertFrame(videoFrame, convertFrame);
27
 
28
-   frameConverter->ConvertFrame(videoFrame, convertFrame);
29
+       frame = convertFrame;
30
+   } else {
31
+       frame = videoFrame;
32
+   }
33
 
34
    void *bytes;
35
-   if (convertFrame->GetBytes(&bytes) != S_OK) {
36
+   if (frame->GetBytes(&bytes) != S_OK) {
37
        LOG(LOG_WARNING, "Failed to get video frame data");
38
        return;
39
    }
40
 
41
    currentFrame.data[0] = (uint8_t *)bytes;
42
-   currentFrame.linesize[0] = (uint32_t)convertFrame->GetRowBytes();
43
-   currentFrame.width = (uint32_t)convertFrame->GetWidth();
44
-   currentFrame.height = (uint32_t)convertFrame->GetHeight();
45
+   currentFrame.linesize[0] = (uint32_t)frame->GetRowBytes();
46
+   currentFrame.width = (uint32_t)frame->GetWidth();
47
+   currentFrame.height = (uint32_t)frame->GetHeight();
48
    currentFrame.timestamp = timestamp;
49
 
50
    obs_source_output_video2(
51
@@ -326,10 +333,22 @@
52
                    currentFrame.color_range_min,
53
                    currentFrame.color_range_max);
54
 
55
-   if (convertFrame) {
56
-       delete convertFrame;
57
+   delete convertFrame;
58
+
59
+   BMDPixelFormat convertFormat;
60
+   switch (pixelFormat) {
61
+   case bmdFormat8BitBGRA:
62
+       convertFormat = bmdFormat8BitBGRA;
63
+       break;
64
+   default:
65
+   case bmdFormat10BitYUV:
66
+   case bmdFormat8BitYUV:;
67
+       convertFormat = bmdFormat8BitYUV;
68
+       break;
69
    }
70
-   convertFrame = new OBSVideoFrame(mode_->GetWidth(), mode_->GetHeight());
71
+
72
+   convertFrame = new OBSVideoFrame(mode_->GetWidth(), mode_->GetHeight(),
73
+                    convertFormat);
74
 
75
 #ifdef LOG_SETUP_VIDEO_FORMAT
76
    LOG(LOG_INFO, "Setup video format: %s, %s, %s",
77
@@ -340,6 +359,7 @@
78
 }
79
 
80
 bool DeckLinkDeviceInstance::StartCapture(DeckLinkDeviceMode *mode_,
81
+                     bool allow10Bit_,
82
                      BMDVideoConnection bmdVideoConnection,
83
                      BMDAudioConnection bmdAudioConnection)
84
 {
85
@@ -392,7 +412,11 @@
86
    bool isauto = mode_->GetName() == "Auto";
87
    if (isauto) {
88
        displayMode = bmdModeNTSC;
89
-       pixelFormat = bmdFormat10BitYUV;
90
+       if (allow10Bit) {
91
+           pixelFormat = bmdFormat10BitYUV;
92
+       } else {
93
+           pixelFormat = bmdFormat8BitYUV;
94
+       }
95
        flags = bmdVideoInputEnableFormatDetection;
96
    } else {
97
        displayMode = mode_->GetDisplayMode();
98
@@ -401,6 +425,8 @@
99
        flags = bmdVideoInputFlagDefault;
100
    }
101
 
102
+   allow10Bit = allow10Bit_;
103
+
104
    const HRESULT videoResult =
105
        input->EnableVideoInput(displayMode, pixelFormat, flags);
106
    if (videoResult != S_OK) {
107
@@ -631,15 +657,22 @@
108
 {
109
 
110
    if (events & bmdVideoInputColorspaceChanged) {
111
-       switch (detectedSignalFlags) {
112
-       case bmdDetectedVideoInputRGB444:
113
+       if (detectedSignalFlags & bmdDetectedVideoInputRGB444) {
114
            pixelFormat = bmdFormat8BitBGRA;
115
-           break;
116
-
117
-       default:
118
-       case bmdDetectedVideoInputYCbCr422:
119
-           pixelFormat = bmdFormat10BitYUV;
120
-           break;
121
+       }
122
+       if (detectedSignalFlags & bmdDetectedVideoInputYCbCr422) {
123
+           if (detectedSignalFlags &
124
+               bmdDetectedVideoInput10BitDepth) {
125
+               if (allow10Bit) {
126
+                   pixelFormat = bmdFormat10BitYUV;
127
+               } else {
128
+                   pixelFormat = bmdFormat8BitYUV;
129
+               }
130
+           }
131
+           if (detectedSignalFlags &
132
+               bmdDetectedVideoInput8BitDepth) {
133
+               pixelFormat = bmdFormat8BitYUV;
134
+           }
135
        }
136
    }
137
 
138
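Note: the detection code above now honours the new allow-10-bit flag: RGB signals map to 8-bit BGRA, and YCbCr signals pick 10-bit YUV only when the input reports 10-bit depth and the user enabled the option, otherwise 8-bit YUV. A condensed model of that decision (hypothetical helper; the real code tests the BMD detected-signal flags directly):

    enum pixfmt { FMT_8BIT_BGRA, FMT_8BIT_YUV, FMT_10BIT_YUV };

    /* Condensed model of the choice made in the format-changed callback. */
    static enum pixfmt choose_format(int is_rgb444, int is_10bit, int allow_10bit)
    {
        if (is_rgb444)
            return FMT_8BIT_BGRA;
        return (is_10bit && allow_10bit) ? FMT_10BIT_YUV : FMT_8BIT_YUV;
    }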
obs-studio-26.1.0.tar.xz/plugins/decklink/decklink-device-instance.hpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/decklink-device-instance.hpp Changed
18
 
1
@@ -35,6 +35,7 @@
2
    AudioRepacker *audioRepacker = nullptr;
3
    speaker_layout channelFormat = SPEAKERS_STEREO;
4
    bool swap;
5
+   bool allow10Bit;
6
 
7
    OBSVideoFrame *convertFrame = nullptr;
8
    IDeckLinkMutableVideoFrame *decklinkOutputFrame = nullptr;
9
@@ -85,7 +86,7 @@
10
 
11
    inline DeckLinkDeviceMode *GetMode() const { return mode; }
12
 
13
-   bool StartCapture(DeckLinkDeviceMode *mode,
14
+   bool StartCapture(DeckLinkDeviceMode *mode, bool allow10Bit,
15
              BMDVideoConnection bmdVideoConnection,
16
              BMDAudioConnection bmdAudioConnection);
17
    bool StopCapture(void);
18
obs-studio-26.1.0.tar.xz/plugins/decklink/decklink-source.cpp -> obs-studio-26.1.1.tar.xz/plugins/decklink/decklink-source.cpp Changed
36
 
1
@@ -80,6 +80,7 @@
2
    decklink->SetChannelFormat(channelFormat);
3
    decklink->hash = std::string(hash);
4
    decklink->swap = obs_data_get_bool(settings, SWAP);
5
+   decklink->allow10Bit = obs_data_get_bool(settings, ALLOW_10_BIT);
6
    decklink->Activate(device, id, videoConnection, audioConnection);
7
 }
8
 
9
@@ -247,6 +248,9 @@
10
    list = obs_properties_get(props, PIXEL_FORMAT);
11
    obs_property_set_visible(list, id != MODE_ID_AUTO);
12
 
13
+   auto allow10BitProp = obs_properties_get(props, ALLOW_10_BIT);
14
+   obs_property_set_visible(allow10BitProp, id == MODE_ID_AUTO);
15
+
16
    return true;
17
 }
18
 
19
@@ -277,6 +281,7 @@
20
                       OBS_COMBO_FORMAT_INT);
21
 
22
    obs_property_list_add_int(list, "8-bit YUV", bmdFormat8BitYUV);
23
+   obs_property_list_add_int(list, "10-bit YUV", bmdFormat10BitYUV);
24
    obs_property_list_add_int(list, "8-bit BGRA", bmdFormat8BitBGRA);
25
 
26
    list = obs_properties_add_list(props, COLOR_SPACE, TEXT_COLOR_SPACE,
27
@@ -322,6 +327,8 @@
28
 
29
    obs_properties_add_bool(props, DEACTIVATE_WNS, TEXT_DWNS);
30
 
31
+   obs_properties_add_bool(props, ALLOW_10_BIT, TEXT_ALLOW_10_BIT);
32
+
33
    UNUSED_PARAMETER(data);
34
    return props;
35
 }
36
obs-studio-26.1.0.tar.xz/plugins/image-source/image-source.c -> obs-studio-26.1.1.tar.xz/plugins/image-source/image-source.c Changed
17
 
1
@@ -211,13 +211,14 @@
2
 }
3
 
4
 static const char *image_filter =
5
-   "All formats (*.bmp *.tga *.png *.jpeg *.jpg *.gif *.psd);;"
6
+   "All formats (*.bmp *.tga *.png *.jpeg *.jpg *.gif *.psd *.webp);;"
7
    "BMP Files (*.bmp);;"
8
    "Targa Files (*.tga);;"
9
    "PNG Files (*.png);;"
10
    "JPEG Files (*.jpeg *.jpg);;"
11
    "GIF Files (*.gif);;"
12
    "PSD Files (*.psd);;"
13
+   "WebP Files (*.webp);;"
14
    "All Files (*.*)";
15
 
16
 static obs_properties_t *image_source_properties(void *data)
17
obs-studio-26.1.0.tar.xz/plugins/image-source/obs-slideshow.c -> obs-studio-26.1.1.tar.xz/plugins/image-source/obs-slideshow.c Changed
22
 
1
@@ -699,9 +699,9 @@
2
    if (!ss->transition || !ss->slide_time)
3
        return;
4
 
5
-   if (ss->restart_on_activate && !ss->randomize && ss->use_cut) {
6
+   if (ss->restart_on_activate && ss->use_cut) {
7
        ss->elapsed = 0.0f;
8
-       ss->cur_item = 0;
9
+       ss->cur_item = ss->randomize ? random_file(ss) : 0;
10
        do_transition(ss, false);
11
        ss->restart_on_activate = false;
12
        ss->use_cut = false;
13
@@ -848,7 +848,7 @@
14
 }
15
 
16
 static const char *file_filter =
17
-   "Image files (*.bmp *.tga *.png *.jpeg *.jpg *.gif)";
18
+   "Image files (*.bmp *.tga *.png *.jpeg *.jpg *.gif *.webp)";
19
 
20
 static const char *aspects[] = {"16:9", "16:10", "4:3", "1:1"};
21
 
22
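Note: the slideshow fix above makes a restart-on-activate cut land on a random slide when randomization is enabled, instead of always snapping back to slide 0. random_file() is the existing helper in obs-slideshow.c; a hypothetical stand-in would simply pick a uniform index:

    #include <stdlib.h>
    #include <stddef.h>

    /* Hypothetical stand-in for random_file(): any valid slide index. */
    static size_t pick_random_slide(size_t slide_count)
    {
        return slide_count ? (size_t)rand() % slide_count : 0;
    }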
obs-studio-26.1.0.tar.xz/plugins/linux-jack/jack-wrapper.c -> obs-studio-26.1.1.tar.xz/plugins/linux-jack/jack-wrapper.c Changed
70
 
1
@@ -61,11 +61,15 @@
2
 int jack_process_callback(jack_nframes_t nframes, void *arg)
3
 {
4
    struct jack_data *data = (struct jack_data *)arg;
5
+   jack_nframes_t current_frames;
6
+   jack_time_t current_usecs, next_usecs;
7
+   float period_usecs;
8
+
9
+   uint64_t now = os_gettime_ns();
10
+
11
    if (data == 0)
12
        return 0;
13
 
14
-   pthread_mutex_lock(&data->jack_mutex);
15
-
16
    struct obs_source_audio out;
17
    out.speakers = jack_channels_to_obs_speakers(data->channels);
18
    out.samples_per_sec = jack_get_sample_rate(data->jack_client);
19
@@ -80,11 +84,19 @@
20
    }
21
 
22
    out.frames = nframes;
23
-   out.timestamp = os_gettime_ns() -
24
-           jack_frames_to_time(data->jack_client, nframes);
25
+   if (!jack_get_cycle_times(data->jack_client, &current_frames,
26
+                 &current_usecs, &next_usecs, &period_usecs)) {
27
+       out.timestamp = now - (int64_t)(period_usecs * 1000);
28
+   } else {
29
+       out.timestamp = now - util_mul_div64(nframes, 1000000000ULL,
30
+                            data->samples_per_sec);
31
+       blog(LOG_WARNING,
32
+            "jack_get_cycle_times error: guessing timestamp");
33
+   }
34
 
35
+   /* FIXME: this function is not realtime-safe, we should do something
36
+    * about this */
37
    obs_source_output_audio(data->source, &out);
38
-   pthread_mutex_unlock(&data->jack_mutex);
39
    return 0;
40
 }
41
 
42
@@ -115,7 +127,7 @@
43
 
44
        data->jack_ports[i] = jack_port_register(
45
            data->jack_client, port_name, JACK_DEFAULT_AUDIO_TYPE,
46
-           JackPortIsInput, 0);
47
+           JackPortIsInput | JackPortIsTerminal, 0);
48
        if (data->jack_ports[i] == NULL) {
49
            blog(LOG_ERROR,
50
                 "jack_port_register Error:"
51
@@ -151,17 +163,11 @@
52
    pthread_mutex_lock(&data->jack_mutex);
53
 
54
    if (data->jack_client) {
55
+       jack_client_close(data->jack_client);
56
        if (data->jack_ports != NULL) {
57
-           for (int i = 0; i < data->channels; ++i) {
58
-               if (data->jack_ports[i] != NULL)
59
-                   jack_port_unregister(
60
-                       data->jack_client,
61
-                       data->jack_ports[i]);
62
-           }
63
            bfree(data->jack_ports);
64
            data->jack_ports = NULL;
65
        }
66
-       jack_client_close(data->jack_client);
67
        data->jack_client = NULL;
68
    }
69
    pthread_mutex_unlock(&data->jack_mutex);
70
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/CMSampleBufferUtils.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/CMSampleBufferUtils.mm Changed
11
 
1
@@ -83,6 +83,9 @@
2
 
3
 static void releaseNSData(void *o, void *block, size_t size)
4
 {
5
+   UNUSED_PARAMETER(block);
6
+   UNUSED_PARAMETER(size);
7
+
8
    NSData *data = (__bridge_transfer NSData *)o;
9
    data = nil; // Assuming ARC is enabled
10
 }
11
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/Logging.h -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/Logging.h Changed
8
 
1
@@ -29,4 +29,6 @@
2
 #define VLogFunc(fmt, ...)
3
 #define ELog(fmt, ...) DLog(fmt, ##__VA_ARGS__)
4
 
5
+#define UNUSED_PARAMETER(param) (void)param
6
+
7
 #endif /* Logging_h */
8
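Note: the UNUSED_PARAMETER macro added above is what the mac-virtualcam sources below use to silence unused-parameter warnings for callback arguments the DAL plug-in deliberately ignores; the cast to void compiles to nothing. For example (illustrative function):

    #define UNUSED_PARAMETER(param) (void)param

    static int callback(int used, int ignored)
    {
        UNUSED_PARAMETER(ignored); /* suppresses -Wunused-parameter */
        return used * 2;
    }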
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALDevice.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALDevice.mm Changed
53
 
1
@@ -86,9 +86,7 @@
2
    case kCMIODevicePropertyDeviceMaster:
3
        return sizeof(pid_t);
4
    default:
5
-       DLog(@"Device unhandled getPropertyDataSizeWithAddress for %@",
6
-            [OBSDALObjectStore
7
-                StringFromPropertySelector:address.mSelector]);
8
+       break;
9
    };
10
 
11
    return 0;
12
@@ -191,10 +189,6 @@
13
        *dataUsed = sizeof(pid_t);
14
        break;
15
    default:
16
-       DLog(@"Device unhandled getPropertyDataWithAddress for %@",
17
-            [OBSDALObjectStore
18
-                StringFromPropertySelector:address.mSelector]);
19
-       *dataUsed = 0;
20
        break;
21
    };
22
 }
23
@@ -227,9 +221,6 @@
24
    case kCMIODevicePropertyLinkedCoreAudioDeviceUID:
25
        return false;
26
    default:
27
-       DLog(@"Device unhandled hasPropertyWithAddress for %@",
28
-            [OBSDALObjectStore
29
-                StringFromPropertySelector:address.mSelector]);
30
        return false;
31
    };
32
 }
33
@@ -262,9 +253,6 @@
34
    case kCMIODevicePropertyDeviceMaster:
35
        return true;
36
    default:
37
-       DLog(@"Device unhandled isPropertySettableWithAddress for %@",
38
-            [OBSDALObjectStore
39
-                StringFromPropertySelector:address.mSelector]);
40
        return false;
41
    };
42
 }
43
@@ -285,9 +273,6 @@
44
        self.masterPid = *static_cast<const pid_t *>(data);
45
        break;
46
    default:
47
-       DLog(@"Device unhandled setPropertyDataWithAddress for %@",
48
-            [OBSDALObjectStore
49
-                StringFromPropertySelector:address.mSelector]);
50
        break;
51
    };
52
 }
53
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALPlugInInterface.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALPlugInInterface.mm Changed
136
 
1
@@ -34,6 +34,8 @@
2
 
3
 ULONG HardwarePlugIn_AddRef(CMIOHardwarePlugInRef self)
4
 {
5
+   UNUSED_PARAMETER(self);
6
+
7
    sRefCount += 1;
8
    DLogFunc(@"sRefCount now = %d", sRefCount);
9
    return sRefCount;
10
@@ -41,6 +43,8 @@
11
 
12
 ULONG HardwarePlugIn_Release(CMIOHardwarePlugInRef self)
13
 {
14
+   UNUSED_PARAMETER(self);
15
+
16
    sRefCount -= 1;
17
    DLogFunc(@"sRefCount now = %d", sRefCount);
18
    return sRefCount;
19
@@ -49,6 +53,7 @@
20
 HRESULT HardwarePlugIn_QueryInterface(CMIOHardwarePlugInRef self, REFIID uuid,
21
                      LPVOID *interface)
22
 {
23
+   UNUSED_PARAMETER(self);
24
    DLogFunc(@"");
25
 
26
    if (!interface) {
27
@@ -161,6 +166,7 @@
28
 void HardwarePlugIn_ObjectShow(CMIOHardwarePlugInRef self,
29
                   CMIOObjectID objectID)
30
 {
31
+   UNUSED_PARAMETER(objectID);
32
    DLogFunc(@"self=%p", self);
33
 }
34
 
35
@@ -169,6 +175,7 @@
36
                 CMIOObjectID objectID,
37
                 const CMIOObjectPropertyAddress *address)
38
 {
39
+   UNUSED_PARAMETER(self);
40
 
41
    NSObject<CMIOObject> *object =
42
        [OBSDALObjectStore GetObjectWithId:objectID];
43
@@ -215,6 +222,7 @@
44
    const CMIOObjectPropertyAddress *address, UInt32 qualifierDataSize,
45
    const void *qualifierData, UInt32 *dataSize)
46
 {
47
+   UNUSED_PARAMETER(self);
48
 
49
    NSObject<CMIOObject> *object =
50
        [OBSDALObjectStore GetObjectWithId:objectID];
51
@@ -240,6 +248,7 @@
52
    const void *qualifierData, UInt32 dataSize, UInt32 *dataUsed,
53
    void *data)
54
 {
55
+   UNUSED_PARAMETER(self);
56
 
57
    NSObject<CMIOObject> *object =
58
        [OBSDALObjectStore GetObjectWithId:objectID];
59
@@ -345,6 +354,8 @@
60
 OSStatus HardwarePlugIn_DeviceSuspend(CMIOHardwarePlugInRef self,
61
                      CMIODeviceID deviceID)
62
 {
63
+   UNUSED_PARAMETER(deviceID);
64
+
65
    DLogFunc(@"self=%p", self);
66
    return kCMIOHardwareNoError;
67
 }
68
@@ -352,6 +363,8 @@
69
 OSStatus HardwarePlugIn_DeviceResume(CMIOHardwarePlugInRef self,
70
                     CMIODeviceID deviceID)
71
 {
72
+   UNUSED_PARAMETER(deviceID);
73
+
74
    DLogFunc(@"self=%p", self);
75
    return kCMIOHardwareNoError;
76
 }
77
@@ -380,6 +393,9 @@
78
                       CMIODeviceID deviceID,
79
                       CMIODeviceAVCCommand *ioAVCCommand)
80
 {
81
+   UNUSED_PARAMETER(deviceID);
82
+   UNUSED_PARAMETER(ioAVCCommand);
83
+
84
    DLogFunc(@"self=%p", self);
85
    return kCMIOHardwareNoError;
86
 }
87
@@ -389,6 +405,9 @@
88
                     CMIODeviceID deviceID,
89
                     CMIODeviceRS422Command *ioRS422Command)
90
 {
91
+   UNUSED_PARAMETER(deviceID);
92
+   UNUSED_PARAMETER(ioRS422Command);
93
+
94
    DLogFunc(@"self=%p", self);
95
    return kCMIOHardwareNoError;
96
 }
97
@@ -396,6 +415,8 @@
98
 OSStatus HardwarePlugIn_StreamDeckPlay(CMIOHardwarePlugInRef self,
99
                       CMIOStreamID streamID)
100
 {
101
+   UNUSED_PARAMETER(streamID);
102
+
103
    DLogFunc(@"self=%p", self);
104
    return kCMIOHardwareIllegalOperationError;
105
 }
106
@@ -403,6 +424,8 @@
107
 OSStatus HardwarePlugIn_StreamDeckStop(CMIOHardwarePlugInRef self,
108
                       CMIOStreamID streamID)
109
 {
110
+   UNUSED_PARAMETER(streamID);
111
+
112
    DLogFunc(@"self=%p", self);
113
    return kCMIOHardwareIllegalOperationError;
114
 }
115
@@ -410,6 +433,9 @@
116
 OSStatus HardwarePlugIn_StreamDeckJog(CMIOHardwarePlugInRef self,
117
                      CMIOStreamID streamID, SInt32 speed)
118
 {
119
+   UNUSED_PARAMETER(streamID);
120
+   UNUSED_PARAMETER(speed);
121
+
122
    DLogFunc(@"self=%p", self);
123
    return kCMIOHardwareIllegalOperationError;
124
 }
125
@@ -419,6 +445,10 @@
126
                    Float64 requestedTimecode,
127
                    Boolean playOnCue)
128
 {
129
+   UNUSED_PARAMETER(streamID);
130
+   UNUSED_PARAMETER(requestedTimecode);
131
+   UNUSED_PARAMETER(playOnCue);
132
+
133
    DLogFunc(@"self=%p", self);
134
    return kCMIOHardwareIllegalOperationError;
135
 }
136
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALPluginMain.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALPluginMain.mm Changed
10
 
1
@@ -27,6 +27,8 @@
2
 extern "C" {
3
 void *PlugInMain(CFAllocatorRef allocator, CFUUIDRef requestedTypeUUID)
4
 {
5
+   UNUSED_PARAMETER(allocator);
6
+
7
    DLogFunc(@"version=%@", PLUGIN_VERSION);
8
    if (!CFEqual(requestedTypeUUID, kCMIOHardwarePlugInTypeID)) {
9
        return 0;
10
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALStream.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/dal-plugin/OBSDALStream.mm Changed
56
 
1
@@ -299,7 +299,6 @@
2
 - (void)fillFrame
3
 {
4
    if (CMSimpleQueueGetFullness(self.queue) >= 1.0) {
5
-       DLog(@"Queue is full, bailing out");
6
        return;
7
    }
8
 
9
@@ -433,9 +432,6 @@
10
    case kCMIOStreamPropertyClock:
11
        return sizeof(CFTypeRef);
12
    default:
13
-       DLog(@"Stream unhandled getPropertyDataSizeWithAddress for %@",
14
-            [OBSDALObjectStore
15
-                StringFromPropertySelector:address.mSelector]);
16
        return 0;
17
    };
18
 }
19
@@ -509,9 +505,6 @@
20
        *dataUsed = sizeof(CFTypeRef);
21
        break;
22
    default:
23
-       DLog(@"Stream unhandled getPropertyDataWithAddress for %@",
24
-            [OBSDALObjectStore
25
-                StringFromPropertySelector:address.mSelector]);
26
        *dataUsed = 0;
27
    };
28
 }
29
@@ -543,17 +536,12 @@
30
                 StringFromPropertySelector:address.mSelector]);
31
        return false;
32
    default:
33
-       DLog(@"Stream unhandled hasPropertyWithAddress for %@",
34
-            [OBSDALObjectStore
35
-                StringFromPropertySelector:address.mSelector]);
36
        return false;
37
    };
38
 }
39
 
40
 - (BOOL)isPropertySettableWithAddress:(CMIOObjectPropertyAddress)address
41
 {
42
-   DLog(@"Stream unhandled isPropertySettableWithAddress for %@",
43
-        [OBSDALObjectStore StringFromPropertySelector:address.mSelector]);
44
    return false;
45
 }
46
 
47
@@ -563,8 +551,6 @@
48
              dataSize:(UInt32)dataSize
49
                  data:(nonnull const void *)data
50
 {
51
-   DLog(@"Stream unhandled setPropertyDataWithAddress for %@",
52
-        [OBSDALObjectStore StringFromPropertySelector:address.mSelector]);
53
 }
54
 
55
 @end
56
obs-studio-26.1.0.tar.xz/plugins/mac-virtualcam/src/obs-plugin/plugin-main.mm -> obs-studio-26.1.1.tar.xz/plugins/mac-virtualcam/src/obs-plugin/plugin-main.mm Changed
45
 
1
@@ -117,6 +117,8 @@
2
 static void *virtualcam_output_create(obs_data_t *settings,
3
                      obs_output_t *output)
4
 {
5
+   UNUSED_PARAMETER(settings);
6
+
7
    outputRef = output;
8
 
9
    blog(LOG_DEBUG, "output_create");
10
@@ -126,12 +128,15 @@
11
 
12
 static void virtualcam_output_destroy(void *data)
13
 {
14
+   UNUSED_PARAMETER(data);
15
    blog(LOG_DEBUG, "output_destroy");
16
    sMachServer = nil;
17
 }
18
 
19
 static bool virtualcam_output_start(void *data)
20
 {
21
+   UNUSED_PARAMETER(data);
22
+
23
    bool hasDalPlugin = check_dal_plugin();
24
 
25
    if (!hasDalPlugin) {
26
@@ -158,6 +163,9 @@
27
 
28
 static void virtualcam_output_stop(void *data, uint64_t ts)
29
 {
30
+   UNUSED_PARAMETER(data);
31
+   UNUSED_PARAMETER(ts);
32
+
33
    blog(LOG_DEBUG, "output_stop");
34
    obs_output_end_data_capture(outputRef);
35
    [sMachServer stop];
36
@@ -165,6 +173,8 @@
37
 
38
 static void virtualcam_output_raw_video(void *data, struct video_data *frame)
39
 {
40
+   UNUSED_PARAMETER(data);
41
+
42
    uint8_t *outData = frame->data[0];
43
    if (frame->linesize[0] != (videoInfo.output_width * 2)) {
44
        blog(LOG_ERROR,
45
obs-studio-26.1.0.tar.xz/plugins/obs-browser/obs-browser-plugin.cpp -> obs-studio-26.1.1.tar.xz/plugins/obs-browser/obs-browser-plugin.cpp Changed
14
 
1
@@ -595,6 +595,12 @@
2
    }
3
    obs_data_release(private_data);
4
 #endif
5
+
6
+#if defined(__APPLE__) && CHROME_VERSION_BUILD < 4183
7
+   // Make sure CEF malloc hijacking happens early in the process
8
+   obs_browser_initialize();
9
+#endif
10
+
11
    return true;
12
 }
13
 
14
obs-studio-26.1.0.tar.xz/plugins/obs-ffmpeg/ffmpeg-mux/ffmpeg-mux.c -> obs-studio-26.1.1.tar.xz/plugins/obs-ffmpeg/ffmpeg-mux/ffmpeg-mux.c Changed
10
 
1
@@ -808,7 +808,7 @@
2
    }
3
 
4
    /* Treat "Invalid data found when processing input" and "Invalid argument" as non-fatal */
5
-   if (ret == AVERROR_INVALIDDATA || ret == EINVAL) {
6
+   if (ret == AVERROR_INVALIDDATA || ret == -EINVAL) {
7
        return true;
8
    }
9
 
10
obs-studio-26.1.0.tar.xz/plugins/rtmp-services/data/package.json -> obs-studio-26.1.1.tar.xz/plugins/rtmp-services/data/package.json Changed
14
 
1
@@ -1,10 +1,10 @@
2
 {
3
    "url": "https://obsproject.com/obs2_update/rtmp-services",
4
-   "version": 161,
5
+   "version": 163,
6
    "files": [
7
        {
8
            "name": "services.json",
9
-           "version": 161
10
+           "version": 163
11
        }
12
    ]
13
 }
14
obs-studio-26.1.0.tar.xz/plugins/rtmp-services/data/services.json -> obs-studio-26.1.1.tar.xz/plugins/rtmp-services/data/services.json Changed
136
 
1
@@ -260,77 +260,56 @@
2
             }
3
         },
4
         {
5
-            "name": "VIMM",
6
+            "name": "Loola.tv",
7
+            "common": false,
8
             "servers": [
9
                 {
10
-                    "name": "Europe: Frankfurt",
11
-                    "url": "rtmp://eu.vimm.tv/live"
12
+                    "name": "US East: Virginia",
13
+                    "url": "rtmp://rtmp.loola.tv/push"
14
                 },
15
                 {
16
-                    "name": "North America: Montreal",
17
-                    "url": "rtmp://us.vimm.tv/live"
18
+                    "name": "EU Central: Germany",
+                    "url": "rtmp://rtmp-eu.loola.tv/push"
+                },
+                {
+                    "name": "South America: Brazil",
+                    "url": "rtmp://rtmp-sa.loola.tv/push"
+                },
+                {
+                    "name": "Asia/Pacific: Singapore",
+                    "url": "rtmp://rtmp-sg.loola.tv/push"
+                },
+                {
+                    "name": "Middle East: Bahrain",
+                    "url": "rtmp://rtmp-me.loola.tv/push"
                 }
             ],
             "recommended": {
                 "keyint": 2,
-                "max video bitrate": 8000,
-                "max audio bitrate": 320,
+                "profile": "high",
+                "max video bitrate": 2500,
+                "max audio bitrate": 160,
+                "bframes": 2,
                 "x264opts": "scenecut=0"
             }
         },
         {
-            "name": "Smashcast",
+            "name": "VIMM",
             "servers": [
                 {
-                    "name": "Default",
-                    "url": "rtmp://live.hitbox.tv/push"
-                },
-                {
-                    "name": "EU-North: Amsterdam, Netherlands",
-                    "url": "rtmp://live.ams.hitbox.tv/push"
-                },
-                {
-                    "name": "EU-West: Paris, France",
-                    "url": "rtmp://live.cdg.hitbox.tv/push"
-                },
-                {
-                    "name": "EU-South: Milan, Italia",
-                    "url": "rtmp://live.mxp.hitbox.tv/push"
-                },
-                {
-                    "name": "Russia: Moscow",
-                    "url": "rtmp://live.dme.hitbox.tv/push"
-                },
-                {
-                    "name": "US-East: New York",
-                    "url": "rtmp://live.jfk.hitbox.tv/push"
-                },
-                {
-                    "name": "US-West: San Francisco",
-                    "url": "rtmp://live.sfo.hitbox.tv/push"
-                },
-                {
-                    "name": "US-West: Los Angeles",
-                    "url": "rtmp://live.lax.hitbox.tv/push"
-                },
-                {
-                    "name": "South America: Sao Paulo, Brazil",
-                    "url": "rtmp://live.gru.hitbox.tv/push"
-                },
-                {
-                    "name": "Asia: Singapore",
-                    "url": "rtmp://live.sin.hitbox.tv/push"
+                    "name": "Europe: Frankfurt",
+                    "url": "rtmp://eu.vimm.tv/live"
                 },
                 {
-                    "name": "Oceania: Sydney, Australia",
-                    "url": "rtmp://live.syd.hitbox.tv/push"
+                    "name": "North America: Montreal",
+                    "url": "rtmp://us.vimm.tv/live"
                 }
             ],
             "recommended": {
                 "keyint": 2,
-                "profile": "high",
-                "max video bitrate": 3500,
-                "max audio bitrate": 320
+                "max video bitrate": 8000,
+                "max audio bitrate": 320,
+                "x264opts": "scenecut=0"
             }
         },
         {
@@ -399,7 +378,7 @@
                 {
                     "name": "US: New York, NY",
                     "url": "rtmp://live-nyc.vaughnsoft.net/live"
-                },         
+                },
                 {
                     "name": "US: Miami, FL",
                     "url": "rtmp://live-mia.vaughnsoft.net/live"
@@ -441,7 +420,7 @@
                 {
                     "name": "US: New York, NY",
                     "url": "rtmp://live-nyc.vaughnsoft.net/live"
-                },         
+                },
                 {
                     "name": "US: Miami, FL",
                     "url": "rtmp://live-mia.vaughnsoft.net/live"
@@ -1779,7 +1758,7 @@
                 "max audio bitrate": 160,
                 "x264opts": "tune=zerolatency"
             }
-   },
+        },
         {
             "name": "Mux",
             "servers": [
obs-studio-26.1.0.tar.xz/plugins/win-dshow/libdshowcapture/dshowcapture.hpp -> obs-studio-26.1.1.tar.xz/plugins/win-dshow/libdshowcapture/dshowcapture.hpp Changed

@@ -31,7 +31,7 @@
 
 #define DSHOWCAPTURE_VERSION_MAJOR 0
 #define DSHOWCAPTURE_VERSION_MINOR 8
-#define DSHOWCAPTURE_VERSION_PATCH 5
+#define DSHOWCAPTURE_VERSION_PATCH 6
 
 #define MAKE_DSHOWCAPTURE_VERSION(major, minor, patch) \
 	((major << 24) | (minor << 16) | (patch))
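For reference, the MAKE_DSHOWCAPTURE_VERSION macro shown above packs the three version components into a single integer (major in bits 24-31, minor in bits 16-23, patch in the low 16 bits), so the bumped patch level only changes the low bits. A minimal standalone sketch of that arithmetic (not part of the patch):

#include <cstdio>

/* Same packing scheme as MAKE_DSHOWCAPTURE_VERSION above. */
static constexpr unsigned make_version(unsigned major, unsigned minor, unsigned patch)
{
	return (major << 24) | (minor << 16) | patch;
}

int main()
{
	/* Version 0.8.6 packs to 0x00080006 (decimal 524294). */
	std::printf("0x%08X\n", make_version(0, 8, 6));
	return 0;
}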
obs-studio-26.1.0.tar.xz/plugins/win-dshow/libdshowcapture/source/dshow-base.cpp -> obs-studio-26.1.1.tar.xz/plugins/win-dshow/libdshowcapture/source/dshow-base.cpp Changed

@@ -780,9 +780,105 @@
 	return hr;
 }
 
+static HRESULT GetFriendlyName(REFCLSID deviceClass, const wchar_t *devPath,
+                  wchar_t *name, int nameSize)
+{
+   /* Sanity checks */
+   if (!devPath)
+       return E_POINTER;
+   if (!name)
+       return E_POINTER;
+
+   /* Create device enumerator */
+   ComPtr<ICreateDevEnum> createDevEnum;
+   HRESULT hr = CoCreateInstance(CLSID_SystemDeviceEnum, NULL,
+                     CLSCTX_INPROC_SERVER, IID_ICreateDevEnum,
+                     (void **)&createDevEnum);
+
+   /* Enumerate filters */
+   ComPtr<IEnumMoniker> enumMoniker;
+   if (SUCCEEDED(hr)) {
+       /* returns S_FALSE if no devices are installed */
+       hr = createDevEnum->CreateClassEnumerator(deviceClass,
+                             &enumMoniker, 0);
+       if (!enumMoniker)
+           hr = E_FAIL;
+   }
+
+   /* Cycle through the enumeration */
+   if (SUCCEEDED(hr)) {
+       ULONG fetched = 0;
+       ComPtr<IMoniker> moniker;
+
+       enumMoniker->Reset();
+
+       while (enumMoniker->Next(1, &moniker, &fetched) == S_OK) {
+
+           /* Get device path from moniker */
+           wchar_t monikerDevPath[512];
+           hr = ReadProperty(moniker, L"DevicePath",
+                     monikerDevPath,
+                     _ARRAYSIZE(monikerDevPath));
+
+           /* Find desired filter */
+           if (wcscmp(devPath, monikerDevPath) == 0) {
+
+               /* Get friendly name */
+               hr = ReadProperty(moniker, L"FriendlyName",
+                         name, nameSize);
+               return hr;
+           }
+       }
+   }
+
+   return E_FAIL;
+}
+
+static bool MatchFriendlyNames(const wchar_t *vidName, const wchar_t *audName)
+{
+   /* Sanity checks */
+   if (!vidName)
+       return false;
+   if (!audName)
+       return false;
+
+   /* Convert strings to lower case */
+   wstring strVidName = vidName;
+   for (wchar_t &c : strVidName)
+       c = (wchar_t)tolower(c);
+   wstring strAudName = audName;
+   for (wchar_t &c : strAudName)
+       c = (wchar_t)tolower(c);
+
+   /* Remove 'video' from friendly name */
+   size_t posVid;
+   wstring searchVid[] = {L"(video) ", L"(video)", L"video ", L"video"};
+   for (int i = 0; i < _ARRAYSIZE(searchVid); i++) {
+       wstring &search = searchVid[i];
+       while ((posVid = strVidName.find(search)) !=
+              std::string::npos) {
+           strVidName.replace(posVid, search.length(), L"");
+       }
+   }
+
+   /* Remove 'audio' from friendly name */
+   size_t posAud;
+   wstring searchAud[] = {L"(audio) ", L"(audio)", L"audio ", L"audio"};
+   for (int i = 0; i < _ARRAYSIZE(searchAud); i++) {
+       wstring &search = searchAud[i];
+       while ((posAud = strAudName.find(search)) !=
+              std::string::npos) {
+           strAudName.replace(posAud, search.length(), L"");
+       }
+   }
+
+   return strVidName == strAudName;
+}
+
 static bool GetDeviceAudioFilterInternal(REFCLSID deviceClass,
                     const wchar_t *vidDevPath,
-                    IBaseFilter **audioCaptureFilter)
+                    IBaseFilter **audioCaptureFilter,
+                    bool matchFilterName = false)
 {
    /* Get video device instance path */
    wchar_t vidDevInstPath[512];
@@ -797,6 +893,15 @@
        return false;
 #endif
 
+   /* Get friendly name */
+   wchar_t vidName[512];
+   if (matchFilterName) {
+       hr = GetFriendlyName(CLSID_VideoInputDeviceCategory, vidDevPath,
+                    vidName, _ARRAYSIZE(vidName));
+       if (FAILED(hr))
+           return false;
+   }
+
    /* Create device enumerator */
    ComPtr<ICreateDevEnum> createDevEnum;
    if (SUCCEEDED(hr))
@@ -823,12 +928,6 @@
 
        while (enumMoniker->Next(1, &moniker, &fetched) == S_OK) {
            bool samePath = false;
-#if 0
-           /* Get friendly name (helpful for debugging) */
-           wchar_t friendlyName[512];
-           ReadProperty(moniker, L"FriendlyName", friendlyName,
-                   _ARRAYSIZE(friendlyName));
-#endif
 
            /* Get device path */
            wchar_t audDevPath[512];
@@ -848,11 +947,29 @@
 
            /* Get audio capture filter */
            if (samePath) {
-               hr = moniker->BindToObject(
-                   0, 0, IID_IBaseFilter,
-                   (void **)audioCaptureFilter);
-               if (SUCCEEDED(hr))
-                   return true;
+               /* Match video and audio filter names */
+               bool isSameFilterName = false;
+               if (matchFilterName) {
+                   wchar_t audName[512];
+                   hr = ReadProperty(moniker,
+                             L"FriendlyName",
+                             audName,
+                             _ARRAYSIZE(audName));
+                   if (SUCCEEDED(hr)) {
+                       isSameFilterName =
+                           MatchFriendlyNames(
+                               vidName,
+                               audName);
+                   }
+               }
+
+               if (!matchFilterName || isSameFilterName) {
+                   hr = moniker->BindToObject(
+                       0, 0, IID_IBaseFilter,
+                       (void **)audioCaptureFilter);
+                   if (SUCCEEDED(hr))
+                       return true;
+               }
            }
        }
    }
@@ -863,9 +980,23 @@
 bool GetDeviceAudioFilter(const wchar_t *vidDevPath,
              IBaseFilter **audioCaptureFilter)
 {
-   /* Search in "Audio capture sources" */
+   /* Search in "Audio capture sources" and match filter name */
    bool success = GetDeviceAudioFilterInternal(
-       CLSID_AudioInputDeviceCategory, vidDevPath, audioCaptureFilter);
+       CLSID_AudioInputDeviceCategory, vidDevPath, audioCaptureFilter,
+       true);
+
+   /* Search in "WDM Streaming Capture Devices" and match filter name */
+   if (!success)
+       success = GetDeviceAudioFilterInternal(KSCATEGORY_CAPTURE,
+                              vidDevPath,
+                              audioCaptureFilter,
+                              true);
+
+   /* Search in "Audio capture sources" */
+   if (!success)
+       success = GetDeviceAudioFilterInternal(
+           CLSID_AudioInputDeviceCategory, vidDevPath,
+           audioCaptureFilter);
 
    /* Search in "WDM Streaming Capture Devices" */
    if (!success)
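The new helpers in the hunk above pair a video capture filter with its audio counterpart by comparing friendly names once case and the "video"/"audio" markers are stripped, and GetDeviceAudioFilter() now tries this name-matched lookup (in both the audio capture and WDM streaming capture categories) before falling back to the previous path-only search. A minimal standalone C++ sketch of that normalization follows; it is not part of the patch, and the device names are hypothetical:

#include <algorithm>
#include <cwctype>
#include <iostream>
#include <string>

/* Sketch of the normalization MatchFriendlyNames() performs:
 * lower-case the name, strip the "video"/"audio" markers, then compare. */
static std::wstring normalize(std::wstring name, const std::wstring (&tokens)[4])
{
	std::transform(name.begin(), name.end(), name.begin(),
		       [](wchar_t c) { return (wchar_t)std::towlower(c); });
	for (const std::wstring &token : tokens) {
		size_t pos;
		while ((pos = name.find(token)) != std::wstring::npos)
			name.erase(pos, token.length());
	}
	return name;
}

int main()
{
	const std::wstring vidTokens[] = {L"(video) ", L"(video)", L"video ", L"video"};
	const std::wstring audTokens[] = {L"(audio) ", L"(audio)", L"audio ", L"audio"};

	/* Hypothetical friendly names of a paired capture device. */
	std::wstring vid = normalize(L"Example Capture HD60 (Video)", vidTokens);
	std::wstring aud = normalize(L"Example Capture HD60 (Audio)", audTokens);

	/* Both reduce to "example capture hd60 ", so they match. */
	std::wcout << (vid == aud ? L"match" : L"no match") << std::endl;
	return 0;
}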
obs-studio-26.1.0.tar.xz/plugins/win-dshow/libdshowcapture/source/output-filter.cpp -> obs-studio-26.1.1.tar.xz/plugins/win-dshow/libdshowcapture/source/output-filter.cpp Changed

@@ -17,11 +17,12 @@
  *  USA
  */
 
-#include <strsafe.h>
 #include "output-filter.hpp"
 #include "dshow-formats.hpp"
 #include "log.hpp"
 
+#include <strsafe.h>
+
 namespace DShow {
 
 #if 0