From ec94f0e2e6793453356692cb2d0ef8acac07c008 Mon Sep 17 00:00:00 2001 From: cheng wang Date: Mon, 10 Jul 2023 19:47:11 +0000 Subject: [PATCH 01/15] Upload New File --- cmake/HexagonToolchain.cmake | 158 +++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 cmake/HexagonToolchain.cmake diff --git a/cmake/HexagonToolchain.cmake b/cmake/HexagonToolchain.cmake new file mode 100644 index 000000000..37e9eca10 --- /dev/null +++ b/cmake/HexagonToolchain.cmake @@ -0,0 +1,158 @@ +# =============================================================================== +# Copyright 2018 Qualcomm Technologies, Inc. All rights reserved. +# Confidential & Proprietary +# =============================================================================== + +if(HEXAGON_TOOLCHAIN_INCLUDED) + return() +endif(HEXAGON_TOOLCHAIN_INCLUDED) +set(HEXAGON_TOOLCHAIN_INCLUDED true) + +set(EIGEN_TEST_HVX ON) + +if (NOT DSP_VERSION) + set(DSP_VERSION v69) +endif() + +set(TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT}) +set(PREBUILT_LIB_DIR hexagon_${TOOLS_VARIANT}_${DSP_VERSION}) + +# Cross Compiling for Hexagon +set(HEXAGON TRUE) +set(CMAKE_SYSTEM_NAME QURT) +set(CMAKE_SYSTEM_PROCESSOR Hexagon) +set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL}) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +set(CUSTOM_RUNELF_PATH "") + +# To fix backward compatibility with EAI addon. 
+if (NOT HEXAGON_SDK_ROOT) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) +endif() + +if (NOT HEXAGON_TOOLS_ROOT) + if (DEFINED ENV{HEXAGON_TOOLS_ROOT}) + set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT}) + endif() + if(NOT HEXAGON_TOOLS_ROOT) + set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT}) + endif() +endif() + +file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) +file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) + +include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake) + +set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT}) +set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib") +set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss) +set(RUN_MAIN_HEXAGON "${HEXAGON_SDK_ROOT}/libs/run_main_on_hexagon/ship/${PREBUILT_LIB_DIR}/run_main_on_hexagon_sim") + +set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + HEXAGON_SDK_ROOT + HEXAGON_TOOLS_ROOT +) + +#QURT SPECIFIC LIBS and Includes +# Linker Flags +# QURT Related includes and linker flags + +set(V_ARCH ${HEXAGON_ARCH}) +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}") +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}") + +message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}") +set(RTOS_DIR ${_QURT_INSTALL_DIR}) +set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0") +include_directories( + ${_QURT_INSTALL_DIR}/include + ${_QURT_INSTALL_DIR}/include/qurt + ${_QURT_INSTALL_DIR}/include/posix + ) + +# Non QURT related includes and linker flags +set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}") + +set(EXE_LD_FLAGS + -m${V_ARCH} + -G0 + -fpic + -Wl,-Bsymbolic + -Wl,-L${TARGET_DIR_NOOS}/G0/pic + -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/ + -Wl,--no-threads -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign + -shared + "-o " + "" + -Wl,--start-group + "" + "" + -Wl,${TARGET_DIR_NOOS}/G0/pic/libc++.a + 
-Wl,${TARGET_DIR_NOOS}/G0/pic/libc++abi.a + -Wl,--end-group + -lc + ) + STRING(REPLACE ";" " " EXE_LD_FLAGS "${EXE_LD_FLAGS}") + +set(HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}" ) +message(DEBUG "Hexagon C Executable Linker Line:${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") +set(HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}") +message(DEBUG "Hexagon CXX Executable Linker Line:${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") + +# System include paths +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs) + +# LLVM toolchain setup +# Compiler paths, options and architecture +set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(HEXAGON_LINKER ${CMAKE_C_COMPILER}) +set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon) +set(HEXAGON_SIM "${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-sim${HEXAGON_TOOLCHAIN_SUFFIX}") +set(DEBUG_FLAGS "-O0 -g") +set(RELEASE_FLAGS "-O2") +set(COMMON_FLAGS "-m${HEXAGON_ARCH} -G0 -Wall -fno-zero-initialized-in-bss -fdata-sections -fpic") + +set(COMMON_FLAGS "${COMMON_FLAGS} -mhvx -mhvx-length=128B") + +set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,") +set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,") + +set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") +set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") +set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") +if(ADD_SYMBOLS) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g ") + set(CMAKE_C_FLAGS_RELEASE 
"${CMAKE_C_FLAGS_RELEASE} -g ") +endif() +set(CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}") +set(CMAKE_ASM_FLAGS_RELEASE "${CMAKE_ASM_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}") + +# Linker Options +set(CMAKE_C_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") +set(CMAKE_C_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") +set(CMAKE_CXX_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") +set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") + +# Run simulator +set(CUSTOM_RUNELF_PATH ${RTOS_DIR}/sdksim_bin/runelf.pbn) + +set(q6ssLine1 "${HEXAGON_ISS_DIR}/qtimer.so --csr_base=0xFC900000 --irq_p=1 --freq=19200000 --cnttid=1\n") +set(q6ssLine2 "${HEXAGON_ISS_DIR}/l2vic.so 32 0xFC910000\n") +set(osamString "${RTOS_DIR}/debugger/lnx64/qurt_model.so\n") +file(WRITE ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine1}) +file(APPEND ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine2}) +file(WRITE ${CMAKE_BINARY_DIR}/osam.cfg ${osamString}) + +set(CMAKE_CROSSCOMPILING_EMULATOR + ${HEXAGON_SIM};-m${SIM_V_ARCH};--simulated_returnval;--usefs;${CMAKE_CURRENT_BINARY_DIR};--cosim_file;${CMAKE_BINARY_DIR}/q6ss.cfg;--l2tcm_base;0xd800;--rtos;${CMAKE_BINARY_DIR}/osam.cfg;${CUSTOM_RUNELF_PATH};--;${RUN_MAIN_HEXAGON};--) + -- GitLab From e6e6ac941c00197a971dfc27747ede610820d16d Mon Sep 17 00:00:00 2001 From: cheng wang Date: Mon, 10 Jul 2023 19:49:22 +0000 Subject: [PATCH 02/15] Update EigenTesting.cmake --- cmake/EigenTesting.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 2022cf001..82b280fe2 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -1,4 +1,3 @@ - macro(ei_add_property prop value) get_property(previous GLOBAL PROPERTY ${prop}) if ((NOT previous) OR (previous STREQUAL "")) @@ -105,7 +104,7 @@ macro(ei_add_test_internal testname 
testname_with_suffix) endif() endif() - add_test(${testname_with_suffix} "${targetname}") + add_test(NAME ${testname_with_suffix} COMMAND "${targetname}") # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) -- GitLab From 9125d76fef12044476411fa3b3e166bcbf813a3a Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 12 Jul 2023 08:39:49 -0700 Subject: [PATCH 03/15] Fix a build with command-line argument in ei_add_test --- cmake/EigenTesting.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 82b280fe2..2b69ca9ac 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -74,7 +74,7 @@ macro(ei_add_test_internal testname testname_with_suffix) # let the user pass flags. if(${ARGC} GREATER 2) - target_compile_options(${targetname} PRIVATE ${ARGV2}) + target_compile_options(${targetname} PRIVATE "SHELL:${ARGV2}") endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) -- GitLab From 4e3bf8d17d15e6aa0ddee78c25ab933d7b2c0051 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 12 Jul 2023 15:36:15 -0700 Subject: [PATCH 04/15] Add HVX configuration. 
--- Eigen/Core | 6 + .../Core/arch/HVX/GeneralBlockPanelKernel.h | 41 ++++ Eigen/src/Core/arch/HVX/PacketMath.h | 185 ++++++++++++++++++ Eigen/src/Core/util/ConfigureVectorization.h | 8 + Eigen/src/Core/util/Constants.h | 3 + 5 files changed, 243 insertions(+) create mode 100644 Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h create mode 100644 Eigen/src/Core/arch/HVX/PacketMath.h diff --git a/Eigen/Core b/Eigen/Core index 1e7e38cb1..93d47890a 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -242,6 +242,8 @@ using std::ptrdiff_t; #include "src/Core/arch/MSA/PacketMath.h" #include "src/Core/arch/MSA/MathFunctions.h" #include "src/Core/arch/MSA/Complex.h" +#elif defined EIGEN_VECTORIZE_HVX + #include "src/Core/arch/HVX/PacketMath.h" #endif #if defined EIGEN_VECTORIZE_GPU @@ -375,6 +377,10 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX512/GemmKernel.h" #endif +#if defined(EIGEN_VECTORIZE_HVX) + #include "src/Core/arch/HVX/GeneralBlockPanelKernel.h" +#endif + #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" #include "src/Core/PartialReduxEvaluator.h" diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..afce45e2b --- /dev/null +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -0,0 +1,41 @@ + +#ifndef EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H +#define EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H + +namespace Eigen { +namespace internal { + +template +class gebp_traits + : public gebp_traits { + public: + typedef Packet32qf AccPacket; + + EIGEN_STRONG_INLINE void initAcc(Packet32qf& p) { p = pzero(p); } + + template + EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, + Packet32qf& c, Packet32f& tmp, + const LaneIdType&) const { + c = pmadd(a, b, c); + } + + template + EIGEN_STRONG_INLINE void madd(const Packet32f& a, + const QuadPacket& b, Packet32qf& c, + Packet32f& tmp, const LaneIdType& lane) const { + madd(a, b.get(lane), c, 
tmp, lane); + } + + EIGEN_STRONG_INLINE void acc(const Packet32qf& c, const Packet32f& alpha, + Packet32f& r) const { + r = pmadd(c, alpha, r); + } +}; + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h new file mode 100644 index 000000000..9225fe0dc --- /dev/null +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -0,0 +1,185 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HVX_H +#define EIGEN_PACKET_MATH_HVX_H + +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#endif + +namespace Eigen { +namespace internal { + +// Hexagon compiler uses same HVX_Vector to represent all HVX vector types. +// Wrap different vector type (float32, int32, etc) to different class with +// explicit constructor and casting back-and-force to HVX_Vector. +template +class HVXPacket { + public: + HVXPacket() = default; + explicit HVXPacket(HVX_Vector v) : m_val(v) {} + explicit operator HVX_Vector() const { return m_val; } + + private: + HVX_Vector m_val = Q6_V_vzero(); +}; + +// Generic operations. 
+template +EIGEN_STRONG_INLINE void ptranspose( + PacketBlock, 4>& kernel) { + // zip 0,2 + HVX_VectorPair transpose_0_2 = Q6_W_vshuff_VVR( + HVX_Vector(kernel.packet[2]), HVX_Vector(kernel.packet[0]), -4); + // zip 1,3 + HVX_VectorPair transpose_1_3 = Q6_W_vshuff_VVR( + HVX_Vector(kernel.packet[3]), HVX_Vector(kernel.packet[1]), -4); + // zip 0,1 + HVX_VectorPair transpose_0_1 = Q6_W_vshuff_VVR( + HEXAGON_HVX_GET_V0(transpose_1_3), HEXAGON_HVX_GET_V0(transpose_0_2), -4); + // zip 2,3 + HVX_VectorPair transpose_2_3 = Q6_W_vshuff_VVR( + HEXAGON_HVX_GET_V1(transpose_1_3), HEXAGON_HVX_GET_V1(transpose_0_2), -4); + + kernel.packet[0] = HVXPacket(HEXAGON_HVX_GET_V0(transpose_0_1)); + kernel.packet[1] = HVXPacket(HEXAGON_HVX_GET_V1(transpose_0_1)); + kernel.packet[2] = HVXPacket(HEXAGON_HVX_GET_V0(transpose_2_3)); + kernel.packet[3] = HVXPacket(HEXAGON_HVX_GET_V1(transpose_2_3)); +} + +#if __HVX_ARCH__ >= 68 + +typedef HVXPacket<0> Packet32f; // float32 +typedef HVXPacket<1> Packet32qf; // qfloat32 + +template <> +struct packet_traits : default_packet_traits { + typedef Packet32f type; + typedef Packet32f half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 32, + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef Packet32f half; + enum { + size = 32, + alignment = Aligned128, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +// float32 operations. 
+template <> +EIGEN_STRONG_INLINE Packet32f pset1(const float& from) { + union { + float f; + int32_t i; + } u = {.f = from}; + return Packet32f(Q6_V_vsplat_R(u.i)); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pload(const float* from) { + return Packet32f(*reinterpret_cast(from)); +} +template <> +EIGEN_STRONG_INLINE Packet32f ploadu(const float* from) { + return Packet32f(*reinterpret_cast(from)); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet32f& from) { + *reinterpret_cast(to) = HVX_Vector(from); +} +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet32f& from) { + *reinterpret_cast(to) = HVX_Vector(from); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet32f& a) { + HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf( + Q6_V_vror_VR(HVX_Vector(a), 4), HVX_Vector(a)); + HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_vror_VR(vsum_4, 8), vsum_4); + HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_vror_VR(vsum_8, 16), vsum_8); + HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_vror_VR(vsum_16, 32), vsum_16); + HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_V_vror_VR(vsum_32, 64), vsum_32); + + union { + float f; + int32_t i; + } u = {.i = Q6_R_vextract_VR(Q6_Vsf_equals_Vqf32(vsum_64), 0)}; + return u.f; +} + +template <> +EIGEN_STRONG_INLINE Packet32f pmul(const Packet32f& a, + const Packet32f& b) { + return Packet32f( + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(HVX_Vector(a), HVX_Vector(b)))); +} + +template <> +EIGEN_STRONG_INLINE Packet32f padd(const Packet32f& a, + const Packet32f& b) { + return Packet32f( + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(HVX_Vector(a), HVX_Vector(b)))); +} + +// qfloat32 operations. 
+template <> +EIGEN_STRONG_INLINE Packet32qf pzero(const Packet32qf&) { + return Packet32qf(Q6_V_vzero()); +} + +template <> +EIGEN_STRONG_INLINE Packet32qf pmul(const Packet32qf& a, + const Packet32qf& b) { + return Packet32qf(Q6_Vqf32_vmpy_Vqf32Vqf32(HVX_Vector(a), HVX_Vector(b))); +} + +template <> +EIGEN_STRONG_INLINE Packet32qf padd(const Packet32qf& a, + const Packet32qf& b) { + return Packet32qf(Q6_Vqf32_vadd_Vqf32Vqf32(HVX_Vector(a), HVX_Vector(b))); +} + +// Mixed float32 and qfloat32 operations. +EIGEN_STRONG_INLINE Packet32qf pmadd(const Packet32f& a, const Packet32f& b, + const Packet32qf& c) { + return Packet32qf(Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_Vqf32_vmpy_VsfVsf(HVX_Vector(a), HVX_Vector(b)), HVX_Vector(c))); +} + +EIGEN_STRONG_INLINE Packet32f pmadd(const Packet32qf& a, const Packet32f& b, + const Packet32f& c) { + return Packet32f(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf( + Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(HVX_Vector(a)), HVX_Vector(b)), + HVX_Vector(c)))); +} + +#endif // __HVX_ARCH__ >= 68 + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_PACKET_MATH_HVX_H diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 643d64003..29e43d591 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -54,6 +54,8 @@ #elif defined(__AVX__) // 32 bytes static alignment is preferred only if really required #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#elif defined __HVX__ && (__HVX_LENGTH__ == 128) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128 #else #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 #endif @@ -417,6 +419,12 @@ #include #endif +#elif defined __HVX__ && (__HVX_LENGTH__ == 128) + +#define EIGEN_VECTORIZE +#define EIGEN_VECTORIZE_HVX +#include + #endif #endif diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 67a697650..04f7af482 100644 --- a/Eigen/src/Core/util/Constants.h +++ 
b/Eigen/src/Core/util/Constants.h @@ -480,6 +480,7 @@ namespace Architecture NEON = 0x4, MSA = 0x5, SVE = 0x6, + HVX = 0x7, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -492,6 +493,8 @@ namespace Architecture Target = SVE #elif defined EIGEN_VECTORIZE_MSA Target = MSA +#elif defined EIGEN_VECTORIZE_HVX + Target = HVX #else Target = Generic #endif -- GitLab From a9b9db8906ec5431d3c244effc7c32d59d24e02c Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 12 Jul 2023 23:16:28 -0700 Subject: [PATCH 05/15] Add missing function to pass buildsmoketests --- .../Core/arch/HVX/GeneralBlockPanelKernel.h | 2 +- Eigen/src/Core/arch/HVX/PacketMath.h | 379 +++++++++++++++--- test/main.h | 5 + 3 files changed, 332 insertions(+), 54 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index afce45e2b..eb1b2e6c2 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -17,7 +17,7 @@ class gebp_traits EIGEN_STRONG_INLINE void madd(const Packet32f& a, const Packet32f& b, - Packet32qf& c, Packet32f& tmp, + Packet32qf& c, Packet32f& /*tmp*/, const LaneIdType&) const { c = pmadd(a, b, c); } diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 9225fe0dc..1c8f408a9 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -24,36 +24,14 @@ template class HVXPacket { public: HVXPacket() = default; - explicit HVXPacket(HVX_Vector v) : m_val(v) {} - explicit operator HVX_Vector() const { return m_val; } + static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); } + HVX_Vector Get() const { return m_val; } private: + explicit HVXPacket(HVX_Vector v) : m_val(v) {} HVX_Vector m_val = Q6_V_vzero(); }; -// Generic operations. 
-template -EIGEN_STRONG_INLINE void ptranspose( - PacketBlock, 4>& kernel) { - // zip 0,2 - HVX_VectorPair transpose_0_2 = Q6_W_vshuff_VVR( - HVX_Vector(kernel.packet[2]), HVX_Vector(kernel.packet[0]), -4); - // zip 1,3 - HVX_VectorPair transpose_1_3 = Q6_W_vshuff_VVR( - HVX_Vector(kernel.packet[3]), HVX_Vector(kernel.packet[1]), -4); - // zip 0,1 - HVX_VectorPair transpose_0_1 = Q6_W_vshuff_VVR( - HEXAGON_HVX_GET_V0(transpose_1_3), HEXAGON_HVX_GET_V0(transpose_0_2), -4); - // zip 2,3 - HVX_VectorPair transpose_2_3 = Q6_W_vshuff_VVR( - HEXAGON_HVX_GET_V1(transpose_1_3), HEXAGON_HVX_GET_V1(transpose_0_2), -4); - - kernel.packet[0] = HVXPacket(HEXAGON_HVX_GET_V0(transpose_0_1)); - kernel.packet[1] = HVXPacket(HEXAGON_HVX_GET_V1(transpose_0_1)); - kernel.packet[2] = HVXPacket(HEXAGON_HVX_GET_V0(transpose_2_3)); - kernel.packet[3] = HVXPacket(HEXAGON_HVX_GET_V1(transpose_2_3)); -} - #if __HVX_ARCH__ >= 68 typedef HVXPacket<0> Packet32f; // float32 @@ -89,32 +67,322 @@ EIGEN_STRONG_INLINE Packet32f pset1(const float& from) { union { float f; int32_t i; - } u = {.f = from}; - return Packet32f(Q6_V_vsplat_R(u.i)); + } u; + u.f = from; + return Packet32f::Create(Q6_V_vsplat_R(u.i)); } template <> EIGEN_STRONG_INLINE Packet32f pload(const float* from) { - return Packet32f(*reinterpret_cast(from)); + return Packet32f::Create(*reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet32f ploadu(const float* from) { - return Packet32f(*reinterpret_cast(from)); + return Packet32f::Create(*reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet32f& from) { - *reinterpret_cast(to) = HVX_Vector(from); + *reinterpret_cast(to) = from.Get(); } template <> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet32f& from) { - *reinterpret_cast(to) = HVX_Vector(from); + *reinterpret_cast(to) = from.Get(); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pmul(const Packet32f& a, + const Packet32f& b) { + return Packet32f::Create( + 
Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()))); +} + +template <> +EIGEN_STRONG_INLINE Packet32f padd(const Packet32f& a, + const Packet32f& b) { + return Packet32f::Create( + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get()))); +} + +template <> +EIGEN_STRONG_INLINE Packet32f psub(const Packet32f& a, + const Packet32f& b) { + return Packet32f::Create( + Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get()))); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) { + return psub(Packet32f::Create(Q6_V_vzero()), a); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) { + HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_one)); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) { + HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) { + HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); } +template <> +EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, + const Packet32f& b) { + // HVX does not support NaN. 
+ HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) { + float vsf[32] __attribute__((aligned(128))); + pstore(vsf, a); + return vsf[0]; +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + // zip 0,2 + HVX_VectorPair transpose_0_2 = Q6_W_vshuff_VVR( + kernel.packet[2].Get(), kernel.packet[0].Get(), -4); + // zip 1,3 + HVX_VectorPair transpose_1_3 = Q6_W_vshuff_VVR( + kernel.packet[3].Get(), kernel.packet[1].Get(), -4); + // zip 0,1 + HVX_VectorPair transpose_0_1 = Q6_W_vshuff_VVR( + HEXAGON_HVX_GET_V0(transpose_1_3), HEXAGON_HVX_GET_V0(transpose_0_2), -4); + // zip 2,3 + HVX_VectorPair transpose_2_3 = Q6_W_vshuff_VVR( + HEXAGON_HVX_GET_V1(transpose_1_3), HEXAGON_HVX_GET_V1(transpose_0_2), -4); + + kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(transpose_0_1)); + kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(transpose_0_1)); + kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(transpose_2_3)); + kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(transpose_2_3)); +} + +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + //Shuffle the 32-bit lanes. 
+ HVX_VectorPair VD1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), + kernel.packet[0].Get(), -4); + HVX_VectorPair VD3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), + kernel.packet[2].Get(), -4); + HVX_VectorPair VD5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), + kernel.packet[4].Get(), -4); + HVX_VectorPair VD7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), + kernel.packet[6].Get(), -4); + HVX_VectorPair VD9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), + kernel.packet[8].Get(), -4); + HVX_VectorPair VD11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), + kernel.packet[10].Get(), -4); + HVX_VectorPair VD13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), + kernel.packet[12].Get(), -4); + HVX_VectorPair VD15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), + kernel.packet[14].Get(), -4); + HVX_VectorPair VD17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), + kernel.packet[16].Get(), -4); + HVX_VectorPair VD19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), + kernel.packet[18].Get(), -4); + HVX_VectorPair VD21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), + kernel.packet[20].Get(), -4); + HVX_VectorPair VD23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), + kernel.packet[22].Get(), -4); + HVX_VectorPair VD25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), + kernel.packet[24].Get(), -4); + HVX_VectorPair VD27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), + kernel.packet[26].Get(), -4); + HVX_VectorPair VD29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), + kernel.packet[28].Get(), -4); + HVX_VectorPair VD31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), + kernel.packet[30].Get(), -4); + + //Shuffle the 64-bit lanes + HVX_VectorPair VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD3_2), + HEXAGON_HVX_GET_V0(VD1_0), -8); + HVX_VectorPair VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD3_2), + HEXAGON_HVX_GET_V1(VD1_0), -8); + HVX_VectorPair VS5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD7_6), + HEXAGON_HVX_GET_V0(VD5_4), -8); + HVX_VectorPair VS7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD7_6), + 
HEXAGON_HVX_GET_V1(VD5_4), -8); + HVX_VectorPair VS9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), + HEXAGON_HVX_GET_V0(VD9_8), -8); + HVX_VectorPair VS11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), + HEXAGON_HVX_GET_V1(VD9_8), -8); + HVX_VectorPair VS13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD15_14), + HEXAGON_HVX_GET_V0(VD13_12), -8); + HVX_VectorPair VS15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD15_14), + HEXAGON_HVX_GET_V1(VD13_12), -8); + HVX_VectorPair VS17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD19_18), + HEXAGON_HVX_GET_V0(VD17_16), -8); + HVX_VectorPair VS19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD19_18), + HEXAGON_HVX_GET_V1(VD17_16), -8); + HVX_VectorPair VS21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD23_22), + HEXAGON_HVX_GET_V0(VD21_20), -8); + HVX_VectorPair VS23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD23_22), + HEXAGON_HVX_GET_V1(VD21_20), -8); + HVX_VectorPair VS25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD27_26), + HEXAGON_HVX_GET_V0(VD25_24), -8); + HVX_VectorPair VS27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD27_26), + HEXAGON_HVX_GET_V1(VD25_24), -8); + HVX_VectorPair VS29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD31_30), + HEXAGON_HVX_GET_V0(VD29_28), -8); + HVX_VectorPair VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), + HEXAGON_HVX_GET_V1(VD29_28), -8); + + //Shuffle the 128-bit lanes + VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS5_4), + HEXAGON_HVX_GET_V0(VS1_0), -16); + VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS5_4), + HEXAGON_HVX_GET_V1(VS1_0), -16); + VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS7_6), + HEXAGON_HVX_GET_V0(VS3_2), -16); + VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS7_6), + HEXAGON_HVX_GET_V1(VS3_2), -16); + VD9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS13_12), + HEXAGON_HVX_GET_V0(VS9_8), -16); + VD11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS13_12), + HEXAGON_HVX_GET_V1(VS9_8), -16); + VD13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS15_14), + HEXAGON_HVX_GET_V0(VS11_10), -16); + VD15_14 = 
Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS15_14), + HEXAGON_HVX_GET_V1(VS11_10), -16); + VD17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS21_20), + HEXAGON_HVX_GET_V0(VS17_16), -16); + VD19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS21_20), + HEXAGON_HVX_GET_V1(VS17_16), -16); + VD21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS23_22), + HEXAGON_HVX_GET_V0(VS19_18), -16); + VD23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS23_22), + HEXAGON_HVX_GET_V1(VS19_18), -16); + VD25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS29_28), + HEXAGON_HVX_GET_V0(VS25_24), -16); + VD27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS29_28), + HEXAGON_HVX_GET_V1(VS25_24), -16); + VD29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS31_30), + HEXAGON_HVX_GET_V0(VS27_26), -16); + VD31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS31_30), + HEXAGON_HVX_GET_V1(VS27_26), -16); + + //Shuffle the 256-bit lanes + VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD9_8), + HEXAGON_HVX_GET_V0(VD1_0), -32); + VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD9_8), + HEXAGON_HVX_GET_V1(VD1_0), -32); + VS5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), + HEXAGON_HVX_GET_V0(VD3_2), -32); + VS7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), + HEXAGON_HVX_GET_V1(VD3_2), -32); + VS9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD13_12), + HEXAGON_HVX_GET_V0(VD5_4), -32); + VS11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD13_12), + HEXAGON_HVX_GET_V1(VD5_4), -32); + VS13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD15_14), + HEXAGON_HVX_GET_V0(VD7_6), -32); + VS15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD15_14), + HEXAGON_HVX_GET_V1(VD7_6), -32); + VS17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD25_24), + HEXAGON_HVX_GET_V0(VD17_16), -32); + VS19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD25_24), + HEXAGON_HVX_GET_V1(VD17_16), -32); + VS21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD27_26), + HEXAGON_HVX_GET_V0(VD19_18), -32); + VS23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD27_26), + HEXAGON_HVX_GET_V1(VD19_18), -32); + VS25_24 = 
Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD29_28), + HEXAGON_HVX_GET_V0(VD21_20), -32); + VS27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD29_28), + HEXAGON_HVX_GET_V1(VD21_20), -32); + VS29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD31_30), + HEXAGON_HVX_GET_V0(VD23_22), -32); + VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), + HEXAGON_HVX_GET_V1(VD23_22), -32); + + //Shuffle the 512-bit lanes + VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS17_16), + HEXAGON_HVX_GET_V0(VS1_0), -64); + VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS17_16), + HEXAGON_HVX_GET_V1(VS1_0), -64); + VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS19_18), + HEXAGON_HVX_GET_V0(VS3_2), -64); + VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS19_18), + HEXAGON_HVX_GET_V1(VS3_2), -64); + VD9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS21_20), + HEXAGON_HVX_GET_V0(VS5_4), -64); + VD11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS21_20), + HEXAGON_HVX_GET_V1(VS5_4), -64); + VD13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS23_22), + HEXAGON_HVX_GET_V0(VS7_6), -64); + VD15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS23_22), + HEXAGON_HVX_GET_V1(VS7_6), -64); + VD17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS25_24), + HEXAGON_HVX_GET_V0(VS9_8), -64); + VD19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS25_24), + HEXAGON_HVX_GET_V1(VS9_8), -64); + VD21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS27_26), + HEXAGON_HVX_GET_V0(VS11_10), -64); + VD23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS27_26), + HEXAGON_HVX_GET_V1(VS11_10), -64); + VD25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS29_28), + HEXAGON_HVX_GET_V0(VS13_12), -64); + VD27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS29_28), + HEXAGON_HVX_GET_V1(VS13_12), -64); + VD29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS31_30), + HEXAGON_HVX_GET_V0(VS15_14), -64); + VD31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS31_30), + HEXAGON_HVX_GET_V1(VS15_14), -64); + + kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD1_0)); + kernel.packet[1] = 
Packet32f::Create(HEXAGON_HVX_GET_V1(VD1_0)); + kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD3_2)); + kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD3_2)); + kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD5_4)); + kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD5_4)); + kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD7_6)); + kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD7_6)); + kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD9_8)); + kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD9_8)); + kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD11_10)); + kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD11_10)); + kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD13_12)); + kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD13_12)); + kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD15_14)); + kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD15_14)); + kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD17_16)); + kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD17_16)); + kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD19_18)); + kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD19_18)); + kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD21_20)); + kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD21_20)); + kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD23_22)); + kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD23_22)); + kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD25_24)); + kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD25_24)); + kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD27_26)); + kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD27_26)); + kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD29_28)); + kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD29_28)); + 
kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD31_30)); + kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD31_30)); +} + + template <> EIGEN_STRONG_INLINE float predux(const Packet32f& a) { HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf( - Q6_V_vror_VR(HVX_Vector(a), 4), HVX_Vector(a)); + Q6_V_vror_VR(a.Get(), 4), a.Get()); HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32( Q6_V_vror_VR(vsum_4, 8), vsum_4); HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32( @@ -123,58 +391,63 @@ EIGEN_STRONG_INLINE float predux(const Packet32f& a) { Q6_V_vror_VR(vsum_16, 32), vsum_16); HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32( Q6_V_vror_VR(vsum_32, 64), vsum_32); + return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64))); +} - union { - float f; - int32_t i; - } u = {.i = Q6_R_vextract_VR(Q6_Vsf_equals_Vqf32(vsum_64), 0)}; - return u.f; +template <> +EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) { + return pset1(*from); } template <> -EIGEN_STRONG_INLINE Packet32f pmul(const Packet32f& a, - const Packet32f& b) { - return Packet32f( - Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(HVX_Vector(a), HVX_Vector(b)))); +EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) { + return Packet32f::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get())); } template <> -EIGEN_STRONG_INLINE Packet32f padd(const Packet32f& a, - const Packet32f& b) { - return Packet32f( - Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(HVX_Vector(a), HVX_Vector(b)))); +EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) { + return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get())); +} + +static const float index_vsf[32] __attribute__((aligned(128))) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +}; + +template <> +EIGEN_STRONG_INLINE Packet32f plset(const float& a) { + return padd(pload(index_vsf), pset1(a)); } // qfloat32 operations. 
template <> EIGEN_STRONG_INLINE Packet32qf pzero(const Packet32qf&) { - return Packet32qf(Q6_V_vzero()); + return Packet32qf::Create(Q6_V_vzero()); } template <> EIGEN_STRONG_INLINE Packet32qf pmul(const Packet32qf& a, const Packet32qf& b) { - return Packet32qf(Q6_Vqf32_vmpy_Vqf32Vqf32(HVX_Vector(a), HVX_Vector(b))); + return Packet32qf::Create(Q6_Vqf32_vmpy_Vqf32Vqf32(a.Get(), b.Get())); } template <> EIGEN_STRONG_INLINE Packet32qf padd(const Packet32qf& a, const Packet32qf& b) { - return Packet32qf(Q6_Vqf32_vadd_Vqf32Vqf32(HVX_Vector(a), HVX_Vector(b))); + return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32(a.Get(), b.Get())); } // Mixed float32 and qfloat32 operations. EIGEN_STRONG_INLINE Packet32qf pmadd(const Packet32f& a, const Packet32f& b, const Packet32qf& c) { - return Packet32qf(Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_Vqf32_vmpy_VsfVsf(HVX_Vector(a), HVX_Vector(b)), HVX_Vector(c))); + return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32( + Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get())); } EIGEN_STRONG_INLINE Packet32f pmadd(const Packet32qf& a, const Packet32f& b, const Packet32f& c) { - return Packet32f(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf( - Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(HVX_Vector(a)), HVX_Vector(b)), - HVX_Vector(c)))); + return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf( + Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get()))); } #endif // __HVX_ARCH__ >= 68 diff --git a/test/main.h b/test/main.h index 99149ca4f..82566ecde 100644 --- a/test/main.h +++ b/test/main.h @@ -86,6 +86,11 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #endif +#if defined __HVX__ && (__HVX_LENGTH__ == 128) +// Need to prevent conflict FORBIDDEN_IDENTIFIER B0. 
+#include +#endif + // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a -- GitLab From 5078c46b6d504731eb533588419d8b4aa32290c9 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 09:46:48 -0700 Subject: [PATCH 06/15] Add more packet math operations. --- .../Core/arch/HVX/GeneralBlockPanelKernel.h | 19 +- Eigen/src/Core/arch/HVX/PacketMath.h | 291 +++++++++++------- 2 files changed, 199 insertions(+), 111 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index eb1b2e6c2..afc9edd8a 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -1,10 +1,15 @@ +#ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H +#define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H -#ifndef EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H -#define EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H +#if defined __HVX__ && (__HVX_LENGTH__ == 128) + +#include "hexagon_types.h" namespace Eigen { namespace internal { +#if __HVX_ARCH__ >= 68 + template class gebp_traits @@ -19,7 +24,7 @@ class gebp_traits @@ -31,11 +36,15 @@ class gebp_traits= 68 + } // end namespace internal } // end namespace Eigen -#endif // EIGEN_CORE_ARCH_HVX_GENERAL_BLOCK_KERNEL_H +#endif // __HVX__ && (__HVX_LENGTH__ == 128) + +#endif // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 1c8f408a9..4af00af4b 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -1,14 +1,10 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. 
-// -// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_HVX_H -#define EIGEN_PACKET_MATH_HVX_H + +#ifndef EIGEN_HVX_PACKET_MATH_H +#define EIGEN_HVX_PACKET_MATH_H + +#if defined __HVX__ && (__HVX_LENGTH__ == 128) + +#include "hexagon_types.h" #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -17,6 +13,10 @@ namespace Eigen { namespace internal { +// These defines borrows from qhmath internal header, qhmath_hvx_convert.h +#define vmem(A) *((HVX_Vector*)(A)) +#define vmemu(A) *((HVX_UVector*)(A)) + // Hexagon compiler uses same HVX_Vector to represent all HVX vector types. // Wrap different vector type (float32, int32, etc) to different class with // explicit constructor and casting back-and-force to HVX_Vector. 
@@ -45,6 +45,7 @@ struct packet_traits : default_packet_traits { Vectorizable = 1, AlignedOnScalar = 1, size = 32, + HasSign = 0, }; }; @@ -74,20 +75,20 @@ EIGEN_STRONG_INLINE Packet32f pset1(const float& from) { template <> EIGEN_STRONG_INLINE Packet32f pload(const float* from) { - return Packet32f::Create(*reinterpret_cast(from)); + return Packet32f::Create(vmem(from)); } template <> EIGEN_STRONG_INLINE Packet32f ploadu(const float* from) { - return Packet32f::Create(*reinterpret_cast(from)); + return Packet32f::Create(vmemu(from)); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet32f& from) { - *reinterpret_cast(to) = from.Get(); + vmem(to) = from.Get(); } template <> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet32f& from) { - *reinterpret_cast(to) = from.Get(); + vmemu(to) = from.Get(); } template <> @@ -118,32 +119,37 @@ EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) { template <> EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) { - HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_one)); + return Packet32f::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); } template <> EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) { - HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } template <> EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) { - HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); - 
return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } template <> EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) { - // HVX does not support NaN. - HVX_Vector v_one = Q6_V_vsplat_R(0x3f800000); // +1 IEEE vsf + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); - return Packet32f::Create(Q6_V_vmux_QVV(pred, v_one, Q6_V_vzero())); + return Packet32f::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) { + HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), Q6_V_vzero()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, a.Get(), pnegate(a).Get())); } template <> @@ -155,11 +161,11 @@ EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) { EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { // zip 0,2 - HVX_VectorPair transpose_0_2 = Q6_W_vshuff_VVR( - kernel.packet[2].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair transpose_0_2 = + Q6_W_vshuff_VVR(kernel.packet[2].Get(), kernel.packet[0].Get(), -4); // zip 1,3 - HVX_VectorPair transpose_1_3 = Q6_W_vshuff_VVR( - kernel.packet[3].Get(), kernel.packet[1].Get(), -4); + HVX_VectorPair transpose_1_3 = + Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[1].Get(), -4); // zip 0,1 HVX_VectorPair transpose_0_1 = Q6_W_vshuff_VVR( HEXAGON_HVX_GET_V0(transpose_1_3), HEXAGON_HVX_GET_V0(transpose_0_2), -4); @@ -174,49 +180,49 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { } EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - //Shuffle the 32-bit lanes. 
- HVX_VectorPair VD1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), - kernel.packet[0].Get(), -4); - HVX_VectorPair VD3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), - kernel.packet[2].Get(), -4); - HVX_VectorPair VD5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), - kernel.packet[4].Get(), -4); - HVX_VectorPair VD7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), - kernel.packet[6].Get(), -4); - HVX_VectorPair VD9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), - kernel.packet[8].Get(), -4); - HVX_VectorPair VD11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), - kernel.packet[10].Get(), -4); - HVX_VectorPair VD13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), - kernel.packet[12].Get(), -4); - HVX_VectorPair VD15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), - kernel.packet[14].Get(), -4); - HVX_VectorPair VD17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), - kernel.packet[16].Get(), -4); - HVX_VectorPair VD19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), - kernel.packet[18].Get(), -4); - HVX_VectorPair VD21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), - kernel.packet[20].Get(), -4); - HVX_VectorPair VD23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), - kernel.packet[22].Get(), -4); - HVX_VectorPair VD25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), - kernel.packet[24].Get(), -4); - HVX_VectorPair VD27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), - kernel.packet[26].Get(), -4); - HVX_VectorPair VD29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), - kernel.packet[28].Get(), -4); - HVX_VectorPair VD31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), - kernel.packet[30].Get(), -4); - - //Shuffle the 64-bit lanes - HVX_VectorPair VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD3_2), - HEXAGON_HVX_GET_V0(VD1_0), -8); - HVX_VectorPair VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD3_2), - HEXAGON_HVX_GET_V1(VD1_0), -8); - HVX_VectorPair VS5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD7_6), - HEXAGON_HVX_GET_V0(VD5_4), -8); - HVX_VectorPair VS7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD7_6), - 
HEXAGON_HVX_GET_V1(VD5_4), -8); + // Shuffle the 32-bit lanes. + HVX_VectorPair VD1_0 = + Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair VD3_2 = + Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + HVX_VectorPair VD5_4 = + Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4); + HVX_VectorPair VD7_6 = + Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4); + HVX_VectorPair VD9_8 = + Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4); + HVX_VectorPair VD11_10 = + Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4); + HVX_VectorPair VD13_12 = + Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4); + HVX_VectorPair VD15_14 = + Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4); + HVX_VectorPair VD17_16 = + Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4); + HVX_VectorPair VD19_18 = + Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4); + HVX_VectorPair VD21_20 = + Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4); + HVX_VectorPair VD23_22 = + Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4); + HVX_VectorPair VD25_24 = + Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4); + HVX_VectorPair VD27_26 = + Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4); + HVX_VectorPair VD29_28 = + Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4); + HVX_VectorPair VD31_30 = + Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4); + + // Shuffle the 64-bit lanes + HVX_VectorPair VS1_0 = + Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD3_2), HEXAGON_HVX_GET_V0(VD1_0), -8); + HVX_VectorPair VS3_2 = + Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD3_2), HEXAGON_HVX_GET_V1(VD1_0), -8); + HVX_VectorPair VS5_4 = + Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD7_6), HEXAGON_HVX_GET_V0(VD5_4), -8); + HVX_VectorPair 
VS7_6 = + Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD7_6), HEXAGON_HVX_GET_V1(VD5_4), -8); HVX_VectorPair VS9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), HEXAGON_HVX_GET_V0(VD9_8), -8); HVX_VectorPair VS11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), @@ -242,15 +248,15 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { HVX_VectorPair VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), HEXAGON_HVX_GET_V1(VD29_28), -8); - //Shuffle the 128-bit lanes - VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS5_4), - HEXAGON_HVX_GET_V0(VS1_0), -16); - VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS5_4), - HEXAGON_HVX_GET_V1(VS1_0), -16); - VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS7_6), - HEXAGON_HVX_GET_V0(VS3_2), -16); - VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS7_6), - HEXAGON_HVX_GET_V1(VS3_2), -16); + // Shuffle the 128-bit lanes + VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS5_4), HEXAGON_HVX_GET_V0(VS1_0), + -16); + VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS5_4), HEXAGON_HVX_GET_V1(VS1_0), + -16); + VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS7_6), HEXAGON_HVX_GET_V0(VS3_2), + -16); + VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS7_6), HEXAGON_HVX_GET_V1(VS3_2), + -16); VD9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS13_12), HEXAGON_HVX_GET_V0(VS9_8), -16); VD11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS13_12), @@ -276,11 +282,11 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { VD31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS31_30), HEXAGON_HVX_GET_V1(VS27_26), -16); - //Shuffle the 256-bit lanes - VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD9_8), - HEXAGON_HVX_GET_V0(VD1_0), -32); - VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD9_8), - HEXAGON_HVX_GET_V1(VD1_0), -32); + // Shuffle the 256-bit lanes + VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD9_8), HEXAGON_HVX_GET_V0(VD1_0), + -32); + VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD9_8), HEXAGON_HVX_GET_V1(VD1_0), + -32); VS5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), 
HEXAGON_HVX_GET_V0(VD3_2), -32); VS7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), @@ -310,7 +316,7 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), HEXAGON_HVX_GET_V1(VD23_22), -32); - //Shuffle the 512-bit lanes + // Shuffle the 512-bit lanes VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS17_16), HEXAGON_HVX_GET_V0(VS1_0), -64); VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS17_16), @@ -378,25 +384,39 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD31_30)); } - template <> EIGEN_STRONG_INLINE float predux(const Packet32f& a) { - HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf( - Q6_V_vror_VR(a.Get(), 4), a.Get()); - HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_vror_VR(vsum_4, 8), vsum_4); - HVX_Vector vsum_16 = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_vror_VR(vsum_8, 16), vsum_8); - HVX_Vector vsum_32 = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_vror_VR(vsum_16, 32), vsum_16); - HVX_Vector vsum_64 = Q6_Vqf32_vadd_Vqf32Vqf32( - Q6_V_vror_VR(vsum_32, 64), vsum_32); + HVX_Vector vsum_4 = Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), 4), a.Get()); + HVX_Vector vsum_8 = Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_4, 8), vsum_4); + HVX_Vector vsum_16 = + Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_8, 16), vsum_8); + HVX_Vector vsum_32 = + Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_16, 32), vsum_16); + HVX_Vector vsum_64 = + Q6_Vqf32_vadd_Vqf32Vqf32(Q6_V_vror_VR(vsum_32, 64), vsum_32); return pfirst(Packet32f::Create(Q6_Vsf_equals_Vqf32(vsum_64))); } template <> EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) { - return pset1(*from); + HVX_Vector load = vmemu(from); + HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); + return Packet32f::Create(HEXAGON_HVX_GET_V0(dup)); +} + +template <> +EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) { + HVX_Vector load = vmemu(from); + HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); + 
HVX_VectorPair quad = + Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8); + return Packet32f::Create(HEXAGON_HVX_GET_V0(quad)); +} + +template <> +EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) { + HVX_Vector delta = Q6_Vb_vsplat_R(0x7c); + return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta)); } template <> @@ -409,10 +429,65 @@ EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) { return Packet32f::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get())); } +template <> +EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) { + return Packet32f::Create(a.Get() & b.Get()); +} + +template <> +EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) { + return Packet32f::Create(a.Get() | b.Get()); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) { + return Packet32f::Create(a.Get() ^ b.Get()); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) { + return Packet32f::Create(~a.Get()); +} + +template <> +EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, + const Packet32f& b) { + HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero()); + return Packet32f::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get())); +} + +template +EIGEN_STRONG_INLINE float predux_generic(const Packet32f& a, Op op) { + Packet32f vredux_4 = op(Packet32f::Create(Q6_V_vror_VR(a.Get(), 4)), a); + Packet32f vredux_8 = + op(Packet32f::Create(Q6_V_vror_VR(vredux_4.Get(), 8)), vredux_4); + Packet32f vredux_16 = + op(Packet32f::Create(Q6_V_vror_VR(vredux_8.Get(), 16)), vredux_8); + Packet32f vredux_32 = + op(Packet32f::Create(Q6_V_vror_VR(vredux_16.Get(), 32)), vredux_16); + Packet32f vredux_64 = + op(Packet32f::Create(Q6_V_vror_VR(vredux_32.Get(), 64)), vredux_32); + return pfirst(vredux_64); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) { + return predux_generic(a, pmax); +} + +template 
<> +EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) { + return predux_generic(a, pmin); +} + +template <> +EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) { + return predux_generic(a, por) != 0.0f; +} + static const float index_vsf[32] __attribute__((aligned(128))) = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 -}; + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; template <> EIGEN_STRONG_INLINE Packet32f plset(const float& a) { @@ -438,14 +513,16 @@ EIGEN_STRONG_INLINE Packet32qf padd(const Packet32qf& a, } // Mixed float32 and qfloat32 operations. -EIGEN_STRONG_INLINE Packet32qf pmadd(const Packet32f& a, const Packet32f& b, - const Packet32qf& c) { +EIGEN_STRONG_INLINE Packet32qf pmadd_f32_to_qf32(const Packet32f& a, + const Packet32f& b, + const Packet32qf& c) { return Packet32qf::Create(Q6_Vqf32_vadd_Vqf32Vqf32( Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()), c.Get())); } -EIGEN_STRONG_INLINE Packet32f pmadd(const Packet32qf& a, const Packet32f& b, - const Packet32f& c) { +EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, + const Packet32f& b, + const Packet32f& c) { return Packet32f::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf( Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get()))); } @@ -455,4 +532,6 @@ EIGEN_STRONG_INLINE Packet32f pmadd(const Packet32qf& a, const Packet32f& b, } // end namespace internal } // end namespace Eigen -#endif // EIGEN_PACKET_MATH_HVX_H +#endif // __HVX__ && (__HVX_LENGTH__ == 128) + +#endif // EIGEN_HVX_PACKET_MATH_H -- GitLab From df211706ae41f8072993473fe02feaeb5f095410 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 09:48:52 -0700 Subject: [PATCH 07/15] Remove HexagonToolchain.cmake --- cmake/HexagonToolchain.cmake | 158 ----------------------------------- 1 file changed, 158 deletions(-) delete mode 100644 
cmake/HexagonToolchain.cmake diff --git a/cmake/HexagonToolchain.cmake b/cmake/HexagonToolchain.cmake deleted file mode 100644 index 37e9eca10..000000000 --- a/cmake/HexagonToolchain.cmake +++ /dev/null @@ -1,158 +0,0 @@ -# =============================================================================== -# Copyright 2018 Qualcomm Technologies, Inc. All rights reserved. -# Confidential & Proprietary -# =============================================================================== - -if(HEXAGON_TOOLCHAIN_INCLUDED) - return() -endif(HEXAGON_TOOLCHAIN_INCLUDED) -set(HEXAGON_TOOLCHAIN_INCLUDED true) - -set(EIGEN_TEST_HVX ON) - -if (NOT DSP_VERSION) - set(DSP_VERSION v69) -endif() - -set(TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT}) -set(PREBUILT_LIB_DIR hexagon_${TOOLS_VARIANT}_${DSP_VERSION}) - -# Cross Compiling for Hexagon -set(HEXAGON TRUE) -set(CMAKE_SYSTEM_NAME QURT) -set(CMAKE_SYSTEM_PROCESSOR Hexagon) -set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL}) -set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) -set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) -set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) -set(CUSTOM_RUNELF_PATH "") - -# To fix backward compatibility with EAI addon. 
-if (NOT HEXAGON_SDK_ROOT) - set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) -endif() - -if (NOT HEXAGON_TOOLS_ROOT) - if (DEFINED ENV{HEXAGON_TOOLS_ROOT}) - set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT}) - endif() - if(NOT HEXAGON_TOOLS_ROOT) - set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT}) - endif() -endif() - -file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) -file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) - -include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake) - -set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT}) -set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib") -set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss) -set(RUN_MAIN_HEXAGON "${HEXAGON_SDK_ROOT}/libs/run_main_on_hexagon/ship/${PREBUILT_LIB_DIR}/run_main_on_hexagon_sim") - -set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES - HEXAGON_SDK_ROOT - HEXAGON_TOOLS_ROOT -) - -#QURT SPECIFIC LIBS and Includes -# Linker Flags -# QURT Related includes and linker flags - -set(V_ARCH ${HEXAGON_ARCH}) -set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}") -set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}") - -message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}") -set(RTOS_DIR ${_QURT_INSTALL_DIR}) -set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0") -include_directories( - ${_QURT_INSTALL_DIR}/include - ${_QURT_INSTALL_DIR}/include/qurt - ${_QURT_INSTALL_DIR}/include/posix - ) - -# Non QURT related includes and linker flags -set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}") - -set(EXE_LD_FLAGS - -m${V_ARCH} - -G0 - -fpic - -Wl,-Bsymbolic - -Wl,-L${TARGET_DIR_NOOS}/G0/pic - -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/ - -Wl,--no-threads -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign - -shared - "-o " - "" - -Wl,--start-group - "" - "" - -Wl,${TARGET_DIR_NOOS}/G0/pic/libc++.a - 
-Wl,${TARGET_DIR_NOOS}/G0/pic/libc++abi.a - -Wl,--end-group - -lc - ) - STRING(REPLACE ";" " " EXE_LD_FLAGS "${EXE_LD_FLAGS}") - -set(HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}" ) -message(DEBUG "Hexagon C Executable Linker Line:${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") -set(HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}") -message(DEBUG "Hexagon CXX Executable Linker Line:${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") - -# System include paths -include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) -include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) -include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs) - -# LLVM toolchain setup -# Compiler paths, options and architecture -set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX}) -set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) -set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX}) -set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) -set(HEXAGON_LINKER ${CMAKE_C_COMPILER}) -set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon) -set(HEXAGON_SIM "${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-sim${HEXAGON_TOOLCHAIN_SUFFIX}") -set(DEBUG_FLAGS "-O0 -g") -set(RELEASE_FLAGS "-O2") -set(COMMON_FLAGS "-m${HEXAGON_ARCH} -G0 -Wall -fno-zero-initialized-in-bss -fdata-sections -fpic") - -set(COMMON_FLAGS "${COMMON_FLAGS} -mhvx -mhvx-length=128B") - -set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,") -set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,") - -set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") -set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") -set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") -set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") -if(ADD_SYMBOLS) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g ") - set(CMAKE_C_FLAGS_RELEASE 
"${CMAKE_C_FLAGS_RELEASE} -g ") -endif() -set(CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}") -set(CMAKE_ASM_FLAGS_RELEASE "${CMAKE_ASM_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}") - -# Linker Options -set(CMAKE_C_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") -set(CMAKE_C_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") -set(CMAKE_CXX_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") -set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") - -# Run simulator -set(CUSTOM_RUNELF_PATH ${RTOS_DIR}/sdksim_bin/runelf.pbn) - -set(q6ssLine1 "${HEXAGON_ISS_DIR}/qtimer.so --csr_base=0xFC900000 --irq_p=1 --freq=19200000 --cnttid=1\n") -set(q6ssLine2 "${HEXAGON_ISS_DIR}/l2vic.so 32 0xFC910000\n") -set(osamString "${RTOS_DIR}/debugger/lnx64/qurt_model.so\n") -file(WRITE ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine1}) -file(APPEND ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine2}) -file(WRITE ${CMAKE_BINARY_DIR}/osam.cfg ${osamString}) - -set(CMAKE_CROSSCOMPILING_EMULATOR - ${HEXAGON_SIM};-m${SIM_V_ARCH};--simulated_returnval;--usefs;${CMAKE_CURRENT_BINARY_DIR};--cosim_file;${CMAKE_BINARY_DIR}/q6ss.cfg;--l2tcm_base;0xd800;--rtos;${CMAKE_BINARY_DIR}/osam.cfg;${CUSTOM_RUNELF_PATH};--;${RUN_MAIN_HEXAGON};--) - -- GitLab From 77b70f4832884b79eb96b62fe61a58aed0d4f104 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 10:39:21 -0700 Subject: [PATCH 08/15] cleanup --- cmake/EigenTesting.cmake | 5 +++-- test/main.h | 5 ----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 2b69ca9ac..2022cf001 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -1,3 +1,4 @@ + macro(ei_add_property prop value) get_property(previous GLOBAL PROPERTY ${prop}) if ((NOT previous) OR (previous STREQUAL "")) @@ -74,7 +75,7 @@ macro(ei_add_test_internal testname 
testname_with_suffix) # let the user pass flags. if(${ARGC} GREATER 2) - target_compile_options(${targetname} PRIVATE "SHELL:${ARGV2}") + target_compile_options(${targetname} PRIVATE ${ARGV2}) endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) @@ -104,7 +105,7 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() endif() - add_test(NAME ${testname_with_suffix} COMMAND "${targetname}") + add_test(${testname_with_suffix} "${targetname}") # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) diff --git a/test/main.h b/test/main.h index 82566ecde..99149ca4f 100644 --- a/test/main.h +++ b/test/main.h @@ -86,11 +86,6 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #endif -#if defined __HVX__ && (__HVX_LENGTH__ == 128) -// Need to prevent conflict FORBIDDEN_IDENTIFIER B0. -#include -#endif - // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a -- GitLab From 048f5d694b90c4d4e3c14dca9ff4e3189155ee45 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 10:40:52 -0700 Subject: [PATCH 09/15] cleanup --- Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h | 2 -- Eigen/src/Core/arch/HVX/PacketMath.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index afc9edd8a..da54cf1ae 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -3,8 +3,6 @@ #if defined __HVX__ && (__HVX_LENGTH__ == 128) -#include "hexagon_types.h" - namespace Eigen { namespace internal { diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 4af00af4b..220e380d3 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ 
b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -4,8 +4,6 @@ #if defined __HVX__ && (__HVX_LENGTH__ == 128) -#include "hexagon_types.h" - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -- GitLab From 2c7f5055ba9aa48d4656217e6d9a96ff4824e231 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 12:34:27 -0700 Subject: [PATCH 10/15] cleanup. --- Eigen/src/Core/arch/HVX/PacketMath.h | 1 - 1 file changed, 1 deletion(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 220e380d3..d555447af 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -43,7 +43,6 @@ struct packet_traits : default_packet_traits { Vectorizable = 1, AlignedOnScalar = 1, size = 32, - HasSign = 0, }; }; -- GitLab From 6d5ab120652b4bf228076f02e772bd580031fda2 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 12:40:12 -0700 Subject: [PATCH 11/15] Add comments for floating-point support from V68. --- Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h | 1 + Eigen/src/Core/arch/HVX/PacketMath.h | 1 + 2 files changed, 2 insertions(+) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index da54cf1ae..5cfc2cdb8 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -6,6 +6,7 @@ namespace Eigen { namespace internal { +// Floating-point operations are only supported since V68. #if __HVX_ARCH__ >= 68 template diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index d555447af..0f31deebe 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -30,6 +30,7 @@ class HVXPacket { HVX_Vector m_val = Q6_V_vzero(); }; +// Floating-point operations are supported only since V68. 
#if __HVX_ARCH__ >= 68 typedef HVXPacket<0> Packet32f; // float32 -- GitLab From 9c97c1370eeb4a615e0a0ec7624182bb560f8930 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 14:23:26 -0700 Subject: [PATCH 12/15] Revise based on review: 1. guard whole file with single HVX define 2. change vmem macros to inline functions. --- .../Core/arch/HVX/GeneralBlockPanelKernel.h | 11 ++--- Eigen/src/Core/arch/HVX/PacketMath.h | 41 ++++++++++++------- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index 5cfc2cdb8..deb1825da 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -1,14 +1,15 @@ #ifndef EIGEN_HVX_GENERAL_BLOCK_KERNEL_H #define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H +// Only support 128B HVX now. #if defined __HVX__ && (__HVX_LENGTH__ == 128) -namespace Eigen { -namespace internal { - // Floating-point operations are only supported since V68. #if __HVX_ARCH__ >= 68 +namespace Eigen { +namespace internal { + template class gebp_traits @@ -39,11 +40,11 @@ class gebp_traits= 68 - } // end namespace internal } // end namespace Eigen +#endif // __HVX_ARCH__ >= 68 + #endif // __HVX__ && (__HVX_LENGTH__ == 128) #endif // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 0f31deebe..694332362 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -2,8 +2,12 @@ #ifndef EIGEN_HVX_PACKET_MATH_H #define EIGEN_HVX_PACKET_MATH_H +// Only support 128B HVX now. #if defined __HVX__ && (__HVX_LENGTH__ == 128) +// Floating-point operations are supported only since V68. 
+#if __HVX_ARCH__ >= 68 + #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif @@ -11,9 +15,21 @@ namespace Eigen { namespace internal { -// These defines borrows from qhmath internal header, qhmath_hvx_convert.h -#define vmem(A) *((HVX_Vector*)(A)) -#define vmemu(A) *((HVX_UVector*)(A)) +EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void *mem) { + return *((HVX_Vector*)mem); +} + +EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void *mem) { + return *((HVX_UVector*)mem); +} + +EIGEN_STRONG_INLINE void HVX_store(void *mem, HVX_Vector v) { + *((HVX_Vector*)mem) = v; +} + +EIGEN_STRONG_INLINE void HVX_storeu(void *mem, HVX_Vector v) { + *((HVX_UVector*)mem) = v; +} // Hexagon compiler uses same HVX_Vector to represent all HVX vector types. // Wrap different vector type (float32, int32, etc) to different class with @@ -30,9 +46,6 @@ class HVXPacket { HVX_Vector m_val = Q6_V_vzero(); }; -// Floating-point operations are supported only since V68. 
-#if __HVX_ARCH__ >= 68 - typedef HVXPacket<0> Packet32f; // float32 typedef HVXPacket<1> Packet32qf; // qfloat32 @@ -73,20 +86,20 @@ EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) { template <> EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) { - return Packet32f::Create(vmem(from)); + return Packet32f::Create(HVX_load(from)); } template <> EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) { - return Packet32f::Create(vmemu(from)); + return Packet32f::Create(HVX_loadu(from)); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet32f& from) { - vmem(to) = from.Get(); + HVX_store(to, from.Get()); } template <> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet32f& from) { - vmemu(to) = from.Get(); + HVX_storeu(to, from.Get()); } template <> @@ -397,14 +410,14 @@ EIGEN_STRONG_INLINE float predux(const Packet32f& a) { template <> EIGEN_STRONG_INLINE Packet32f ploaddup<Packet32f>(const float* from) { - HVX_Vector load = vmemu(from); + HVX_Vector load = HVX_loadu(from); HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); return Packet32f::Create(HEXAGON_HVX_GET_V0(dup)); } template <> EIGEN_STRONG_INLINE Packet32f ploadquad<Packet32f>(const float* from) { - HVX_Vector load = vmemu(from); + HVX_Vector load = HVX_loadu(from); HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8); @@ -525,11 +538,11 @@ EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, Q6_Vqf32_vmpy_VsfVsf(Q6_Vsf_equals_Vqf32(a.Get()), b.Get()), c.Get()))); } -#endif // __HVX_ARCH__ >= 68 - } // end namespace internal } // end namespace Eigen +#endif // __HVX_ARCH__ >= 68 + #endif // __HVX__ && (__HVX_LENGTH__ == 128) #endif // EIGEN_HVX_PACKET_MATH_H -- GitLab From 44449a84538be5c7380478e3952a95bc77cefe9d Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Wed, 26 Jul 2023 14:29:42 -0700 Subject: [PATCH 13/15] Update file. 
--- Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h | 8 ++------ Eigen/src/Core/arch/HVX/PacketMath.h | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h index deb1825da..51f37fa41 100644 --- a/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/HVX/GeneralBlockPanelKernel.h @@ -2,10 +2,8 @@ #define EIGEN_HVX_GENERAL_BLOCK_KERNEL_H // Only support 128B HVX now. -#if defined __HVX__ && (__HVX_LENGTH__ == 128) - // Floating-point operations are only supported since V68. -#if __HVX_ARCH__ >= 68 +#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 namespace Eigen { namespace internal { @@ -43,8 +41,6 @@ class gebp_traits= 68 - -#endif // __HVX__ && (__HVX_LENGTH__ == 128) +#endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 #endif // EIGEN_HVX_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 694332362..3d0e8bc57 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -3,10 +3,8 @@ #define EIGEN_HVX_PACKET_MATH_H // Only support 128B HVX now. -#if defined __HVX__ && (__HVX_LENGTH__ == 128) - // Floating-point operations are supported only since V68. 
-#if __HVX_ARCH__ >= 68 +#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -541,8 +539,6 @@ EIGEN_STRONG_INLINE Packet32f pmadd_qf32_to_f32(const Packet32qf& a, } // end namespace internal } // end namespace Eigen -#endif // __HVX_ARCH__ >= 68 - -#endif // __HVX__ && (__HVX_LENGTH__ == 128) +#endif // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 #endif // EIGEN_HVX_PACKET_MATH_H -- GitLab From 7af0dd2003346a52e2ccdb51d2155e8d6cf04a97 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Thu, 27 Jul 2023 15:02:34 -0700 Subject: [PATCH 14/15] Add a comment for IEEE floating-point standard. --- Eigen/src/Core/arch/HVX/PacketMath.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 3d0e8bc57..d693757a3 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -6,6 +6,11 @@ // Floating-point operations are supported only since V68. #if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68 +// All the floating-point operations do not support IEEE standard. +// From HVX document: +// There is no concept of infinity or NaN. QFloat saturates to maximum +// exponent with maximum positive or minimum negative significand. + #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -- GitLab From 49b54a1bd432d2c6ae817a6a5d8b178e41508dc3 Mon Sep 17 00:00:00 2001 From: Cheng Wang Date: Fri, 28 Jul 2023 08:57:35 -0700 Subject: [PATCH 15/15] Some cleanup on transpose implementation. 
--- Eigen/src/Core/arch/HVX/PacketMath.h | 409 +++++++++++++-------------- 1 file changed, 204 insertions(+), 205 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index d693757a3..cc8722fdd 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -18,19 +18,19 @@ namespace Eigen { namespace internal { -EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void *mem) { +EIGEN_STRONG_INLINE HVX_Vector HVX_load(const void* mem) { return *((HVX_Vector*)mem); } -EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void *mem) { +EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const void* mem) { return *((HVX_UVector*)mem); } -EIGEN_STRONG_INLINE void HVX_store(void *mem, HVX_Vector v) { +EIGEN_STRONG_INLINE void HVX_store(void* mem, HVX_Vector v) { *((HVX_Vector*)mem) = v; } -EIGEN_STRONG_INLINE void HVX_storeu(void *mem, HVX_Vector v) { +EIGEN_STRONG_INLINE void HVX_storeu(void* mem, HVX_Vector v) { *((HVX_UVector*)mem) = v; } @@ -174,228 +174,227 @@ EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) { } EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) { - // zip 0,2 - HVX_VectorPair transpose_0_2 = - Q6_W_vshuff_VVR(kernel.packet[2].Get(), kernel.packet[0].Get(), -4); - // zip 1,3 - HVX_VectorPair transpose_1_3 = - Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[1].Get(), -4); - // zip 0,1 - HVX_VectorPair transpose_0_1 = Q6_W_vshuff_VVR( - HEXAGON_HVX_GET_V0(transpose_1_3), HEXAGON_HVX_GET_V0(transpose_0_2), -4); - // zip 2,3 - HVX_VectorPair transpose_2_3 = Q6_W_vshuff_VVR( - HEXAGON_HVX_GET_V1(transpose_1_3), HEXAGON_HVX_GET_V1(transpose_0_2), -4); - - kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(transpose_0_1)); - kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(transpose_0_1)); - kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(transpose_2_3)); - kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(transpose_2_3)); + // Shuffle the 32-bit lanes. 
+ HVX_VectorPair v_0_1_0 = + Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); + HVX_VectorPair v_0_3_2 = + Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); + + // Shuffle the 64-bit lanes. + HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), + HEXAGON_HVX_GET_V0(v_0_1_0), -8); + HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), + HEXAGON_HVX_GET_V1(v_0_1_0), -8); + + kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0)); + kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0)); + kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2)); + kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2)); } EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) { // Shuffle the 32-bit lanes. - HVX_VectorPair VD1_0 = + HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4); - HVX_VectorPair VD3_2 = + HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4); - HVX_VectorPair VD5_4 = + HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4); - HVX_VectorPair VD7_6 = + HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4); - HVX_VectorPair VD9_8 = + HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4); - HVX_VectorPair VD11_10 = + HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4); - HVX_VectorPair VD13_12 = + HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4); - HVX_VectorPair VD15_14 = + HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4); - HVX_VectorPair VD17_16 = + HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4); - HVX_VectorPair VD19_18 = + HVX_VectorPair v_0_19_18 = 
Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4); - HVX_VectorPair VD21_20 = + HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4); - HVX_VectorPair VD23_22 = + HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4); - HVX_VectorPair VD25_24 = + HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4); - HVX_VectorPair VD27_26 = + HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4); - HVX_VectorPair VD29_28 = + HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4); - HVX_VectorPair VD31_30 = + HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4); - // Shuffle the 64-bit lanes - HVX_VectorPair VS1_0 = - Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD3_2), HEXAGON_HVX_GET_V0(VD1_0), -8); - HVX_VectorPair VS3_2 = - Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD3_2), HEXAGON_HVX_GET_V1(VD1_0), -8); - HVX_VectorPair VS5_4 = - Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD7_6), HEXAGON_HVX_GET_V0(VD5_4), -8); - HVX_VectorPair VS7_6 = - Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD7_6), HEXAGON_HVX_GET_V1(VD5_4), -8); - HVX_VectorPair VS9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), - HEXAGON_HVX_GET_V0(VD9_8), -8); - HVX_VectorPair VS11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), - HEXAGON_HVX_GET_V1(VD9_8), -8); - HVX_VectorPair VS13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD15_14), - HEXAGON_HVX_GET_V0(VD13_12), -8); - HVX_VectorPair VS15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD15_14), - HEXAGON_HVX_GET_V1(VD13_12), -8); - HVX_VectorPair VS17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD19_18), - HEXAGON_HVX_GET_V0(VD17_16), -8); - HVX_VectorPair VS19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD19_18), - HEXAGON_HVX_GET_V1(VD17_16), -8); - HVX_VectorPair VS21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD23_22), - 
HEXAGON_HVX_GET_V0(VD21_20), -8); - HVX_VectorPair VS23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD23_22), - HEXAGON_HVX_GET_V1(VD21_20), -8); - HVX_VectorPair VS25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD27_26), - HEXAGON_HVX_GET_V0(VD25_24), -8); - HVX_VectorPair VS27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD27_26), - HEXAGON_HVX_GET_V1(VD25_24), -8); - HVX_VectorPair VS29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD31_30), - HEXAGON_HVX_GET_V0(VD29_28), -8); - HVX_VectorPair VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), - HEXAGON_HVX_GET_V1(VD29_28), -8); - - // Shuffle the 128-bit lanes - VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS5_4), HEXAGON_HVX_GET_V0(VS1_0), - -16); - VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS5_4), HEXAGON_HVX_GET_V1(VS1_0), - -16); - VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS7_6), HEXAGON_HVX_GET_V0(VS3_2), - -16); - VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS7_6), HEXAGON_HVX_GET_V1(VS3_2), - -16); - VD9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS13_12), - HEXAGON_HVX_GET_V0(VS9_8), -16); - VD11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS13_12), - HEXAGON_HVX_GET_V1(VS9_8), -16); - VD13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS15_14), - HEXAGON_HVX_GET_V0(VS11_10), -16); - VD15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS15_14), - HEXAGON_HVX_GET_V1(VS11_10), -16); - VD17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS21_20), - HEXAGON_HVX_GET_V0(VS17_16), -16); - VD19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS21_20), - HEXAGON_HVX_GET_V1(VS17_16), -16); - VD21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS23_22), - HEXAGON_HVX_GET_V0(VS19_18), -16); - VD23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS23_22), - HEXAGON_HVX_GET_V1(VS19_18), -16); - VD25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS29_28), - HEXAGON_HVX_GET_V0(VS25_24), -16); - VD27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS29_28), - HEXAGON_HVX_GET_V1(VS25_24), -16); - VD29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS31_30), - HEXAGON_HVX_GET_V0(VS27_26), -16); - VD31_30 = 
Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS31_30), - HEXAGON_HVX_GET_V1(VS27_26), -16); - - // Shuffle the 256-bit lanes - VS1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD9_8), HEXAGON_HVX_GET_V0(VD1_0), - -32); - VS3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD9_8), HEXAGON_HVX_GET_V1(VD1_0), - -32); - VS5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD11_10), - HEXAGON_HVX_GET_V0(VD3_2), -32); - VS7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD11_10), - HEXAGON_HVX_GET_V1(VD3_2), -32); - VS9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD13_12), - HEXAGON_HVX_GET_V0(VD5_4), -32); - VS11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD13_12), - HEXAGON_HVX_GET_V1(VD5_4), -32); - VS13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD15_14), - HEXAGON_HVX_GET_V0(VD7_6), -32); - VS15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD15_14), - HEXAGON_HVX_GET_V1(VD7_6), -32); - VS17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD25_24), - HEXAGON_HVX_GET_V0(VD17_16), -32); - VS19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD25_24), - HEXAGON_HVX_GET_V1(VD17_16), -32); - VS21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD27_26), - HEXAGON_HVX_GET_V0(VD19_18), -32); - VS23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD27_26), - HEXAGON_HVX_GET_V1(VD19_18), -32); - VS25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD29_28), - HEXAGON_HVX_GET_V0(VD21_20), -32); - VS27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD29_28), - HEXAGON_HVX_GET_V1(VD21_20), -32); - VS29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VD31_30), - HEXAGON_HVX_GET_V0(VD23_22), -32); - VS31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VD31_30), - HEXAGON_HVX_GET_V1(VD23_22), -32); - - // Shuffle the 512-bit lanes - VD1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS17_16), - HEXAGON_HVX_GET_V0(VS1_0), -64); - VD3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS17_16), - HEXAGON_HVX_GET_V1(VS1_0), -64); - VD5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS19_18), - HEXAGON_HVX_GET_V0(VS3_2), -64); - VD7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS19_18), - HEXAGON_HVX_GET_V1(VS3_2), -64); - VD9_8 = 
Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS21_20), - HEXAGON_HVX_GET_V0(VS5_4), -64); - VD11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS21_20), - HEXAGON_HVX_GET_V1(VS5_4), -64); - VD13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS23_22), - HEXAGON_HVX_GET_V0(VS7_6), -64); - VD15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS23_22), - HEXAGON_HVX_GET_V1(VS7_6), -64); - VD17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS25_24), - HEXAGON_HVX_GET_V0(VS9_8), -64); - VD19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS25_24), - HEXAGON_HVX_GET_V1(VS9_8), -64); - VD21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS27_26), - HEXAGON_HVX_GET_V0(VS11_10), -64); - VD23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS27_26), - HEXAGON_HVX_GET_V1(VS11_10), -64); - VD25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS29_28), - HEXAGON_HVX_GET_V0(VS13_12), -64); - VD27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS29_28), - HEXAGON_HVX_GET_V1(VS13_12), -64); - VD29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(VS31_30), - HEXAGON_HVX_GET_V0(VS15_14), -64); - VD31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(VS31_30), - HEXAGON_HVX_GET_V1(VS15_14), -64); - - kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD1_0)); - kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD1_0)); - kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD3_2)); - kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD3_2)); - kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD5_4)); - kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD5_4)); - kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD7_6)); - kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD7_6)); - kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD9_8)); - kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD9_8)); - kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD11_10)); - kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD11_10)); - kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD13_12)); - 
kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD13_12)); - kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD15_14)); - kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD15_14)); - kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD17_16)); - kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD17_16)); - kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD19_18)); - kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD19_18)); - kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD21_20)); - kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD21_20)); - kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD23_22)); - kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD23_22)); - kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD25_24)); - kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD25_24)); - kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD27_26)); - kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD27_26)); - kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD29_28)); - kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD29_28)); - kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(VD31_30)); - kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(VD31_30)); + // Shuffle the 64-bit lanes. 
+ HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), + HEXAGON_HVX_GET_V0(v_0_1_0), -8); + HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), + HEXAGON_HVX_GET_V1(v_0_1_0), -8); + HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), + HEXAGON_HVX_GET_V0(v_0_5_4), -8); + HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), + HEXAGON_HVX_GET_V1(v_0_5_4), -8); + HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), + HEXAGON_HVX_GET_V0(v_0_9_8), -8); + HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), + HEXAGON_HVX_GET_V1(v_0_9_8), -8); + HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), + HEXAGON_HVX_GET_V0(v_0_13_12), -8); + HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), + HEXAGON_HVX_GET_V1(v_0_13_12), -8); + HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), + HEXAGON_HVX_GET_V0(v_0_17_16), -8); + HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), + HEXAGON_HVX_GET_V1(v_0_17_16), -8); + HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), + HEXAGON_HVX_GET_V0(v_0_21_20), -8); + HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), + HEXAGON_HVX_GET_V1(v_0_21_20), -8); + HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), + HEXAGON_HVX_GET_V0(v_0_25_24), -8); + HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), + HEXAGON_HVX_GET_V1(v_0_25_24), -8); + HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), + HEXAGON_HVX_GET_V0(v_0_29_28), -8); + HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), + HEXAGON_HVX_GET_V1(v_0_29_28), -8); + + // Shuffle the 128-bit lanes. 
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), + HEXAGON_HVX_GET_V0(v_1_1_0), -16); + v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), + HEXAGON_HVX_GET_V1(v_1_1_0), -16); + v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), + HEXAGON_HVX_GET_V0(v_1_3_2), -16); + v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), + HEXAGON_HVX_GET_V1(v_1_3_2), -16); + v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), + HEXAGON_HVX_GET_V0(v_1_9_8), -16); + v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), + HEXAGON_HVX_GET_V1(v_1_9_8), -16); + v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), + HEXAGON_HVX_GET_V0(v_1_11_10), -16); + v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), + HEXAGON_HVX_GET_V1(v_1_11_10), -16); + v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), + HEXAGON_HVX_GET_V0(v_1_17_16), -16); + v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), + HEXAGON_HVX_GET_V1(v_1_17_16), -16); + v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), + HEXAGON_HVX_GET_V0(v_1_19_18), -16); + v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), + HEXAGON_HVX_GET_V1(v_1_19_18), -16); + v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), + HEXAGON_HVX_GET_V0(v_1_25_24), -16); + v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), + HEXAGON_HVX_GET_V1(v_1_25_24), -16); + v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), + HEXAGON_HVX_GET_V0(v_1_27_26), -16); + v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), + HEXAGON_HVX_GET_V1(v_1_27_26), -16); + + // Shuffle the 256-bit lanes. 
+ v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), + HEXAGON_HVX_GET_V0(v_0_1_0), -32); + v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), + HEXAGON_HVX_GET_V1(v_0_1_0), -32); + v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), + HEXAGON_HVX_GET_V0(v_0_3_2), -32); + v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), + HEXAGON_HVX_GET_V1(v_0_3_2), -32); + v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), + HEXAGON_HVX_GET_V0(v_0_5_4), -32); + v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), + HEXAGON_HVX_GET_V1(v_0_5_4), -32); + v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), + HEXAGON_HVX_GET_V0(v_0_7_6), -32); + v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), + HEXAGON_HVX_GET_V1(v_0_7_6), -32); + v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), + HEXAGON_HVX_GET_V0(v_0_17_16), -32); + v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), + HEXAGON_HVX_GET_V1(v_0_17_16), -32); + v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), + HEXAGON_HVX_GET_V0(v_0_19_18), -32); + v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), + HEXAGON_HVX_GET_V1(v_0_19_18), -32); + v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), + HEXAGON_HVX_GET_V0(v_0_21_20), -32); + v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), + HEXAGON_HVX_GET_V1(v_0_21_20), -32); + v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), + HEXAGON_HVX_GET_V0(v_0_23_22), -32); + v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), + HEXAGON_HVX_GET_V1(v_0_23_22), -32); + + // Shuffle the 512-bit lanes. 
+ v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), + HEXAGON_HVX_GET_V0(v_1_1_0), -64); + v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), + HEXAGON_HVX_GET_V1(v_1_1_0), -64); + v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), + HEXAGON_HVX_GET_V0(v_1_3_2), -64); + v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), + HEXAGON_HVX_GET_V1(v_1_3_2), -64); + v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), + HEXAGON_HVX_GET_V0(v_1_5_4), -64); + v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), + HEXAGON_HVX_GET_V1(v_1_5_4), -64); + v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), + HEXAGON_HVX_GET_V0(v_1_7_6), -64); + v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), + HEXAGON_HVX_GET_V1(v_1_7_6), -64); + v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), + HEXAGON_HVX_GET_V0(v_1_9_8), -64); + v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), + HEXAGON_HVX_GET_V1(v_1_9_8), -64); + v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), + HEXAGON_HVX_GET_V0(v_1_11_10), -64); + v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), + HEXAGON_HVX_GET_V1(v_1_11_10), -64); + v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), + HEXAGON_HVX_GET_V0(v_1_13_12), -64); + v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), + HEXAGON_HVX_GET_V1(v_1_13_12), -64); + v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), + HEXAGON_HVX_GET_V0(v_1_15_14), -64); + v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), + HEXAGON_HVX_GET_V1(v_1_15_14), -64); + + kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0)); + kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0)); + kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2)); + kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2)); + kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4)); + kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4)); + 
kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6)); + kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6)); + kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8)); + kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8)); + kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10)); + kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10)); + kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12)); + kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12)); + kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14)); + kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14)); + kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16)); + kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16)); + kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18)); + kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18)); + kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20)); + kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20)); + kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22)); + kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22)); + kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24)); + kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24)); + kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26)); + kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26)); + kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28)); + kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28)); + kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30)); + kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30)); } template <> -- GitLab