From 89f0fcbe156347e994ec1f168064c8129e4abb73 Mon Sep 17 00:00:00 2001
From: "kseniya.zaytseva"
Date: Fri, 19 Apr 2024 09:58:55 +0300
Subject: [PATCH 1/7] Add riscv rvv1.0 support

---
 Eigen/Core                                   |   4 +
 Eigen/src/Core/arch/RVV10/MathFunctions.h    |  42 ++
 Eigen/src/Core/arch/RVV10/PacketMath.h       | 726 +++++++++++++++++++
 Eigen/src/Core/arch/RVV10/TypeCasting.h      |  52 ++
 Eigen/src/Core/util/ConfigureVectorization.h |  16 +
 Eigen/src/Core/util/Constants.h              |   3 +
 6 files changed, 843 insertions(+)
 create mode 100644 Eigen/src/Core/arch/RVV10/MathFunctions.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath.h
 create mode 100644 Eigen/src/Core/arch/RVV10/TypeCasting.h

diff --git a/Eigen/Core b/Eigen/Core
index 6ae069a92..43a44157e 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -249,6 +249,10 @@ using std::ptrdiff_t;
 #include "src/Core/arch/SVE/PacketMath.h"
 #include "src/Core/arch/SVE/TypeCasting.h"
 #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_RVV10
+#include "src/Core/arch/RVV10/PacketMath.h"
+#include "src/Core/arch/RVV10/TypeCasting.h"
+#include "src/Core/arch/RVV10/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
 #include "src/Core/arch/ZVector/PacketMath.h"
 #include "src/Core/arch/ZVector/MathFunctions.h"
diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h
new file mode 100644
index 000000000..797fca3e2
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_RVV10_H
+#define EIGEN_MATH_FUNCTIONS_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pexp<PacketXf>(const PacketXf& x) {
+  return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plog<PacketXf>(const PacketXf& x) {
+  return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psin<PacketXf>(const PacketXf& x) {
+  return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcos<PacketXf>(const PacketXf& x) {
+  return pcos_float(x);
+}
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_MATH_FUNCTIONS_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
new file mode 100644
index 000000000..973633df6
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -0,0 +1,726 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
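+//
+// With a fixed vector length -- for example, compiling with
+// -mrvv-vector-bits=256 so that EIGEN_RISCV64_RVV_VL == 256 (that value is
+// only an illustration) -- each fixed-size packet type defined below holds
+// EIGEN_RISCV64_RVV_VL / (8 * sizeof(Scalar)) lanes, e.g. 8 floats per
+// PacketXf:
+//
+//   PacketXf v = pset1<PacketXf>(1.0f);  // broadcast 1.0f to all 8 lanes
+//   float s = predux(v);                 // horizontal sum, s == 8.0f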
+ +#ifndef EIGEN_PACKET_MATH_RVV10_H +#define EIGEN_PACKET_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 + +template +struct rvv_packet_size_selector { + enum { size = VectorLength / (sizeof(Scalar) * CHAR_BIT) }; +}; + +template +struct rvv_packet_alignment_selector { + enum { + alignment = VectorLength >= 512 ? Aligned64 : (VectorLength >= 256 ? Aligned32 : Aligned16) + }; +}; + +typedef vbool32_t PacketMask; + +/********************************* int32 **************************************/ +typedef vint32m1_t PacketXi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint32m1_t PacketXu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXi type; + typedef PacketXi half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; // Half not yet implemented + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +template <> +EIGEN_STRONG_INLINE PacketXi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) { + PacketXi idx = __riscv_vid_v_i32m1(packet_traits::size); + return __riscv_vadd_vx_i32m1(idx, a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi padd(const PacketXi& a, const PacketXi& b) { + return __riscv_vadd_vv_i32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) { + return __riscv_vsub(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) { + return __riscv_vneg(a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) { + return __riscv_vmul(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) { + return __riscv_vdiv(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vmadd(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return 
__riscv_vmadd(a, b, pnegate(c), packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) { + return __riscv_vmin(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) { + return __riscv_vmax(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) { + PacketMask mask = __riscv_vmsle_vv_i32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) { + PacketMask mask = __riscv_vmslt_vv_i32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) { + PacketMask mask = __riscv_vmseq_vv_i32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0xffffffffu, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) { + return __riscv_vor_vv_i32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b) { + return __riscv_vxor_vv_i32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, packet_traits::size), packet_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) { + return __riscv_vsra_vx_i32m1(a, N, packet_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { + return __riscv_vreinterpret_i32m1(__riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, packet_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) { + return __riscv_vsll_vx_i32m1(a, N, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, packet_traits::size), 1, packet_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
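+  // vloxei32 takes byte offsets, so with vid = 0 1 2 3 4 5 ... the
+  // expression ((idx & ~1u) << 1) yields 0 0 4 4 8 8 ..., i.e. every int32
+  // element is fetched twice: a0 a0 a1 a1 a2 a2 ...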
+ return __riscv_vloxei32_v_i32m1(from, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, packet_traits::size); + return __riscv_vloxei32_v_i32m1(from, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, packet_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) { + return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), packet_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) { + return __riscv_vmv_x_s_i32m1_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(packet_traits::size), packet_traits::size-1, packet_traits::size); + return __riscv_vrgather_vv_i32m1(a, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) { + PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, packet_traits::size); + return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, packet_traits::size), mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) { + PacketXi vzero = __riscv_vmv_v_x_i32m1(0, packet_traits::size); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, vzero, packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { + // Multiply the vector by its reverse + PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, packet_traits::size); + PacketXi half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, packet_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, packet_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, packet_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, packet_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + + // The reduction is done to the first element. 
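+  // e.g. with 8 lanes: after the reverse multiply, lane i holds a[i]*a[7-i];
+  // the slide-downs by 2 and then by 1 fold the remaining partial products,
+  // leaving the product of all eight elements in lane 0.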
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) { + PacketXi vmax = __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), packet_traits::size); + return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(a, vmax, packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) { + PacketXi vmin = __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), packet_traits::size); + return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(a, vmin, packet_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[packet_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], packet_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle32_v_i32m1(&buffer[i * packet_traits::size], packet_traits::size); + } +} + +/********************************* float32 ************************************/ + +typedef vfloat32m1_t PacketXf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) { + return __riscv_vfmv_v_f_f32m1(0.0f, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) { + return __riscv_vfabs_v_f32m1(a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(packet_traits::size), packet_traits::size); + return __riscv_vfadd_vf_f32m1(idx, a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) { + return __riscv_vfadd_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) { + return __riscv_vfsub_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) { + return 
__riscv_vfneg_v_f32m1(a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmul_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) { + return __riscv_vfdiv_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmadd_vv_f32m1(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmsub_vv_f32m1(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmsub_vv_f32m1(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmadd_vv_f32m1(a, b, c, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), packet_traits::size); + PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, a, packet_traits::size); + PacketMask mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, packet_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, packet_traits::size); + + return __riscv_vfmin_vv_f32m1_tum(mask, nans, a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmin_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), packet_traits::size); + PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, a, packet_traits::size); + PacketMask mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, packet_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, packet_traits::size); + + return __riscv_vfmax_vv_f32m1_tum(mask, nans, a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmax_vv_f32m1(a, b, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) { + PacketMask mask = __riscv_vmfle_vv_f32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) { + PacketMask mask = __riscv_vmflt_vv_f32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) { + PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, b, packet_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) { + PacketMask mask = 
__riscv_vmfge_vv_f32m1_b32(a, b, packet_traits::size); + return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, packet_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size), packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, packet_traits::size), 1, packet_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, packet_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, packet_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), packet_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) { + return __riscv_vfmv_f_s_f32m1_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) { + return __riscv_vfsqrt_v_f32m1(a, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const PacketXf limit = pset1(static_cast(1 << 23)); + const PacketXf abs_a = pabs(a); + PacketXf r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. 
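+  // Any float with magnitude >= 2^23 is already integral; for smaller values,
+  // adding 2^23 pushes the fraction bits out of the mantissa so the FPU
+  // rounds to nearest, and subtracting it back leaves the rounded value.
+  // The barrier below keeps the compiler from cancelling the add/sub pair.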
+ EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { + const PacketXf cst_1 = pset1(1.0f); + PacketXf tmp = print(a); + // If greater, subtract one. + PacketXf mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(packet_traits::size), packet_traits::size-1, packet_traits::size); + return __riscv_vrgather_vv_f32m1(a, idx, packet_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + PacketXf vzero = __riscv_vfmv_v_f_f32m1(0.0, packet_traits::size); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(a, vzero, packet_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + // Multiply the vector by its reverse + PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, packet_traits::size); + PacketXf half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, packet_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, packet_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, packet_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, packet_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); + + // The reduction is done to the first element. 
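+  // (RVV 1.0 provides sum/min/max reductions -- vfredusum, vfredmin,
+  // vfredmax -- but no multiply reduction, hence the manual slide-down tree.)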
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
+  PacketXf vmax = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::max)(), packet_traits<float>::size);
+  return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(a, vmax, packet_traits<float>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
+  PacketXf vmin = __riscv_vfmv_v_f_f32m1(-(std::numeric_limits<float>::max)(), packet_traits<float>::size);
+  return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(a, vmin, packet_traits<float>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
+  float buffer[packet_traits<float>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], packet_traits<float>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = __riscv_vle32_v_f32m1(&buffer[i * packet_traits<float>::size], packet_traits<float>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_PACKET_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h
new file mode 100644
index 000000000..b26bbf4bc
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_RVV10_H
+#define EIGEN_TYPE_CASTING_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return __riscv_vfcvt_f_x_v_f32m1(a, packet_traits<float>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, packet_traits<numext::int32_t>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return __riscv_vreinterpret_v_i32m1_f32m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return __riscv_vreinterpret_v_f32m1_i32m1(a);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_RVV10_H
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 49f307c73..f9ba6c558 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -414,6 +414,22 @@ extern "C" {
 #error "Eigen requires a fixed SVE vector length but EIGEN_ARM64_SVE_VL is not set."
 #endif
+// We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV10 and
+// will not select the backend automatically
+#elif (defined EIGEN_RISCV64_USE_RVV10)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_RVV10
+#include <riscv_vector.h>
+
+// Since we depend on knowing RVV vector lengths at compile-time, we need
+// to ensure a fixed length is set
+#if defined(__riscv_v_fixed_vlen)
+#define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#endif
+
 #elif (defined __s390x__ && defined __VEC__)

 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index fcc2db822..8aba62b75 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -475,6 +475,7 @@ enum Type {
   SVE = 0x6,
   HVX = 0x7,
   LSX = 0x8,
+  RVV10 = 0x9,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -491,6 +492,8 @@ enum Type {
   Target = HVX
 #elif defined EIGEN_VECTORIZE_LSX
   Target = LSX
+#elif defined EIGEN_VECTORIZE_RVV10
+  Target = RVV10
 #else
   Target = Generic
 #endif
--
GitLab

From 40a3101b519c134efd03e899b5a6041918abc588 Mon Sep 17 00:00:00 2001
From: "kseniya.zaytseva"
Date: Tue, 11 Feb 2025 22:38:41 +0300
Subject: [PATCH 2/7] Add double, long, complex(float/double), short, half
 types RVV1.0 support

---
 Eigen/Core                                    |    8 +
 Eigen/src/Core/AssignEvaluator.h              |    5 +
 Eigen/src/Core/CoreEvaluators.h               |    4 +
 Eigen/src/Core/GenericPacketMath.h            |    2 +-
 Eigen/src/Core/ProductEvaluators.h            |    5 +
 Eigen/src/Core/Redux.h                        |    4 +
 Eigen/src/Core/arch/RVV10/Complex.h           |  704 +++
 .../Core/arch/RVV10/GeneralBlockPanelKernel.h |  493 ++
 Eigen/src/Core/arch/RVV10/MathFunctions.h     |   24 +-
 Eigen/src/Core/arch/RVV10/PacketMath.h        | 4803 ++++++++++++++++-
 Eigen/src/Core/arch/RVV10/PacketMathFP16.h    |  879 +++
 Eigen/src/Core/arch/RVV10/TypeCasting.h       |  281 +-
 Eigen/src/Core/functors/UnaryFunctors.h       |    4 +
 Eigen/src/Core/products/GeneralMatrixVector.h |    9 +
 Eigen/src/Core/util/ConfigureVectorization.h  |   28 +-
 Eigen/src/Core/util/Macros.h                  |    7 +
 Eigen/src/Core/util/XprHelper.h               |    9 +-
 Eigen/src/Eigenvalues/Tridiagonalization.h    |    2 +-
 Eigen/src/Jacobi/Jacobi.h                     |   11 +-
 test/packetmath.cpp                           |   13 +
 test/vectorization_logic.cpp                  |    4 +
 21 files changed, 6989 insertions(+), 310 deletions(-)
 create mode 100644 Eigen/src/Core/arch/RVV10/Complex.h
 create mode 100644 Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
 create mode 100644 Eigen/src/Core/arch/RVV10/PacketMathFP16.h

diff --git a/Eigen/Core b/Eigen/Core
index 43a44157e..3a238407f 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -253,6 +253,10 @@ using std::ptrdiff_t;
 #include "src/Core/arch/RVV10/PacketMath.h"
 #include "src/Core/arch/RVV10/TypeCasting.h"
 #include "src/Core/arch/RVV10/MathFunctions.h"
+#include "src/Core/arch/RVV10/Complex.h"
+#if defined EIGEN_VECTORIZE_RVV10FP16
+#include "src/Core/arch/RVV10/PacketMathFP16.h"
+#endif
 #elif defined EIGEN_VECTORIZE_ZVECTOR
 #include "src/Core/arch/ZVector/PacketMath.h"
 #include "src/Core/arch/ZVector/MathFunctions.h"
@@ -400,6 +404,10 @@ using std::ptrdiff_t;
 #include "src/Core/arch/AVX512/GemmKernel.h"
 #endif

+#if defined(EIGEN_VECTORIZE_RVV10)
+#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h"
+#endif
+
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
 #include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 36f0a9d74..093ceb435 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++
b/Eigen/src/Core/AssignEvaluator.h @@ -64,8 +64,13 @@ struct copy_using_evaluator_traits { static constexpr int OuterStride = outer_stride_at_compile_time::ret; // TODO distinguish between linear traversal and inner-traversals +#ifdef EIGEN_RISCV64_USE_RVV10 + using LinearPacketType = typename find_best_packet::type; + using InnerPacketType = typename find_best_packet::type; +#else using LinearPacketType = typename find_best_packet::type; using InnerPacketType = typename find_best_packet::type; +#endif static constexpr int LinearPacketSize = unpacket_traits::size; static constexpr int InnerPacketSize = unpacket_traits::size; diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index e3af2d202..9fa8e4286 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1367,7 +1367,11 @@ struct evaluator> typedef Block XprType; typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename packet_traits::type PacketScalar; +#else typedef typename packet_traits::type PacketScalar; +#endif enum { CoeffReadCost = evaluator::CoeffReadCost, diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d45cb4bf4..e42baf75d 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -105,7 +105,7 @@ struct default_packet_traits { }; }; -template +template struct packet_traits : default_packet_traits { typedef T type; typedef T half; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index ce8d954bf..db820ba6d 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -533,8 +533,13 @@ struct product_evaluator, ProductTag, DenseShape, MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime }; +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename find_best_packet::type LhsVecPacketType; + typedef typename find_best_packet::type RhsVecPacketType; +#else typedef typename find_best_packet::type LhsVecPacketType; typedef typename find_best_packet::type RhsVecPacketType; +#endif enum { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 4e9ab0e4f..841d6349a 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -29,7 +29,11 @@ namespace internal { template struct redux_traits { public: + #ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename find_best_packet::type PacketType; +#else typedef typename find_best_packet::type PacketType; +#endif enum { PacketSize = unpacket_traits::size, InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime : Evaluator::MaxRowsAtCompileTime, diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h new file mode 100644 index 000000000..67e6dae82 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -0,0 +1,704 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
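+//
+// The complex packets below keep the real and imaginary parts de-interleaved
+// in two separate m1 registers; the segment loads/stores
+// (vlseg2e32/vsseg2e32 and their f64 counterparts) do the (de)interleaving
+// at the memory boundary. A complex product then needs no in-register
+// shuffles, since
+//
+//   (r1 + i1*I) * (r2 + i2*I) = (r1*r2 - i1*i2) + (r1*i2 + i1*r2)*I
+//
+// maps directly onto four lane-wise multiplies plus one add and one
+// subtract, exactly as written in pmul below.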
+ +#ifndef EIGEN_COMPLEX_RVV10_H +#define EIGEN_COMPLEX_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +/********************************* float32 ************************************/ + +struct PacketXcf { + EIGEN_STRONG_INLINE PacketXcf() {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) { + } + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) : real(__riscv_vget_v_f32m2_f32m1(a, 0)), + imag(__riscv_vget_v_f32m2_f32m1(a, 1)){} + PacketXf real; + PacketXf imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + typedef PacketXcf type; + typedef PacketXcf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcf half; + typedef PacketMul2Xf as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcf pcast(const PacketMul2Xf& a) { + return PacketXcf(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& a) { + PacketMul2Xf res = __riscv_vundefined_f32m2(); + res = __riscv_vset_v_f32m1_f32m2(res, 0, a.real); + res = __riscv_vset_v_f32m1_f32m2(res, 1, a.imag); + return res; +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { + PacketXf real = pset1(from.real()); + PacketXf imag = pset1(from.imag()); + return PacketXcf(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf padd(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psub(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pnegate(const PacketXcf& a) { + return PacketXcf(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf(a.real, __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), 0x80000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& a, const PacketXcf& b) { + PacketXf v1 = pmul(a.real, b.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmul(a.real, b.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const PacketXcf& b, const PacketXcf& c) { + PacketXf v1 = pmadd(a.real, b.real, c.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmadd(a.real, b.imag, c.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { + PacketXf eq_real = pcmp_eq(a.real, b.real); + PacketXf eq_imag = pcmp_eq(a.imag, b.imag); + PacketXf eq_both = pand(eq_real, eq_imag); + return PacketXcf(eq_both, eq_both); +} + +template <> +EIGEN_STRONG_INLINE 
PacketXcf pand(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pand(a.real, b.real), pand(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf por(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(por(a.real, b.real), por(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) { + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) { + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { + PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { + PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... 
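+  // (in the quad case each byte offset actually repeats four times:
+  //  0 0 0 0 2*sizeof(float) 2*sizeof(float) 2*sizeof(float) 2*sizeof(float)
+  //  ..., so every complex value is replicated across four lanes)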
+ return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, + Index stride) { + vfloat32m1x2_t res = __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); + return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcf>(std::complex* to, const PacketXcf& from, + Index stride) { + vfloat32m1x2_t from_rvv_type = __riscv_vundefined_f32m1x2(); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcf& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + return PacketXcf(preverse(a.real), preverse(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(a.imag, a.real); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { + return std::complex(predux(a.real), predux(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pdiv(const PacketXcf& a, const PacketXcf& b) { + PacketXcf b_conj = pconj(b); + PacketXcf dividend = pmul(a, b_conj); + PacketXf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer_real[unpacket_traits::size * N]; + float buffer_imag[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse32(&buffer_imag[i], N * sizeof(float), kernel.packet[i].imag, unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i].real = __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + + // Computes the principal sqrt of the complex numbers in the input. 
+ // + // For example, for packets containing 2 complex numbers stored in + // [real0, real1, imag0, imag1] format + // a = [a0, a1] = [x0, x1, y0, y1], + // where x0 = real(a0), y0 = imag(a0) etc., this function returns + // b = [b0, b1] = [u0, u1, v0, v1], + // such that b0^2 = a0, b1^2 = a1. + // + // To derive the formula for the complex square roots, let's consider the equation for + // a single complex square root of the number x + i*y. We want to find real numbers + // u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = 0.5 * (y / u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) + // + // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as + // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , + + // In the following, without lack of generality, we have annotated the code, assuming + // that the input is a packet of 2 complex numbers. + // + // Step 1. Compute l = [l0, l1], where + // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), + // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. + + Packet a_abs = Packet(pabs(a.real), pabs(a.imag)); + RealPacket a_max = pmax(a_abs.real, a_abs.imag); + RealPacket a_min = pmin(a_abs.real, a_abs.imag); + + RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); + RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + + const RealPacket cst_one = pset1(RealScalar(1)); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); + // Set l to a_max if a_min is zero. + l = pselect(a_min_zero_mask, a_max, l); + + // Step 2. Compute [rho0, rho1], where + // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) + // We don't care about the imaginary parts computed here. They will be overwritten later. + const RealPacket cst_half = pset1(RealScalar(0.5)); + RealPacket rho = psqrt(pmul(cst_half, padd(a_abs.real, l))); + + // Step 3. Compute [rho0, rho1, eta0, eta1], where + // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2. + // set eta = 0 of input is 0 + i0. + RealPacket eta = pandnot(pmul(cst_half, pdiv(a.imag, rho)), a_max_zero_mask); + // Compute result for inputs with positive real part. + Packet positive_real_result = Packet(rho, eta); + + // Step 4. Compute solution for inputs with negative real part: + // [|eta0| |eta1|, sign(y0)*rho0, sign(y1)*rho1] + const RealPacket cst_imag_sign_mask = pset1(RealScalar(-0.0)); + RealPacket imag_signs = pand(a.imag, cst_imag_sign_mask); + Packet negative_real_result = Packet(pabs(eta), por(positive_real_result.real, imag_signs)); + + // Step 5. Select solution branch based on the sign of the real parts. + RealPacket negative_real_mask_half = pcmp_lt(a.real, pzero(a.real)); + Packet negative_real_mask = Packet(negative_real_mask_half, negative_real_mask_half); + Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result); + + // Step 6. 
Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + RealPacket is_real_inf = pcmp_eq(a_abs.real, cst_pos_inf); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + const Packet cst_one_zero = pset1(Scalar(RealScalar(1.0), RealScalar(0.0))); + Packet real_inf_result = Packet(pmul(a_abs.real, cst_one_zero.real), pmul(a_abs.imag, cst_one_zero.imag)); + real_inf_result = pselect(negative_real_mask, pcplxflip(real_inf_result), real_inf_result); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. + RealPacket is_imag_inf = pcmp_eq(a_abs.imag, cst_pos_inf); + Packet imag_inf_result = Packet(cst_pos_inf, a.imag); + // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan + Packet result_is_nan = pisnan(result); + result = por(result_is_nan, result); + + return pselect(Packet(is_imag_inf,is_imag_inf), imag_inf_result, pselect(Packet(is_real_inf, is_real_inf), real_inf_result, result)); +} + +template +EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + + // log(sqrt(a^2 + b^2)), atan2(b, a) + RealPacket xlogr = plog(psqrt(padd(pmul(x.real,x.real), pmul(x.imag,x.imag)))); + RealPacket ximg = patan2(x.imag, x.real); + + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + RealPacket r_abs = pabs(x.real); + RealPacket i_abs = pabs(x.imag); + RealPacket is_r_pos_inf = pcmp_eq(r_abs, cst_pos_inf); + RealPacket is_i_pos_inf = pcmp_eq(i_abs, cst_pos_inf); + RealPacket is_any_inf = por(is_r_pos_inf, is_i_pos_inf); + RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); + + return Packet(xreal, ximg); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) +{ + return psqrt_complex_rvv(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) +{ + return plog_complex_rvv(a); +} + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { + return PacketXcf(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { + return PacketXcf(Eigen::internal::pmul(pcast(x), y)); + } +}; + +/********************************* double ************************************/ + +struct PacketXcd { + EIGEN_STRONG_INLINE PacketXcd() {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, const PacketXd& _imag) : real(_real), imag(_imag) { + } + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) : real(__riscv_vget_v_f64m2_f64m1(a, 0)), + imag(__riscv_vget_v_f64m2_f64m1(a, 1)){} + PacketXd real; + PacketXd imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + 
typedef PacketXcd type; + typedef PacketXcd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcd half; + typedef PacketMul2Xd as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcd pcast(const PacketMul2Xd& a) { + return PacketXcd(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& a) { + PacketMul2Xd res = __riscv_vundefined_f64m2(); + res = __riscv_vset_v_f64m1_f64m2(res, 0, a.real); + res = __riscv_vset_v_f64m1_f64m2(res, 1, a.imag); + return res; +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { + PacketXd real = pset1(from.real()); + PacketXd imag = pset1(from.imag()); + return PacketXcd(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd padd(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd psub(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { + return PacketXcd(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd(a.real, __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vxor_vx_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& a, const PacketXcd& b) { + PacketXd v1 = pmul(a.real, b.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmul(a.real, b.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketXcd& b, const PacketXcd& c) { + PacketXd v1 = pmadd(a.real, b.real, c.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmadd(a.real, b.imag, c.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { + PacketXd eq_real = pcmp_eq(a.real, b.real); + PacketXd eq_imag = pcmp_eq(a.imag, b.imag); + PacketXd eq_both = pand(eq_real, eq_imag); + return PacketXcd(eq_both, eq_both); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pand(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd por(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* 
from) { + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) { + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { + PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, unpacket_traits::size); + PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { + PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, unpacket_traits::size); + PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, + Index stride) { + vfloat64m1x2_t res = __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); + return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::complex* to, const PacketXcd& from, + Index stride) { + vfloat64m1x2_t from_rvv_type = __riscv_vundefined_f64m1x2(); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcd& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE 
+template <>
+EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) {
+  return PacketXcd(preverse(a.real), preverse(a.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd pcplxflip<PacketXcd>(const PacketXcd& a) {
+  return PacketXcd(a.imag, a.real);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<PacketXcd>(const PacketXcd& a) {
+  return std::complex<double>(predux(a.real), predux(a.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd pdiv<PacketXcd>(const PacketXcd& a, const PacketXcd& b) {
+  PacketXcd b_conj = pconj(b);
+  PacketXcd dividend = pmul(a, b_conj);
+  PacketXd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag));
+  return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXcd, N>& kernel) {
+  double buffer_real[unpacket_traits<PacketXcd>::size * N];
+  double buffer_imag[unpacket_traits<PacketXcd>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits<PacketXcd>::size);
+    __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits<PacketXcd>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i].real = __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits<PacketXcd>::size], unpacket_traits<PacketXcd>::size);
+    kernel.packet[i].imag = __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits<PacketXcd>::size], unpacket_traits<PacketXcd>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd psqrt<PacketXcd>(const PacketXcd& a)
+{
+  return psqrt_complex_rvv(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd plog<PacketXcd>(const PacketXcd& a)
+{
+  return plog_complex_rvv(a);
+}
+
+template <>
+struct conj_helper<PacketMul2Xd, PacketXcd, false, false> {
+  EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const {
+    return padd(c, this->pmul(x, y));
+  }
+  EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const {
+    return PacketXcd(Eigen::internal::pmul(x, pcast<PacketXcd, PacketMul2Xd>(y)));
+  }
+};
+
+template <>
+struct conj_helper<PacketXcd, PacketMul2Xd, false, false> {
+  EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const {
+    return padd(c, this->pmul(x, y));
+  }
+  EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const {
+    return PacketXcd(Eigen::internal::pmul(pcast<PacketXcd, PacketMul2Xd>(x), y));
+  }
+};
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_COMPLEX_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
new file mode 100644
index 000000000..85803aa1f
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h
@@ -0,0 +1,493 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
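+
+// The traits below drive Eigen's generalized block panel (GEBP) product
+// kernel. The recurring RVV idiom in this file is the vector-scalar fused
+// multiply-add: madd() folds the broadcast of a single rhs coefficient into
+// vfmadd.vf instead of splatting it into a vector first. As a rough scalar
+// model of that intrinsic (for orientation only), each call computes, per
+// active lane k:
+//   acc[k] = lhs[k] * rhs_scalar + acc[k];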
+ +#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* real ************************************/ + +template <> +struct gebp_traits + : gebp_traits { + typedef float RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +template <> +struct gebp_traits + : gebp_traits { + typedef double RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#if defined(EIGEN_VECTORIZE_RVV10FP16) + +template <> +struct gebp_traits + : gebp_traits { + typedef half RhsPacket; + typedef PacketXh LhsPacket; + typedef PacketXh AccPacket; + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f16m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = 
__riscv_vfmadd_vf_f16m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#endif + +/********************************* complex ************************************/ + +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type name##Packet##postfix + +#define RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type ScalarPacket + + +template +struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, PacketSize_> + : gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> { + typedef std::complex Scalar; + typedef std::complex LhsScalar; + typedef std::complex RhsScalar; + typedef std::complex ResScalar; + typedef typename packet_traits>::type RealPacket; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(PacketSize_); + #undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR + + enum { + ConjLhs = ConjLhs_, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? unpacket_traits::size : 1, + + nr = 4, + mr = ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef DoublePacket DoublePacketType; + + typedef std::conditional_t LhsPacket4Packing; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t, Scalar> RhsPacket; + typedef std::conditional_t ResPacket; + typedef std::conditional_t AccPacket; + + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { + p.first = pset1(RealScalar(0)); + p.second = pset1(RealScalar(0)); + } + + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1(*b); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacket& dest) const { + loadQuadToDoublePacket(b, dest); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { + dest = pload((const typename unpacket_traits::type*)(a)); + } + + template + 
EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v4 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v4); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v4 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v4); + } + + template + EIGEN_STRONG_INLINE std::enable_if_t::value> madd(const LhsPacketType& a, + const RhsPacketType& b, + DoublePacket& c, + TmpType& /*tmp*/, + const LaneIdType&) const { + c.first = pmadd_scalar(a, b.first, c.first); + c.second = pmadd_scalar(a, b.second, c.second); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, + const LaneIdType&) const { + c = cj.pmadd(a, b, c); + } + +protected: + conj_helper cj; +}; + +#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type ScalarPacket##postfix + + template +class gebp_traits, false, ConjRhs_, Architecture::RVV10, PacketSize_> + : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> + { +public: + typedef std::complex Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_); + PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_); +#undef PACKET_DECL_COND_SCALAR_POSTFIX + + enum { + ConjLhs = false, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? 
unpacket_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXf& a, std::complex b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a, b.real(), c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXd& a, std::complex b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a, b.real(), c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c += a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(alpha, c, r); + } +}; + +template +class gebp_traits, RealScalar, ConjLhs_, false, Architecture::RVV10, PacketSize_> +: public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { + public: + typedef std::complex LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); +#undef PACKET_DECL_COND_POSTFIX + + enum { + ConjLhs = ConjLhs_, + 
ConjRhs = false, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + + nr = 4, + mr = 3 * LhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b, dest, std::conditional_t()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { + // FIXME we can do better! + // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { + eigen_internal_assert(RhsPacketSize <= 8); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu(a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c += a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(c, alpha, r); + } + +}; + +} // namespace 
internal +} // namespace Eigen + +#endif // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h index 797fca3e2..a77496540 100644 --- a/Eigen/src/Core/arch/RVV10/MathFunctions.h +++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h @@ -16,25 +16,13 @@ namespace Eigen { namespace internal { -template <> -EIGEN_STRONG_INLINE PacketXf pexp(const PacketXf& x) { - return pexp_float(x); -} +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul2Xf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul4Xf) -template <> -EIGEN_STRONG_INLINE PacketXf plog(const PacketXf& x) { - return plog_float(x); -} - -template <> -EIGEN_STRONG_INLINE PacketXf psin(const PacketXf& x) { - return psin_float(x); -} - -template <> -EIGEN_STRONG_INLINE PacketXf pcos(const PacketXf& x) { - return pcos_float(x); -} +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul2Xd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul4Xd) } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index 973633df6..662ed9908 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -25,24 +25,34 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 -template +template struct rvv_packet_size_selector { - enum { size = VectorLength / (sizeof(Scalar) * CHAR_BIT) }; + enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) }; }; -template +template struct rvv_packet_alignment_selector { enum { - alignment = VectorLength >= 512 ? Aligned64 : (VectorLength >= 256 ? Aligned32 : Aligned16) + alignment = (VectorLength*VectorLMul) >= 1024 ? Aligned128 : ((VectorLength*VectorLMul) >= 512 ? Aligned64 : ((VectorLength*VectorLMul) >= 256 ? 
Aligned32 : Aligned16)) }; }; -typedef vbool32_t PacketMask; +typedef vbool64_t PacketMask64; +typedef vbool32_t PacketMask32; +typedef vbool16_t PacketMask16; +typedef vbool8_t PacketMask8; +typedef vbool4_t PacketMask4; /********************************* int32 **************************************/ typedef vint32m1_t PacketXi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vuint32m1_t PacketXu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vint32m2_t PacketMul2Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vuint32m2_t PacketMul2Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); + +typedef vint32m4_t PacketMul4Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vuint32m4_t PacketMul4Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); + template <> struct packet_traits : default_packet_traits { typedef PacketXi type; @@ -50,7 +60,59 @@ struct packet_traits : default_packet_traits { enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xi type; + typedef PacketXi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xi type; + typedef PacketMul2Xi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -73,9 +135,38 @@ template <> struct unpacket_traits { typedef numext::int32_t type; typedef PacketXi half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketMul2Xi half; + typedef numext::uint8_t mask_t; enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false @@ -89,35 +180,37 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) #endif } +/********************************* PacketXi ************************************/ + template <> EIGEN_STRONG_INLINE PacketXi pset1(const numext::int32_t& from) { - return __riscv_vmv_v_x_i32m1(from, packet_traits::size); + return __riscv_vmv_v_x_i32m1(from, 
unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) { - PacketXi idx = __riscv_vid_v_i32m1(packet_traits::size); - return __riscv_vadd_vx_i32m1(idx, a, packet_traits::size); + PacketXi idx = __riscv_vid_v_i32m1(unpacket_traits::size); + return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) { - return __riscv_vmv_v_x_i32m1(0, packet_traits::size); + return __riscv_vmv_v_x_i32m1(0, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi padd(const PacketXi& a, const PacketXi& b) { - return __riscv_vadd_vv_i32m1(a, b, packet_traits::size); + return __riscv_vadd_vv_i32m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) { - return __riscv_vsub(a, b, packet_traits::size); + return __riscv_vsub(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) { - return __riscv_vneg(a, packet_traits::size); + return __riscv_vneg(a, unpacket_traits::size); } template <> @@ -127,146 +220,146 @@ EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) { template <> EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) { - return __riscv_vmul(a, b, packet_traits::size); + return __riscv_vmul(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) { - return __riscv_vdiv(a, b, packet_traits::size); + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { - return __riscv_vmadd(a, b, c, packet_traits::size); + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { - return __riscv_vmadd(a, b, pnegate(c), packet_traits::size); + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { - return __riscv_vnmsub_vv_i32m1(a, b, c, packet_traits::size); + return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { - return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), packet_traits::size); + return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) { - return __riscv_vmin(a, b, packet_traits::size); + return __riscv_vmin(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) { - return __riscv_vmax(a, b, packet_traits::size); + return __riscv_vmax(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) { - PacketMask mask = __riscv_vmsle_vv_i32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, packet_traits::size); + PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) { - PacketMask mask = __riscv_vmslt_vv_i32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, 
packet_traits::size); + PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) { - PacketMask mask = __riscv_vmseq_vv_i32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, packet_traits::size); + PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) { - return __riscv_vmv_v_x_i32m1(0xffffffffu, packet_traits::size); + return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) { - return __riscv_vand_vv_i32m1(a, b, packet_traits::size); + return __riscv_vand_vv_i32m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) { - return __riscv_vor_vv_i32m1(a, b, packet_traits::size); + return __riscv_vor_vv_i32m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b) { - return __riscv_vxor_vv_i32m1(a, b, packet_traits::size); + return __riscv_vxor_vv_i32m1(a, b, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) { - return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, packet_traits::size), packet_traits::size); + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) { - return __riscv_vsra_vx_i32m1(a, N, packet_traits::size); + return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { - return __riscv_vreinterpret_i32m1(__riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, packet_traits::size)); + return __riscv_vreinterpret_i32m1(__riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); } template EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) { - return __riscv_vsll_vx_i32m1(a, N, packet_traits::size); + return __riscv_vsll_vx_i32m1(a, N, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, packet_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, packet_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) { - PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, packet_traits::size), 1, packet_traits::size); + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
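// Worked example, assuming a 128-bit VLEN (4 int32 lanes): vid yields
// 0 1 2 3, clearing the low bit gives 0 0 2 2, and the shift left by one
// scales to byte offsets 0 0 4 4. The indexed load below therefore reads
// from[0] from[0] from[1] from[1], duplicating each element as ploaddup
// requires.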
- return __riscv_vloxei32_v_i32m1(from, idx, packet_traits::size); + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) { - PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); - idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, packet_traits::size); - return __riscv_vloxei32_v_i32m1(from, idx, packet_traits::size); + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, packet_traits::size); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, packet_traits::size); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) { - return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), packet_traits::size); + return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, Index stride) { - __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, packet_traits::size); + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } template <> @@ -276,43 +369,44 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) { template <> EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { - PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(packet_traits::size), packet_traits::size-1, packet_traits::size); - return __riscv_vrgather_vv_i32m1(a, idx, packet_traits::size); + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) { - PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, packet_traits::size); - return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, packet_traits::size), mask, packet_traits::size); + PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) { - PacketXi vzero = __riscv_vmv_v_x_i32m1(0, packet_traits::size); - return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, vzero, packet_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, + __riscv_vmv_v_x_i32m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { // Multiply the vector by its reverse - PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, packet_traits::size); + PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits::size); PacketXi half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, packet_traits::size); - prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + 
half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, packet_traits::size); - prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, packet_traits::size); - prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } // Last reduction - half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, packet_traits::size); - prod = __riscv_vmul_vv_i32m1(prod, half_prod, packet_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); // The reduction is done to the first element. return pfirst(prod); @@ -320,404 +414,4551 @@ EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { template <> EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) { - PacketXi vmax = __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), packet_traits::size); - return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(a, vmax, packet_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), + unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) { - PacketXi vmin = __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), packet_traits::size); - return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(a, vmin, packet_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int32_t buffer[packet_traits::size * N] = {0}; + numext::int32_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], packet_traits::size); + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_i32m1(&buffer[i * packet_traits::size], packet_traits::size); + kernel.packet[i] = __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } -/********************************* float32 ************************************/ +/********************************* PacketMul4Xi ************************************/ -typedef vfloat32m1_t PacketXf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); +} template <> -struct packet_traits : default_packet_traits { - typedef PacketXf type; - typedef PacketXf half; +EIGEN_STRONG_INLINE PacketMul4Xi plset(const numext::int32_t& a) { + PacketMul4Xi idx = __riscv_vid_v_i32m4(unpacket_traits::size); + return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); +} - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 
rvv_packet_size_selector::size, +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); +} - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0, +template <> +EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); +} - HasCmp = 1, - HasDiv = 1, - HasFloor = 1, - HasRint = 1, +template <> +EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH - }; -}; +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} template <> -struct unpacket_traits { - typedef float type; - typedef PacketXf half; // Half not yet implemented - typedef PacketXi integer_packet; +EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) { + return a; +} - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} template <> -EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) { - return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) { - return __riscv_vfmv_v_f_f32m1(0.0f, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) { - return __riscv_vfabs_v_f32m1(a, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pset1(const float& from) { - return __riscv_vfmv_v_f_f32m1(from, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { - return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf plset(const float& a) { - PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(packet_traits::size), packet_traits::size); - return __riscv_vfadd_vf_f32m1(idx, a, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, const PacketMul4Xi& b) { + 
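// Same elementwise semantics as the m1 (single-register) variant above; with
// LMUL=4 the intrinsic operates on a group of four vector registers, so the
// requested vl is four times the m1 packet size.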
return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) { - return __riscv_vfadd_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) { - return __riscv_vfsub_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) { - return __riscv_vfneg_v_f32m1(a, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) { - return a; +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) { - return __riscv_vfmul_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) { - return __riscv_vfdiv_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { - return __riscv_vfmadd_vv_f32m1(a, b, c, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { - return __riscv_vfmsub_vv_f32m1(a, b, c, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { - return __riscv_vfnmsub_vv_f32m1(a, b, c, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { + return __riscv_vreinterpret_i32m4(__riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) { + return __riscv_vsll_vx_i32m4(a, N, 
unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { - return __riscv_vfnmadd_vv_f32m1(a, b, c, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { - PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), packet_traits::size); - PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, a, packet_traits::size); - PacketMask mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, packet_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} - return __riscv_vfmin_vv_f32m1_tum(mask, nans, a, b, packet_traits::size); +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { - return pmin(a, b); +EIGEN_STRONG_INLINE PacketMul4Xi ploadquad(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { - return __riscv_vfmin_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { - PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), packet_traits::size); - PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, a, packet_traits::size); - PacketMask mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, packet_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, packet_traits::size); +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} - return __riscv_vfmax_vv_f32m1_tum(mask, nans, a, b, packet_traits::size); +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, Index stride) { + return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { - return pmax(a, b); +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { - return __riscv_vfmax_vv_f32m1(a, b, packet_traits::size); +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) { + return __riscv_vmv_x_s_i32m4_i32(a); } template 
<> -EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) { - PacketMask mask = __riscv_vmfle_vv_f32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { + PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) { - PacketMask mask = __riscv_vmflt_vv_f32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { + PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) { - PacketMask mask = __riscv_vmfeq_vv_f32m1_b32(a, b, packet_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, packet_traits::size); +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(a, + __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) { - PacketMask mask = __riscv_vmfge_vv_f32m1_b32(a, b, packet_traits::size); - return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, packet_traits::size); +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { + PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), + __riscv_vget_v_i32m4_i32m1(a, 1), unpacket_traits::size); + PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), + __riscv_vget_v_i32m4_i32m1(a, 3), unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); } -// Logical Operations are not supported for float, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = 
__riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } } +/********************************* PacketMul2Xi ************************************/ + template <> -EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), packet_traits::size), packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi plset(const numext::int32_t& a) { + PacketMul2Xi idx = __riscv_vid_v_i32m2(unpacket_traits::size); + return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pload(const float* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pzero(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi padd(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { - PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, packet_traits::size), 1, packet_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi psub(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { - PacketXu idx = __riscv_vid_v_u32m1(packet_traits::size); - idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, packet_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) { + return a; } template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pmul(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) { - return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pdiv(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) { - __riscv_vsse32(to, stride * sizeof(float), from, packet_traits::size); +EIGEN_STRONG_INLINE 
PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) { - return __riscv_vfmv_f_s_f32m1_f32(a); +EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) { - return __riscv_vfsqrt_v_f32m1(a, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) { - // Adds and subtracts signum(a) * 2^23 to force rounding. - const PacketXf limit = pset1(static_cast(1 << 23)); - const PacketXf abs_a = pabs(a); - PacketXf r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. - r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; +EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { - const PacketXf cst_1 = pset1(1.0f); - PacketXf tmp = print(a); - // If greater, subtract one. - PacketXf mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); +EIGEN_STRONG_INLINE PacketMul2Xi pmin(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { - PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(packet_traits::size), packet_traits::size-1, packet_traits::size); - return __riscv_vrgather_vv_f32m1(a, idx, packet_traits::size); +EIGEN_STRONG_INLINE PacketMul2Xi pmax(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { - return pfrexp_generic(a, exponent); +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float predux(const PacketXf& a) { - PacketXf vzero = __riscv_vfmv_v_f_f32m1(0.0, packet_traits::size); - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(a, vzero, packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { - // Multiply the vector by its reverse - PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, packet_traits::size); - PacketXf half_prod; +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 
0xffffffff, mask, unpacket_traits::size); +} - if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, packet_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, packet_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, packet_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); - } - // Last reduction - half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, packet_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, packet_traits::size); +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ptrue(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); +} - // The reduction is done to the first element. - return pfirst(prod); +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pand(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { - PacketXf vmax = __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), packet_traits::size); - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(a, vmax, packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi por(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { - PacketXf vmin = __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), packet_traits::size); - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(a, vmin, packet_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), unpacket_traits::size); } template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer[packet_traits::size * N]; - int i = 0; +EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); +} - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], packet_traits::size); - } +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { + return __riscv_vreinterpret_i32m2(__riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); +} - for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_f32m1(&buffer[i * packet_traits::size], packet_traits::size); - } +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) { + return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) { - return pldexp_generic(a, exponent); +EIGEN_STRONG_INLINE PacketMul2Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketMul2Xi ploaddup(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadquad(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, Index stride) { + return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul2Xi& a) { + return __riscv_vmv_x_s_i32m2_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) { + PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { + PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(a, + __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& a) { + return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), + __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(a, + __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); 
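+    // The strided stores above interleave the packets in the scratch buffer
+    // (packet i starts at buffer[i] with byte stride N * sizeof(int32_t)), so
+    // the contiguous reloads here return the transposed packets. E.g. for
+    // N = 2 with 4-element packets {a0 a1 a2 a3} and {b0 b1 b2 b3}, the buffer
+    // holds a0 b0 a1 b1 a2 b2 a3 b3 and the reloads yield {a0 b0 a1 b1} and
+    // {a2 b2 a3 b3}.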
+ } +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketMul2Xi>::type predux_half_dowto4(const PacketMul4Xi& a) { + return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), + __riscv_vget_v_i32m4_i32m2(a, 1), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketXi>::type predux_half_dowto4(const PacketMul2Xi& a) { + return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), + __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size); +} + +/********************************* float32 ************************************/ + +typedef vfloat32m1_t PacketXf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat32m2_t PacketMul2Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vfloat32m4_t PacketMul4Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xf type; + typedef PacketMul2Xf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; + typedef PacketMul2Xi integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = 
rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketMul2Xf half; + typedef PacketMul4Xi integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) { + return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) { + return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) { + return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) { + return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) { + return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) { + return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, 
mask2, unpacket_traits<PacketXf>::size);
+
+  return __riscv_vfmin_vv_f32m1_tum(mask, nans, a, b, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return pmin(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) {
+  PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketXf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketXf>::size);
+
+  return __riscv_vfmax_vv_f32m1_tum(mask, nans, a, b, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return pmax(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<PacketXf>::size);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a),
+      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size), unpacket_traits<PacketXf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
+  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketXf>::size), 1, unpacket_traits<PacketXf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
+  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketXf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
+  return __riscv_vfmv_f_s_f32m1_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) {
+  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<PacketXf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) {
+  // Adds and subtracts signum(a) * 2^23 to force rounding.
+  const PacketXf limit = pset1<PacketXf>(static_cast<float>(1 << 23));
+  const PacketXf abs_a = pabs(a);
+  PacketXf r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a. Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) {
+  const PacketXf cst_1 = pset1<PacketXf>(1.0f);
+  PacketXf tmp = print(a);
+  // If greater, subtract one.
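+  // Worked example: print() rounds to nearest-even, so a = 1.5f yields
+  // tmp = 2.0f; the all-ones pcmp_lt mask is narrowed to 1.0f by pand with
+  // cst_1 and subtracted, giving floor(1.5f) = 1.0f. For a = 2.5f, tmp is
+  // already 2.0f, the mask is zero, and nothing is subtracted.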
+ PacketXf mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(a, + __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + // Multiply the vector by its reverse + PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits::size); + PacketXf half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(a, + __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul4Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ptrue(const PacketMul4Xf& /*a*/) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pzero(const PacketMul4Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) { + return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t 
from) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf plset<PacketMul4Xf>(const float& a) {
+  PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vid_v_i32m4(unpacket_traits<PacketMul4Xf>::size), unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf padd(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf psub(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) {
+  return __riscv_vfneg_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmul(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pdiv(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMul4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+
+  return __riscv_vfmin_vv_f32m4_tum(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return pmin(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMul4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+
+  return __riscv_vfmax_vv_f32m4_tum(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return pmax(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul4Xf>::size);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
+      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size), unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pload<PacketMul4Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ploadu<PacketMul4Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ploaddup<PacketMul4Xf>(const float* from) {
+  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
+  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<PacketMul4Xf>::size), 1, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ploadquad<PacketMul4Xf>(const float* from) {
+  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
+  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketMul4Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) { + return __riscv_vfmv_f_s_f32m4_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { + return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const PacketMul4Xf limit = pset1(static_cast(1 << 23)); + const PacketMul4Xf abs_a = pabs(a); + PacketMul4Xf r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { + const PacketMul4Xf cst_1 = pset1(1.0f); + PacketMul4Xf tmp = print(a); + // If greater, subtract one. + PacketMul4Xf mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { + PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, PacketMul4Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1(a, + __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { + PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), + __riscv_vget_v_f32m4_f32m1(a, 1), unpacket_traits::size); + PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), + __riscv_vget_v_f32m4_f32m1(a, 3), unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1(a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1(a, + __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = 
__riscv_vle32_v_f32m4(&buffer[i * unpacket_traits<PacketMul4Xf>::size], unpacket_traits<PacketMul4Xf>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+/********************************* PacketMul2Xf ************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf ptrue(const PacketMul2Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pzero(const PacketMul2Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) {
+  return __riscv_vfabs_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pset1<PacketMul2Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits<PacketMul2Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf plset<PacketMul2Xf>(const float& a) {
+  PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vid_v_i32m2(unpacket_traits<PacketMul2Xf>::size), unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf padd(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf psub(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) {
+  return __riscv_vfneg_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmul(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pdiv(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMul2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
+
+  return __riscv_vfmin_vv_f32m2_tum(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return pmin(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMul2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
+
+  return __riscv_vfmax_vv_f32m2_tum(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return pmax(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
+      __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size), unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pload<PacketMul2Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return
__riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul2Xf& a) { + return __riscv_vfmv_f_s_f32m2_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { + // Adds and subtracts signum(a) * 2^23 to force rounding. + const PacketMul2Xf limit = pset1(static_cast(1 << 23)); + const PacketMul2Xf abs_a = pabs(a); + PacketMul2Xf r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { + const PacketMul2Xf cst_1 = pset1(1.0f); + PacketMul2Xf tmp = print(a); + // If greater, subtract one. 
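+  // Same construction as the m1 pfloor above: RVV 1.0 offers no single
+  // float-to-float round-toward-negative-infinity instruction, so floor is
+  // synthesized from round-to-nearest (print) plus a masked -1.0f correction
+  // where the rounded value overshoots a.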
+ PacketMul2Xf mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { + PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1(a, + __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), + __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1(a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1(a, + __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketMul2Xf>::type predux_half_dowto4(const PacketMul4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), + __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketXf>::type predux_half_dowto4(const PacketMul2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), + __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); +} + +/********************************* int64 **************************************/ + +typedef vint64m1_t PacketXl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint64m1_t PacketXul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +typedef vint64m2_t PacketMul2Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vuint64m2_t PacketMul2Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); + +typedef vint64m4_t PacketMul4Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vuint64m4_t PacketMul4Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXl type; + typedef PacketXl half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, 
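+    // These capability flags tell Eigen's vectorized evaluators which packet
+    // primitives exist for this type; ops advertised as 0 (e.g. HasArg,
+    // HasBlend) fall back to scalar code paths.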
+ HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xl type; + typedef PacketXl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xl type; + typedef PacketMul2Xl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul2Xl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl plset(const numext::int64_t& a) { + PacketXl idx = __riscv_vid_v_i64m1(unpacket_traits::size); + return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pzero(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl padd(const PacketXl& a, const PacketXl& b) { + return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl psub(const PacketXl& a, const PacketXl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnegate(const PacketXl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pconj(const PacketXl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmul(const PacketXl& a, const PacketXl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl 
pdiv(const PacketXl& a, const PacketXl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmin(const PacketXl& a, const PacketXl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmax(const PacketXl& a, const PacketXl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_le(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_lt(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_eq(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ptrue(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pand(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl por(const PacketXl& a, const PacketXl& b) { + return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pxor(const PacketXl& a, const PacketXl& b) { + return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pandnot(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) { + return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) { + return __riscv_vreinterpret_i64m1(__riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_left(PacketXl a) { + return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); 
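+  // RVV unit-stride loads require only element alignment, so the aligned and
+  // unaligned paths both lower to vle64.v; the EIGEN_DEBUG_*_LOAD macros only
+  // feed Eigen's load counters in instrumented builds.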
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl ploaddup(const numext::int64_t* from) {
+  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size);
+  // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl ploadquad(const numext::int64_t* from) {
+  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketXl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketXl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXl pgather(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketXl& from,
+                                       Index stride) {
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketXl& a) {
+  return __riscv_vmv_x_s_i64m1_i64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) {
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size);
+  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) {
+  PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size);
+  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux(const PacketXl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a,
+      __riscv_vmv_v_x_i64m1(0, unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) {
+  // Multiply the vector by its reverse
+  PacketXl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size);
+  PacketXl half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+
+  // The reduction is done to the first element.
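+  // Worked trace (assuming EIGEN_RISCV64_RVV_VL == 512, i.e. 8 lanes) for an
+  // illustrative input a = [a0 .. a7]:
+  //   reverse multiply: prod = [a0*a7, a1*a6, a2*a5, a3*a4, ...]
+  //   slide down 2:     prod[0] = (a0*a7)*(a2*a5), prod[1] = (a1*a6)*(a3*a4)
+  //   slide down 1:     prod[0] = a0*a1*a2*a3*a4*a5*a6*a7
+  // so the full product reaches lane 0 in log2(lanes/2) slide/multiply steps.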
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), + unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { + PacketMul4Xl idx = __riscv_vid_v_i64m4(unpacket_traits::size); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl 
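+// The PacketMask16/32/64 types used below are vbool16_t/vbool32_t/vbool64_t;
+// the number is the SEW/LMUL ratio (one mask bit per element), so 64-bit
+// lanes use b64 at m1, b32 at m2 and b16 at m4. Comparisons produce such
+// masks, and vmerge then materializes them as 0/all-ones lanes so the result
+// can feed Eigen's bitwise select machinery, roughly (a sketch, not part of
+// the patch):
+//   result[i] = mask[i] ? 0xffffffffffffffff : 0;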
pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { + return __riscv_vreinterpret_i64m4(__riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { + return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
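+  // Concretely (illustrative lane values): vid yields i = 0 1 2 3 4 5 ...;
+  // (i & ~1) << 2 turns this into byte offsets 0 0 8 8 16 16 ..., and the
+  // indexed load vloxei64 then fetches from[0] from[0] from[1] from[1] ...,
+  // duplicating each source element into two adjacent lanes.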
+ return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, Index stride) { + return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { + return __riscv_vmv_x_s_i64m4_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { + PacketMul4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { + PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1(a, + __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { + PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), + __riscv_vget_v_i64m4_i64m1(a, 1), unpacket_traits::size); + PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), + __riscv_vget_v_i64m4_i64m1(a, 3), unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* 
PacketMul2Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { + PacketMul2Xl idx = __riscv_vid_v_i64m2(unpacket_traits::size); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { + return __riscv_vreinterpret_i64m2(__riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
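+  // ploadquad below plays the same trick with (i & ~3) << 1, i.e. byte
+  // offsets 0 0 0 0 8 8 8 8 ..., replicating each element into four lanes.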
+ return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, Index stride) { + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { + return __riscv_vmv_x_s_i64m2_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { + PacketMul2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { + PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), + mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1(a, + __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), + __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1(a, + __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketMul2Xl>::type predux_half_dowto4(const PacketMul4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), + 
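+      // vget extracts the two m2 halves of the m4 register group (index 0 =
+      // low half, index 1 = high half); adding them pairwise is the
+      // half-width fold that predux_half_dowto4 advertises.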
__riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketXl>::type predux_half_dowto4(const PacketMul2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), + __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); +} + +/********************************* double ************************************/ + +typedef vfloat64m1_t PacketXd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat64m2_t PacketMul2Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vfloat64m4_t PacketMul4Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xd type; + typedef PacketMul2Xd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; // Half not yet implemented + typedef PacketXl integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; + typedef PacketMul2Xl integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketMul2Xd half; + typedef PacketMul4Xl integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXd 
************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXd ptrue(const PacketXd& /*a*/) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pzero(const PacketXd& /*a*/) { + return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pabs(const PacketXd& a) { + return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + PacketXd idx = __riscv_vfcvt_f_x_v_f64m1(__riscv_vid_v_i64m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd padd(const PacketXd& a, const PacketXd& b) { + return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd psub(const PacketXd& a, const PacketXd& b) { + return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnegate(const PacketXd& a) { + return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pconj(const PacketXd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmul(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pdiv(const PacketXd& a, const PacketXd& b) { + return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), 
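+      // NaN-propagating variant: the mask below is set only where both a and
+      // b are non-NaN; the _tum (tail-undisturbed, masked) vfmax writes the
+      // max into those lanes and keeps the quiet-NaN destination everywhere
+      // else, so any lane with a NaN input yields NaN.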
unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_le(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_eq(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXd pand(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd por(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pxor(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pandnot(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadquad(const double* from) { 
+  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(double* to, const PacketXd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketXd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXd pgather(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketXd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst(const PacketXd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) {
+  // Adds and subtracts signum(a) * 2^52 to force rounding.
+  const PacketXd limit = pset1(static_cast(1ull << 52));
+  const PacketXd abs_a = pabs(a);
+  PacketXd r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a. Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) {
+  const PacketXd cst_1 = pset1(1.0);
+  PacketXd tmp = print(a);
+  // If greater, subtract one.
+  PacketXd mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(a,
+      __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) {
+  // Multiply the vector by its reverse
+  PacketXd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size);
+  PacketXd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+
+  // The reduction is done to the first element.
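+  // Like predux above (which uses the unordered vfredusum reduction), this
+  // tree-shaped product reassociates floating-point operations, so results
+  // may differ from a sequential scalar reduction in the last ULPs; packet
+  // reductions do not promise a particular evaluation order.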
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(a, + __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul4Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) { + return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { + PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vid_v_i64m4(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) { + return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd 
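+// Mapping of Eigen's FMA ops onto RVV (vd is the accumulator operand):
+//   pmadd(a,b,c)  =  a*b + c   -> vfmadd.vv   vd =  (vd * vs1) + vs2
+//   pmsub(a,b,c)  =  a*b - c   -> vfmsub.vv   vd =  (vd * vs1) - vs2
+//   pnmadd(a,b,c) = -a*b + c   -> vfnmsub.vv  vd = -(vd * vs1) + vs2
+//   pnmsub(a,b,c) = -a*b - c   -> vfnmadd.vv  vd = -(vd * vs1) - vs2
+// Note the RVV mnemonics are "offset" from Eigen's names: vfnmsub negates
+// only the product, which is exactly Eigen's pnmadd.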
pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMul4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMul4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pand(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd por(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return 
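+      // RVV defines no bitwise ops on float vectors, so these helpers
+      // round-trip through u64: reinterpret, operate, reinterpret back. The
+      // resulting bit patterns are what Eigen's selects consume; for example,
+      // pfloor below relies on pand(mask, 1.0) yielding exactly 1.0 in
+      // all-ones lanes.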
__riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul4Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul4Xd& a) { + return __riscv_vfmv_f_s_f64m4_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { + return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. + const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul4Xd abs_a = pabs(a); + PacketMul4Xd r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { + const PacketMul4Xd cst_1 = pset1(1.0); + PacketMul4Xd tmp = print(a); + // If greater, subtract one. 
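+  // Worked example (illustrative values): for a = -1.3, print gives -1.0 and
+  // a < -1.0, so the mask is all-ones, pand with cst_1 yields 1.0, and the
+  // result is -1.0 - 1.0 = -2.0 = floor(-1.3). For a = 1.7, print gives 2.0,
+  // the compare is again true (1.7 < 2.0), giving 2.0 - 1.0 = 1.0.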
+ PacketMul4Xd mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { + PacketMul4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, PacketMul4Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1(a, + __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { + PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), + __riscv_vget_v_f64m4_f64m1(a, 1), unpacket_traits::size); + PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), + __riscv_vget_v_f64m4_f64m1(a, 3), unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1(a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1(a, + __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pldexp(const PacketMul4Xd& a, const PacketMul4Xd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul2Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vid_v_i64m2(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd padd(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pdiv(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { + 
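+  // Note on the pmin/pmax variants above: the PropagateNaN forms return NaN
+  // whenever either input is NaN, whereas the PropagateNumbers forms map to
+  // plain vfmin/vfmax, which follow IEEE 754-2019 minimumNumber/maximumNumber
+  // and return the non-NaN operand when only one input is NaN.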
PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void 
pscatter(double* to, const PacketMul2Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) { + return __riscv_vfmv_f_s_f64m2_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) { + return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) { + // Adds and subtracts signum(a) * 2^52 to force rounding. + const PacketMul2Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul2Xd abs_a = pabs(a); + PacketMul2Xd r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { + const PacketMul2Xd cst_1 = pset1(1.0); + PacketMul2Xd tmp = print(a); + // If greater, subtract one. + PacketMul2Xd mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { + PacketMul2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1(a, + __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), + __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1(a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1(a, + __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketMul2Xd>::type predux_half_dowto4(const PacketMul4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), + __riscv_vget_v_f64m4_f64m2(a, 1), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && 
(unpacket_traits::size % 8) == 0, +PacketXd>::type predux_half_dowto4(const PacketMul2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), + __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size); +} + +/********************************* short **************************************/ + +typedef vint16m1_t PacketXs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint16m1_t PacketXsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +typedef vint16m2_t PacketMul2Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vuint16m2_t PacketMul2Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); + +typedef vint16m4_t PacketMul4Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vuint16m4_t PacketMul4Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXs type; + typedef PacketXs half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xs type; + typedef PacketXs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xs type; + typedef PacketMul2Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul2Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXs ************************************/ + +template <> 
+EIGEN_STRONG_INLINE PacketXs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs plset(const numext::int16_t& a) { + PacketXs idx = __riscv_vid_v_i16m1(unpacket_traits::size); + return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pzero(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs padd(const PacketXs& a, const PacketXs& b) { + return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs psub(const PacketXs& a, const PacketXs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnegate(const PacketXs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pconj(const PacketXs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmul(const PacketXs& a, const PacketXs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pdiv(const PacketXs& a, const PacketXs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmin(const PacketXs& a, const PacketXs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmax(const PacketXs& a, const PacketXs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_le(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_lt(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_eq(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ptrue(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pand(const PacketXs& a, const PacketXs& b) { + return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs por(const PacketXs& a, const PacketXs& b) { + return __riscv_vor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs 
pxor(const PacketXs& a, const PacketXs& b) { + return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pandnot(const PacketXs& a, const PacketXs& b) { + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) { + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) { + return __riscv_vreinterpret_i16m1(__riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_left(PacketXs a) { + return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploaddup(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadquad(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketXs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketXs& a) { + return __riscv_vmv_x_s_i16m1_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) { + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) { + PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, + __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t 
predux_mul(const PacketXs& a) { + // Multiply the vector by its reverse + PacketXs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); + PacketXs half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), + unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs plset(const numext::int16_t& a) { + PacketMul4Xs idx = __riscv_vid_v_i16m4(unpacket_traits::size); + return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs padd(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs psub(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmul(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pdiv(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmin(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmax(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ptrue(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pand(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs por(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vxor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { + return __riscv_vreinterpret_i16m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) { + return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return 
__riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) { + return __riscv_vmv_x_s_i16m4_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) { + PacketMul4Xsu idx = __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { + PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(a, + __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { + PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), + __riscv_vget_v_i16m4_i16m1(a, 1), unpacket_traits::size); + PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), + __riscv_vget_v_i16m4_i16m1(a, 3), unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t 
buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + + +/********************************* PacketMul2Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs plset(const numext::int16_t& a) { + PacketMul2Xs idx = __riscv_vid_v_i16m2(unpacket_traits::size); + return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pzero(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs padd(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vadd_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs psub(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmul(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pdiv(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmin(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmax(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = 
__riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ptrue(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pand(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs por(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vxor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) { + return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) { + return __riscv_vreinterpret_i16m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) { + return __riscv_vsll_vx_i16m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploaddup(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... 
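+  // Editor's note: vloxei16 indices are *byte* offsets, so clearing the low
+  // bit (with no further scaling, since sizeof(int16_t) == 2) yields offsets
+  // 0, 0, 2, 2, 4, 4, ... bytes, i.e. from[0], from[0], from[1], from[1], ...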
+ return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) { + return __riscv_vmv_x_s_i16m2_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { + PacketMul2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { + PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(a, + __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), + __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), + unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(a, + __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketMul2Xs>::type predux_half_dowto4(const PacketMul4Xs& a) { + return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), + __riscv_vget_v_i16m4_i16m2(a, 1), 
unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, +PacketXs>::type predux_half_dowto4(const PacketMul2Xs& a) { + return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), + __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size); } } // namespace internal diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h new file mode 100644 index 000000000..c1401bdaa --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -0,0 +1,879 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_RVV10_H +#define EIGEN_PACKET_MATH_FP16_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; // Half not yet implemented + typedef PacketXs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; + typedef PacketMul2Xs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXh ptrue(const PacketXh& /*a*/) { + return 
__riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pzero(const PacketXh& /*a*/) { + return __riscv_vfmv_v_f_f16m1(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pabs(const PacketXh& a) { + return __riscv_vfabs_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh plset(const Eigen::half& a) { + PacketXh idx = __riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh padd(const PacketXh& a, const PacketXh& b) { + return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psub(const PacketXh& a, const PacketXh& b) { + return __riscv_vfsub_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnegate(const PacketXh& a) { + return __riscv_vfneg_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pconj(const PacketXh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmul(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmul_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pdiv(const PacketXh& a, const PacketXh& b) { + return __riscv_vfdiv_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + PacketXh nans = __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmin_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + PacketXh nans = __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); 
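+  // Editor's note (sketch of the NaN handling): vmfeq(x, x) is false exactly
+  // on NaN lanes, so after ANDing with mask2 below the mask selects lanes
+  // where both inputs are numbers; the masked, undisturbed vfmax then leaves
+  // the prefilled quiet NaN in every other lane.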
+ PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmax_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_le(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfle_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_eq(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m1(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXh pand(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh por(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pxor(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pandnot(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), + __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploaddup(const Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadquad(const Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = 
__riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketXh& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketXh& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXh pgather(const Eigen::half* from, Index stride) { + return __riscv_vlse16_v_f16m1(reinterpret_cast(from), stride * sizeof(Eigen::half), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketXh& from, Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const PacketXh& a) { + return static_cast(__riscv_vfmv_f_s_f16m1_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psqrt(const PacketXh& a) { + return __riscv_vfsqrt_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh print(const PacketXh& a) { + // Adds and subtracts signum(a) * 2^10 to force rounding. + const PacketXh limit = pset1(static_cast(1 << 10)); + const PacketXh abs_a = pabs(a); + PacketXh r = padd(abs_a, limit); + // Don't compile-away addition and subtraction. + EIGEN_OPTIMIZATION_BARRIER(r); + r = psub(r, limit); + // If greater than limit, simply return a. Otherwise, account for sign. + r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); + return r; +} + +template <> +EIGEN_STRONG_INLINE PacketXh pfloor(const PacketXh& a) { + const PacketXh cst_1 = pset1(static_cast(1.0)); + PacketXh tmp = print(a); + // If greater, subtract one. 
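+  // Editor's note: pcmp_lt returns an all-ones bit pattern (not 1.0) on true
+  // lanes, so pand with cst_1 turns it into exactly 1.0 where print() rounded
+  // up (tmp > a) and +0.0 elsewhere; the psub below then lands on floor(a).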
+ PacketXh mask = pcmp_lt(a, tmp); + mask = pand(mask, cst_1); + return psub(tmp, mask); +} + +template <> +EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) { + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1(a, + __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketXh& a) { + // Multiply the vector by its reverse + PacketXh prod = __riscv_vfmul_vv_f16m1(preverse(a), a, unpacket_traits::size); + PacketXh half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1(a, + __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1(a, + __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE PacketMul2Xf half2float(const PacketXh& a) { + return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXh float2half(const PacketMul2Xf& a) { + return __riscv_vfncvt_f_f_w_f16m1(a, unpacket_traits::size); +} + +/********************************* PacketMul2Xh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ptrue(const PacketMul2Xh& /*a*/) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pzero(const PacketMul2Xh& /*a*/) { + return __riscv_vfmv_v_f_f16m2(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pabs(const PacketMul2Xh& a) { + 
return __riscv_vfabs_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh plset(const Eigen::half& a) { + PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh padd(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh psub(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnegate(const PacketMul2Xh& a) { + return __riscv_vfneg_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pconj(const PacketMul2Xh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmul(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pdiv(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = 
__riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_le(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m2(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pand(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh por(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pxor(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pandnot(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploaddup(const Eigen::half* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadquad(const Eigen::half* from) { + PacketMul2Xsu idx = 
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh ploadquad<PacketMul2Xh>(const Eigen::half* from) {
+  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xh>::size);
+  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<PacketMul2Xh>::size), 1,
+                              unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vloxei16_v_f16m2(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const PacketMul2Xh& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from,
+                                                  unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const PacketMul2Xh& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from,
+                                                    unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketMul2Xh pgather<Eigen::half, PacketMul2Xh>(const Eigen::half* from, Index stride) {
+  return __riscv_vlse16_v_f16m2(reinterpret_cast<const _Float16*>(from), stride * sizeof(Eigen::half),
+                                unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, PacketMul2Xh>(Eigen::half* to, const PacketMul2Xh& from,
+                                                                  Index stride) {
+  __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from,
+                 unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<PacketMul2Xh>(const PacketMul2Xh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f_s_f16m2_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh psqrt(const PacketMul2Xh& a) {
+  return __riscv_vfsqrt_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh print<PacketMul2Xh>(const PacketMul2Xh& a) {
+  // Adds and subtracts signum(a) * 2^10 to force rounding.
+  const PacketMul2Xh limit = pset1<PacketMul2Xh>(static_cast<Eigen::half>(1 << 10));
+  const PacketMul2Xh abs_a = pabs(a);
+  PacketMul2Xh r = padd(abs_a, limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(r);
+  r = psub(r, limit);
+  // If greater than limit, simply return a.  Otherwise, account for sign.
+  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
+  return r;
+}
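`print` relies on the classic add-and-subtract rounding trick: `_Float16` has 10 explicit mantissa bits, so once `2^10` is added the format can no longer represent a fractional part and the FPU's round-to-nearest-even mode performs the rounding. The same effect demonstrated with `float` and its 23-bit mantissa (a self-contained sketch; the `volatile` qualifiers stand in for `EIGEN_OPTIMIZATION_BARRIER`):

    #include <cstdio>

    int main() {
      // For float the magic constant is 1 << 23; for _Float16 it is 1 << 10.
      volatile float limit = float(1 << 23);
      volatile float x = 2.5f;
      volatile float r = x + limit;  // fraction bits fall off the mantissa here
      r = r - limit;                 // yields 2 (ties round to even), not 3
      std::printf("print(%g) ~ %g\n", double(x), double(r));
      return 0;
    }
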
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh pfloor<PacketMul2Xh>(const PacketMul2Xh& a) {
+  const PacketMul2Xh cst_1 = pset1<PacketMul2Xh>(static_cast<Eigen::half>(1.0));
+  PacketMul2Xh tmp = print<PacketMul2Xh>(a);
+  // If greater, subtract one.
+  PacketMul2Xh mask = pcmp_lt(a, tmp);
+  mask = pand(mask, cst_1);
+  return psub(tmp, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) {
+  PacketMul2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xh>::size),
+                                             unpacket_traits<PacketMul2Xh>::size - 1,
+                                             unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<PacketMul2Xh>(const PacketMul2Xh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1(a,
+      __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits<PacketMul2Xh>::size / 4),
+      unpacket_traits<PacketMul2Xh>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<PacketMul2Xh>(const PacketMul2Xh& a) {
+  return predux_mul<PacketXh>(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0),
+                                                     __riscv_vget_v_f16m2_f16m1(a, 1),
+                                                     unpacket_traits<PacketXh>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<PacketMul2Xh>(const PacketMul2Xh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1(a,
+      __riscv_vfmv_v_f_f16m1((std::numeric_limits<_Float16>::max)(), unpacket_traits<PacketMul2Xh>::size / 4),
+      unpacket_traits<PacketMul2Xh>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<PacketMul2Xh>(const PacketMul2Xh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1(a,
+      __riscv_vfmv_v_f_f16m1(-(std::numeric_limits<_Float16>::max)(), unpacket_traits<PacketMul2Xh>::size / 4),
+      unpacket_traits<PacketMul2Xh>::size)));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xh, N>& kernel) {
+  Eigen::half buffer[unpacket_traits<PacketMul2Xh>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i],
+                   unpacket_traits<PacketMul2Xh>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits<PacketMul2Xh>::size]),
+                                             unpacket_traits<PacketMul2Xh>::size);
+  }
+}
+
+EIGEN_STRONG_INLINE PacketMul4Xf half2float(const PacketMul2Xh& a) {
+  return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits<PacketMul2Xh>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) {
+  return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <typename Packet = PacketMul2Xh>
+EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xh>::value &&
+                                                (unpacket_traits<PacketMul2Xh>::size % 8) == 0,
+                                            PacketXh>::type
+predux_half_dowto4(const PacketMul2Xh& a) {
+  return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1),
+                                unpacket_traits<PacketXh>::size);
+}
+
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pcos)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexp)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexpm1)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog1p)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog2)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, preciprocal)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, prsqrt)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, psin)
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, ptanh)
+
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pcos)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexp)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexpm1)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog1p)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog2)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, preciprocal)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, prsqrt)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, psin)
+F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, ptanh)
+
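Each `F16_PACKET_FUNCTION(FLOAT_PACKET, HALF_PACKET, METHOD)` line above instantiates a half-precision math function by round-tripping through the float kernels declared just before it. The macro itself lives in Eigen's generic half-packet support; its expansion is roughly the following (a paraphrase for illustration, not the literal macro output):

    // Approximate expansion of F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, psin):
    // widen f16m2 -> f32m4, run the float kernel, narrow back to f16m2.
    template <>
    EIGEN_STRONG_INLINE PacketMul2Xh psin<PacketMul2Xh>(const PacketMul2Xh& x) {
      return float2half(psin<PacketMul4Xf>(half2float(x)));
    }
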
+/********************************* casting ************************************/
+
+template <>
+struct type_casting_traits<_Float16, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int16_t, _Float16> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXh pcast<PacketXs, PacketXh>(const PacketXs& a) {
+  return __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXs pcast<PacketXh, PacketXs>(const PacketXh& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh preinterpret<PacketXh, PacketXs>(const PacketXs& a) {
+  return __riscv_vreinterpret_v_i16m1_f16m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXs preinterpret<PacketXs, PacketXh>(const PacketXh& a) {
+  return __riscv_vreinterpret_v_f16m1_i16m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketMul2Xs, PacketMul2Xh>(const PacketMul2Xs& a) {
+  return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketMul2Xh, PacketMul2Xs>(const PacketMul2Xh& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh preinterpret<PacketMul2Xh, PacketMul2Xs>(const PacketMul2Xs& a) {
+  return __riscv_vreinterpret_v_i16m2_f16m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs preinterpret<PacketMul2Xs, PacketMul2Xh>(const PacketMul2Xh& a) {
+  return __riscv_vreinterpret_v_f16m2_i16m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketXh, PacketMul4Xs>(const PacketXh& a, const PacketXh& b,
+                                                               const PacketXh& c, const PacketXh& d) {
+  PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size));
+  res = __riscv_vset_v_i16m1_i16m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
+  res = __riscv_vset_v_i16m1_i16m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits<PacketXh>::size));
+  res = __riscv_vset_v_i16m1_i16m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits<PacketXh>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXs, PacketMul2Xh>(const PacketXs& a, const PacketXs& b) {
+  PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0,
+                                                __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXh>::size));
+  res = __riscv_vset_v_f16m1_f16m2(res, 1, __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits<PacketXh>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXh, PacketMul2Xh>(const PacketXh& a, const PacketXh& b) {
+  PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0, a);
+  res = __riscv_vset_v_f16m1_f16m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketXh, PacketMul2Xs>(const PacketXh& a, const PacketXh& b) {
+  PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size));
+  res = __riscv_vset_v_i16m1_i16m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
+  return res;
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_FP16_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h
index b26bbf4bc..3508f9617 100644
--- a/Eigen/src/Core/arch/RVV10/TypeCasting.h
+++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h
@@ -16,6 +16,8 @@
 namespace Eigen {
 namespace internal {
 
+/********************************* 32 bits ************************************/
+
 template <>
 struct type_casting_traits<numext::int32_t, float> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
@@ -28,12 +30,12 @@ struct type_casting_traits<float, numext::int32_t> {
 
 template <>
 EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
-  return __riscv_vfcvt_f_x_v_f32m1(a, packet_traits<float>::size);
+  return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketXf>::size);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, packet_traits<numext::int32_t>::size);
+  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketXi>::size);
 }
 
 template <>
@@ -46,6 +48,281 @@ EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a
   return __riscv_vreinterpret_v_f32m1_i32m1(a);
 }
 
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketMul4Xi, PacketMul4Xf>(const PacketMul4Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketMul4Xf, PacketMul4Xi>(const PacketMul4Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits<PacketMul4Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf preinterpret<PacketMul4Xf, PacketMul4Xi>(const PacketMul4Xi& a) {
+  return __riscv_vreinterpret_v_i32m4_f32m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xi preinterpret<PacketMul4Xi, PacketMul4Xf>(const PacketMul4Xf& a) {
+  return __riscv_vreinterpret_v_f32m4_i32m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketMul2Xi, PacketMul2Xf>(const PacketMul2Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketMul2Xf, PacketMul2Xi>(const PacketMul2Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf preinterpret<PacketMul2Xf, PacketMul2Xi>(const PacketMul2Xi& a) {
+  return __riscv_vreinterpret_v_i32m2_f32m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi preinterpret<PacketMul2Xi, PacketMul2Xf>(const PacketMul2Xf& a) {
+  return __riscv_vreinterpret_v_f32m2_i32m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketXi, PacketMul4Xi>(const PacketXi& a, const PacketXi& b,
+                                                               const PacketXi& c, const PacketXi& d) {
+  PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, a);
+  res = __riscv_vset_v_i32m1_i32m4(res, 1, b);
+  res = __riscv_vset_v_i32m1_i32m4(res, 2, c);
+  res = __riscv_vset_v_i32m1_i32m4(res, 3, d);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketXi, PacketMul4Xf>(const PacketXi& a, const PacketXi& b,
+                                                               const PacketXi& c, const PacketXi& d) {
+  PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0,
+                                                __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketXf>::size));
+  res = __riscv_vset_v_f32m1_f32m4(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<PacketXf>::size));
+  res = __riscv_vset_v_f32m1_f32m4(res, 2, __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits<PacketXf>::size));
+  res = __riscv_vset_v_f32m1_f32m4(res, 3, __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits<PacketXf>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketXf, PacketMul4Xf>(const PacketXf& a, const PacketXf& b,
+                                                               const PacketXf& c, const PacketXf& d) {
+  PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, a);
+  res = __riscv_vset_v_f32m1_f32m4(res, 1, b);
+  res = __riscv_vset_v_f32m1_f32m4(res, 2, c);
+  res = __riscv_vset_v_f32m1_f32m4(res, 3, d);
+  return res;
+}
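All of the N-to-one `pcast` overloads in this hunk follow one pattern: start from an uninitialized wider register group (`__riscv_vundefined_*`), then slot each LMUL=1 part into place with `__riscv_vset_v_*`. Reduced to its essentials (a standalone sketch; assumes a toolchain with the RVV 1.0 intrinsics, e.g. `clang -march=rv64gcv`):

    #include <riscv_vector.h>

    // Concatenate two m1 vectors into one m2 register group, the building block
    // behind every two-argument pcast above. vundefined is safe here because
    // both halves are overwritten before the group is read.
    vfloat32m2_t concat_f32m1(vfloat32m1_t lo, vfloat32m1_t hi) {
      vfloat32m2_t r = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, lo);
      return __riscv_vset_v_f32m1_f32m2(r, 1, hi);  // hi lands in the upper lanes
    }
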
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketXf, PacketMul4Xi>(const PacketXf& a, const PacketXf& b,
+                                                               const PacketXf& c, const PacketXf& d) {
+  PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketXi>::size));
+  res = __riscv_vset_v_i32m1_i32m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<PacketXi>::size));
+  res = __riscv_vset_v_i32m1_i32m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits<PacketXi>::size));
+  res = __riscv_vset_v_i32m1_i32m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits<PacketXi>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketXi, PacketMul2Xi>(const PacketXi& a, const PacketXi& b) {
+  PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0, a);
+  res = __riscv_vset_v_i32m1_i32m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketXi, PacketMul2Xf>(const PacketXi& a, const PacketXi& b) {
+  PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0,
+                                                __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketXf>::size));
+  res = __riscv_vset_v_f32m1_f32m2(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<PacketXf>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketXf, PacketMul2Xf>(const PacketXf& a, const PacketXf& b) {
+  PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, a);
+  res = __riscv_vset_v_f32m1_f32m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketXf, PacketMul2Xi>(const PacketXf& a, const PacketXf& b) {
+  PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketXi>::size));
+  res = __riscv_vset_v_i32m1_i32m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<PacketXi>::size));
+  return res;
+}
+
+/********************************* 64 bits ************************************/
+
+template <>
+struct type_casting_traits<numext::int64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<double, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pcast<PacketXl, PacketXd>(const PacketXl& a) {
+  return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketXd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl pcast<PacketXd, PacketXl>(const PacketXd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketXl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd preinterpret<PacketXd, PacketXl>(const PacketXl& a) {
+  return __riscv_vreinterpret_v_i64m1_f64m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl preinterpret<PacketXl, PacketXd>(const PacketXd& a) {
+  return __riscv_vreinterpret_v_f64m1_i64m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketMul4Xl, PacketMul4Xd>(const PacketMul4Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketMul4Xd, PacketMul4Xl>(const PacketMul4Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits<PacketMul4Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd preinterpret<PacketMul4Xd, PacketMul4Xl>(const PacketMul4Xl& a) {
+  return __riscv_vreinterpret_v_i64m4_f64m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xl preinterpret<PacketMul4Xl, PacketMul4Xd>(const PacketMul4Xd& a) {
+  return __riscv_vreinterpret_v_f64m4_i64m4(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketMul2Xl, PacketMul2Xd>(const PacketMul2Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketMul2Xd, PacketMul2Xl>(const PacketMul2Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits<PacketMul2Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd preinterpret<PacketMul2Xd, PacketMul2Xl>(const PacketMul2Xl& a) {
+  return __riscv_vreinterpret_v_i64m2_f64m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xl preinterpret<PacketMul2Xl, PacketMul2Xd>(const PacketMul2Xd& a) {
+  return __riscv_vreinterpret_v_f64m2_i64m2(a);
+}
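The `PacketXl`/`PacketMul2Xl`/`PacketMul4Xl` names used by these 64-bit casts follow the convention established in PacketMath.h: the `riscv_rvv_vector_bits` attribute pins a scalable RVV type to the fixed length selected at compile time, with the LMUL factor scaling the bit width. For reference, the int64 family would be declared like this (mirroring the int32 and float16 typedefs shown elsewhere in the series; the exact lines are inferred, not quoted from the patch):

    // Fixed-length packet types for int64 at LMUL = 1, 2 and 4; the attribute
    // requires -mrvv-vector-bits so that EIGEN_RISCV64_RVV_VL is a constant.
    typedef vint64m1_t PacketXl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL)));
    typedef vint64m2_t PacketMul2Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2)));
    typedef vint64m4_t PacketMul4Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4)));
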
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketXl, PacketMul4Xl>(const PacketXl& a, const PacketXl& b,
+                                                               const PacketXl& c, const PacketXl& d) {
+  PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, a);
+  res = __riscv_vset_v_i64m1_i64m4(res, 1, b);
+  res = __riscv_vset_v_i64m1_i64m4(res, 2, c);
+  res = __riscv_vset_v_i64m1_i64m4(res, 3, d);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketXl, PacketMul4Xd>(const PacketXl& a, const PacketXl& b,
+                                                               const PacketXl& c, const PacketXl& d) {
+  PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0,
+                                                __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketXd>::size));
+  res = __riscv_vset_v_f64m1_f64m4(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<PacketXd>::size));
+  res = __riscv_vset_v_f64m1_f64m4(res, 2, __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits<PacketXd>::size));
+  res = __riscv_vset_v_f64m1_f64m4(res, 3, __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits<PacketXd>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketXd, PacketMul4Xd>(const PacketXd& a, const PacketXd& b,
+                                                               const PacketXd& c, const PacketXd& d) {
+  PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, a);
+  res = __riscv_vset_v_f64m1_f64m4(res, 1, b);
+  res = __riscv_vset_v_f64m1_f64m4(res, 2, c);
+  res = __riscv_vset_v_f64m1_f64m4(res, 3, d);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketXd, PacketMul4Xl>(const PacketXd& a, const PacketXd& b,
+                                                               const PacketXd& c, const PacketXd& d) {
+  PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketXl>::size));
+  res = __riscv_vset_v_i64m1_i64m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<PacketXl>::size));
+  res = __riscv_vset_v_i64m1_i64m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits<PacketXl>::size));
+  res = __riscv_vset_v_i64m1_i64m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits<PacketXl>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketXl, PacketMul2Xl>(const PacketXl& a, const PacketXl& b) {
+  PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0, a);
+  res = __riscv_vset_v_i64m1_i64m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketXl, PacketMul2Xd>(const PacketXl& a, const PacketXl& b) {
+  PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0,
+                                                __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketXd>::size));
+  res = __riscv_vset_v_f64m1_f64m2(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<PacketXd>::size));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketXd, PacketMul2Xd>(const PacketXd& a, const PacketXd& b) {
+  PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0, a);
+  res = __riscv_vset_v_f64m1_f64m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketXd, PacketMul2Xl>(const PacketXd& a, const PacketXd& b) {
+  PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0,
+                                                __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketXl>::size));
+  res = __riscv_vset_v_i64m1_i64m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<PacketXl>::size));
+  return res;
+}
+
+/********************************* 16 bits ************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketXs, PacketMul2Xs>(const PacketXs& a, const PacketXs& b) {
+  PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0, a);
+  res = __riscv_vset_v_i16m1_i16m2(res, 1, b);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketXs, PacketMul4Xs>(const PacketXs& a, const PacketXs& b,
+                                                               const PacketXs& c, const PacketXs& d) {
+  PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0, a);
+  res = __riscv_vset_v_i16m1_i16m4(res, 1, b);
+  res = __riscv_vset_v_i16m1_i16m4(res, 2, c);
+  res = __riscv_vset_v_i16m1_i16m4(res, 3, d);
+  return res;
+}
+
 }  // namespace internal
 }  // namespace Eigen
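Worth noting for anyone auditing the conversions in this file: every float-to-integer `pcast` uses the `_rtz` (round-toward-zero) intrinsics, so the vectorized path matches the truncation a scalar `static_cast` performs and Eigen's packetmath tests can compare the two directly. A two-line sanity check of that assumption (illustrative, not part of the test suite):

    #include <cassert>

    int main() {
      // vfcvt_rtz_x_f_v truncates toward zero, exactly like static_cast<int>.
      assert(static_cast<int>(2.7f) == 2);
      assert(static_cast<int>(-2.7f) == -2);
      return 0;
    }
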
diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h
index ba7d97a03..326c6eab3 100644
--- a/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/Eigen/src/Core/functors/UnaryFunctors.h
@@ -111,7 +111,11 @@ struct squared_norm_functor {
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+#if defined EIGEN_VECTORIZE_RVV10
+    return Packet(pmul(a.real, a.real), pmul(a.imag, a.imag));
+#else
     return Packet(pmul(a.v, a.v));
+#endif
   }
 };
 template
diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h
index ba72a8a4f..463f2bc17 100644
--- a/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -38,10 +38,19 @@ template <typename Index, typename LhsScalar, typename LhsMapper, bool Conjugat
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
+#ifdef EIGEN_RISCV64_USE_RVV10
+#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                                \
+  typedef typename std::conditional_t<NumTraits<LhsScalar>::IsComplex || NumTraits<RhsScalar>::IsComplex,  \
+      typename packet_traits<name##Scalar>::type,                                                          \
+      typename gemv_packet_cond<                                                                           \
+          packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+          typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type> name##Packet##postfix
+#else
 #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                               \
   typedef typename gemv_packet_cond<                                                                       \
       packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
       typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
+#endif
 
 PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
 PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index f9ba6c558..0766073c3 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -68,6 +68,8 @@
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
+#elif defined(EIGEN_RISCV64_USE_RVV10)
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #endif
@@ -104,7 +106,7 @@
 // Only static alignment is really problematic (relies on nonstandard compiler extensions),
 // try to keep heap alignment even when we have to disable static alignment.
 #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
-                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64 || EIGEN_ARCH_RISCV)
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@@ -414,9 +416,15 @@ extern "C" {
 #error "Eigen requires a fixed SVE vector length but EIGEN_ARM64_SVE_VL is not set."
 #endif
 
+#elif EIGEN_ARCH_RISCV
+
+#if defined(__riscv_zfh)
+#define EIGEN_HAS_RISCV64_FP16_SCALAR_ARITHMETIC
+#endif
+
 // We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV and
 // will not select the backend automatically
-#elif (defined EIGEN_RISCV64_USE_RVV10)
+#if (defined EIGEN_RISCV64_USE_RVV10)
 
 #define EIGEN_VECTORIZE
 #define EIGEN_VECTORIZE_RVV10
@@ -426,10 +434,26 @@ extern "C" {
 // to ensure a fixed length is set
 #if defined(__riscv_v_fixed_vlen)
 #define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#if __riscv_v_fixed_vlen >= 256
+#undef EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#endif
 #else
+#ifdef __GNUC__
 #error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=N is not set."
+#endif
 #endif
 
+#if defined(__riscv_zvfh) && defined(__riscv_zfh)
+#define EIGEN_VECTORIZE_RVV10FP16
+#elif defined(__riscv_zvfh)
+#error "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#endif
+
+#endif  // EIGEN_ARCH_RISCV
+
 #elif (defined __s390x__ && defined __VEC__)
 
 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 00d55577d..ad91668cb 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -404,6 +404,13 @@
 #define EIGEN_ARCH_PPC 0
 #endif
 
+/// \internal EIGEN_ARCH_RISCV set to 1 if the architecture is RISC-V.
+#if defined(__riscv)
+#define EIGEN_ARCH_RISCV 1
+#else
+#define EIGEN_ARCH_RISCV 0
+#endif
+
 //------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
 //------------------------------------------------------------------------------------------
diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h
index a0e160eba..e91a14e9d 100644
--- a/Eigen/src/Core/util/XprHelper.h
+++ b/Eigen/src/Core/util/XprHelper.h
@@ -264,7 +264,7 @@ struct functor_cost {
   static constexpr Index Cost = plain_enum_max(nested_functor_cost<Func>::Cost, 1);
 };
 
-template
+template
 struct packet_traits;
 
 template
@@ -285,9 +285,12 @@ struct find_best_packet_helper {
   typedef typename find_best_packet_helper<Size, typename unpacket_traits<PacketType>::half>::type type;
 };
 
-template
+template
+struct find_best_packet;
+
+template
 struct find_best_packet {
-  typedef typename find_best_packet_helper<Size, typename packet_traits<T>::type>::type type;
+  typedef typename find_best_packet_helper::type>::type type;
 };
 
 template () *
diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h
index 2686a5237..ffa66980c 100644
--- a/Eigen/src/Jacobi/Jacobi.h
+++ b/Eigen/src/Jacobi/Jacobi.h
@@ -301,12 +301,19 @@ template {
   static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) {
+#ifdef EIGEN_RISCV64_USE_RVV10
+    typedef typename std::conditional_t<NumTraits<Scalar>::IsComplex || NumTraits<OtherScalar>::IsComplex, typename packet_traits::type, typename packet_traits::type> Packet;
+    typedef typename std::conditional_t<NumTraits<Scalar>::IsComplex || NumTraits<OtherScalar>::IsComplex, typename packet_traits::type, typename packet_traits::type> OtherPacket;
+
+    constexpr Index PacketSize = unpacket_traits<Packet>::size;
+#else
     typedef typename packet_traits<Scalar>::type Packet;
     typedef typename packet_traits<OtherScalar>::type OtherPacket;
 
-    constexpr int RequiredAlignment =
-        (std::max)(unpacket_traits<Packet>::alignment, unpacket_traits<OtherPacket>::alignment);
     constexpr Index PacketSize = packet_traits<Scalar>::size;
+#endif
+    constexpr int RequiredAlignment =
+        (std::max)(unpacket_traits<Packet>::alignment, unpacket_traits<OtherPacket>::alignment);
 
     /*** dynamic-size vectorized paths ***/
     if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) {
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 76475923f..4259b61b1 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -1816,6 +1816,19 @@ EIGEN_DECLARE_TEST(packetmath) {
   CALL_SUBTEST_14((packetmath<bool, internal::packet_traits<bool>::type>()));
   CALL_SUBTEST_14((packetmath_scatter_gather<bool, internal::packet_traits<bool>::type>()));
   CALL_SUBTEST_15(test::runner::run());
+#ifdef EIGEN_RISCV64_USE_RVV10
+  CALL_SUBTEST_16((test::runner::type>::run()));
+  CALL_SUBTEST_17((test::runner::type>::run()));
+  CALL_SUBTEST_18((test::runner::type>::run()));
+  CALL_SUBTEST_19((test::runner::type>::run()));
+  CALL_SUBTEST_20((test::runner::type>::run()));
+  CALL_SUBTEST_21((test::runner::type>::run()));
+  CALL_SUBTEST_22((test::runner::type>::run()));
+  CALL_SUBTEST_23((test::runner::type>::run()));
+  CALL_SUBTEST_24((test::runner::type>::run()));
+  CALL_SUBTEST_25((test::runner::type>::run()));
+  CALL_SUBTEST_26((test::runner::type>::run()));
+#endif
   g_first_pass = false;
 }
 }
diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp
index 724fa40ba..02d46cef8 100644
--- a/test/vectorization_logic.cpp
+++ b/test/vectorization_logic.cpp
@@ -107,7 +107,11 @@ template <typename Scalar, bool Enable = internal::packet_traits<Scalar>::Vector
 struct vectorization_logic {
   typedef internal::packet_traits<Scalar> PacketTraits;
+#ifdef EIGEN_RISCV64_USE_RVV10
+  typedef typename internal::packet_traits::type PacketType;
+#else
   typedef typename internal::packet_traits<Scalar>::type PacketType;
+#endif
   typedef typename internal::unpacket_traits<PacketType>::half HalfPacketType;
   enum {
     PacketSize = internal::unpacket_traits<PacketType>::size,
-- 
GitLab


From 2c396b925f520501e86801acb29a5a5f5207f2f7 Mon Sep 17 00:00:00 2001
From: "kseniya.zaytseva"
Date: Wed, 12 Feb 2025 17:02:08 +0300
Subject: [PATCH 3/7] clang-format fix

---
 Eigen/src/Core/Redux.h                              |    2 +-
 Eigen/src/Core/arch/RVV10/Complex.h                 |  182 +--
 .../Core/arch/RVV10/GeneralBlockPanelKernel.h       |   30 +-
 Eigen/src/Core/arch/RVV10/PacketMath.h              | 1021 ++++++++++-------
 Eigen/src/Core/arch/RVV10/PacketMathFP16.h          |  216 ++--
 Eigen/src/Core/arch/RVV10/TypeCasting.h             |   68 +-
 Eigen/src/Core/products/GeneralMatrixVector.h       |   14 +-
 Eigen/src/Core/util/ConfigureVectorization.h        |    2 +-
 Eigen/src/Jacobi/Jacobi.h                           |   12 +-
 9 files changed, 885 insertions(+), 662 deletions(-)

diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 841d6349a..716a7c00e 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -29,7 +29,7 @@ namespace internal {
 template <typename Func, typename Evaluator>
 struct redux_traits {
  public:
-  #ifdef EIGEN_RISCV64_USE_RVV10
+#ifdef EIGEN_RISCV64_USE_RVV10
   typedef typename find_best_packet::type PacketType;
 #else
   typedef typename find_best_packet::type PacketType;
diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h
index 67e6dae82..968a4cc40 100644
--- a/Eigen/src/Core/arch/RVV10/Complex.h
+++ b/Eigen/src/Core/arch/RVV10/Complex.h
@@ -21,10 +21,9 @@ namespace internal {
 
 struct PacketXcf {
   EIGEN_STRONG_INLINE PacketXcf() {}
-  EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) {
-  }
-  EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) : real(__riscv_vget_v_f32m2_f32m1(a, 0)),
-                                                                  imag(__riscv_vget_v_f32m2_f32m1(a, 1)){}
+  EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) {}
+  EIGEN_STRONG_INLINE explicit PacketXcf(const 
PacketMul2Xf& a) + : real(__riscv_vget_v_f32m2_f32m1(a, 0)), imag(__riscv_vget_v_f32m2_f32m1(a, 1)) {} PacketXf real; PacketXf imag; }; @@ -105,8 +104,9 @@ EIGEN_STRONG_INLINE PacketXcf pnegate(const PacketXcf& a) { template <> EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { - return PacketXcf(a.real, __riscv_vreinterpret_v_u32m1_f32m1( - __riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), 0x80000000, unpacket_traits::size))); + return PacketXcf( + a.real, __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), + 0x80000000, unpacket_traits::size))); } template <> @@ -152,7 +152,7 @@ EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXc template <> EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); + return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); } template <> @@ -164,27 +164,30 @@ EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) template <> EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) { vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); - EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), + __riscv_vget_v_f32m1x2_f32m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... - return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... 
- return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); } template <> @@ -206,7 +209,8 @@ EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, template <> EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, Index stride) { - vfloat32m1x2_t res = __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); + vfloat32m1x2_t res = + __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); } @@ -259,8 +263,10 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i].real = __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); - kernel.packet[i].imag = __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].real = + __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); } } @@ -272,7 +278,7 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { // Computes the principal sqrt of the complex numbers in the input. // - // For example, for packets containing 2 complex numbers stored in + // For example, for packets containing 2 complex numbers stored in // [real0, real1, imag0, imag1] format // a = [a0, a1] = [x0, x1, y0, y1], // where x0 = real(a0), y0 = imag(a0) etc., this function returns @@ -287,7 +293,7 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { // By equating the real and imaginary parts we get: // u^2 - v^2 = x // 2*u*v = y. 
- // + // // For x >= 0, this has the numerically stable solution // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) // v = 0.5 * (y / u) @@ -362,7 +368,8 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { Packet result_is_nan = pisnan(result); result = por(result_is_nan, result); - return pselect(Packet(is_imag_inf,is_imag_inf), imag_inf_result, pselect(Packet(is_real_inf, is_real_inf), real_inf_result, result)); + return pselect(Packet(is_imag_inf, is_imag_inf), imag_inf_result, + pselect(Packet(is_real_inf, is_real_inf), real_inf_result, result)); } template @@ -372,7 +379,7 @@ EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { typedef typename packet_traits::type RealPacket; // log(sqrt(a^2 + b^2)), atan2(b, a) - RealPacket xlogr = plog(psqrt(padd(pmul(x.real,x.real), pmul(x.imag,x.imag)))); + RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); RealPacket ximg = patan2(x.imag, x.real); const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); @@ -387,45 +394,42 @@ EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { } template <> -EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) -{ +EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) { return psqrt_complex_rvv(a); } template <> -EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) -{ +EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) { return plog_complex_rvv(a); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { - return PacketXcf(Eigen::internal::pmul(x, pcast(y))); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { - return PacketXcf(Eigen::internal::pmul(pcast(x), y)); - } +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { + return PacketXcf(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { + return PacketXcf(Eigen::internal::pmul(pcast(x), y)); + } }; /********************************* double ************************************/ struct PacketXcd { EIGEN_STRONG_INLINE PacketXcd() {} - EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, const PacketXd& _imag) : real(_real), imag(_imag) { - } - EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) : real(__riscv_vget_v_f64m2_f64m1(a, 0)), - imag(__riscv_vget_v_f64m2_f64m1(a, 1)){} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, const PacketXd& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) + : real(__riscv_vget_v_f64m2_f64m1(a, 0)), imag(__riscv_vget_v_f64m2_f64m1(a, 1)) {} PacketXd real; PacketXd imag; }; @@ -506,8 +510,9 @@ EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { template 
<> EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { - return PacketXcd(a.real, __riscv_vreinterpret_v_u64m1_f64m1( - __riscv_vxor_vx_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); + return PacketXcd( + a.real, __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vx_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); } template <> @@ -553,7 +558,7 @@ EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXc template <> EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); + return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); } template <> @@ -565,27 +570,32 @@ EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* from) template <> EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) { vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); - EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), + __riscv_vget_v_f64m1x2_f64m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, unpacket_traits::size); + real_idx = + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, + unpacket_traits::size); PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... - return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, unpacket_traits::size); + real_idx = + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, + unpacket_traits::size); PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... 
- return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); } template <> @@ -606,18 +616,20 @@ EIGEN_STRONG_INLINE void pstoreu >(std::complex* to template <> EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, - Index stride) { - vfloat64m1x2_t res = __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); + Index stride) { + vfloat64m1x2_t res = + __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); } template <> EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::complex* to, const PacketXcd& from, - Index stride) { + Index stride) { vfloat64m1x2_t from_rvv_type = __riscv_vundefined_f64m1x2(); from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); - __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, unpacket_traits::size); + __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, + unpacket_traits::size); } template <> @@ -660,41 +672,41 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i].real = __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); - kernel.packet[i].imag = __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].real = + __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); } } template <> -EIGEN_STRONG_INLINE PacketXcd psqrt(const PacketXcd& a) -{ +EIGEN_STRONG_INLINE PacketXcd psqrt(const PacketXcd& a) { return psqrt_complex_rvv(a); } template <> -EIGEN_STRONG_INLINE PacketXcd plog(const PacketXcd& a) -{ +EIGEN_STRONG_INLINE PacketXcd plog(const PacketXcd& a) { return plog_complex_rvv(a); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { - return PacketXcd(Eigen::internal::pmul(x, pcast(y))); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { - return PacketXcd(Eigen::internal::pmul(pcast(x), y)); - } +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { + return PacketXcd(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const 
PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { + return PacketXcd(Eigen::internal::pmul(pcast(x), y)); + } }; } // end namespace internal diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h index 85803aa1f..212df434f 100644 --- a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h @@ -21,7 +21,7 @@ struct gebp_traits { typedef float RhsPacket; typedef QuadPacket RhsPacketx4; - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } @@ -49,7 +49,7 @@ struct gebp_traits { typedef double RhsPacket; typedef QuadPacket RhsPacketx4; - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } @@ -82,7 +82,7 @@ struct gebp_traits typedef PacketXh AccPacket; typedef QuadPacket RhsPacketx4; - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); } @@ -119,10 +119,10 @@ struct gebp_traits packet_size, typename packet_traits::type, typename packet_traits::half, \ typename unpacket_traits::half>::half>::type ScalarPacket - template -struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, PacketSize_> - : gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> { +struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, + PacketSize_> : gebp_traits, std::complex, ConjLhs_, ConjRhs_, + Architecture::Generic, PacketSize_> { typedef std::complex Scalar; typedef std::complex LhsScalar; typedef std::complex RhsScalar; @@ -133,7 +133,7 @@ struct gebp_traits, std::complex, ConjLhs_, PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(PacketSize_); - #undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR +#undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR enum { ConjLhs = ConjLhs_, @@ -244,9 +244,9 @@ struct gebp_traits, std::complex, ConjLhs_, EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const { c = cj.pmadd(a, b, c); - } - -protected: + } + + protected: conj_helper cj; }; @@ -255,11 +255,10 @@ protected: packet_size, typename packet_traits::type, typename packet_traits::half, \ typename unpacket_traits::half>::half>::type ScalarPacket##postfix - template +template class gebp_traits, false, ConjRhs_, Architecture::RVV10, PacketSize_> - : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> - { -public: + : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> { + public: typedef std::complex Scalar; typedef RealScalar LhsScalar; 
typedef Scalar RhsScalar; @@ -367,7 +366,7 @@ public: template class gebp_traits, RealScalar, ConjLhs_, false, Architecture::RVV10, PacketSize_> -: public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { + : public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; @@ -484,7 +483,6 @@ class gebp_traits, RealScalar, ConjLhs_, false, Archite conj_helper cj; r = cj.pmadd(c, alpha, r); } - }; } // namespace internal diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index 662ed9908..e4571e7a9 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -32,9 +32,13 @@ struct rvv_packet_size_selector { template struct rvv_packet_alignment_selector { - enum { - alignment = (VectorLength*VectorLMul) >= 1024 ? Aligned128 : ((VectorLength*VectorLMul) >= 512 ? Aligned64 : ((VectorLength*VectorLMul) >= 256 ? Aligned32 : Aligned16)) - }; + enum { + alignment = + (VectorLength * VectorLMul) >= 1024 + ? Aligned128 + : ((VectorLength * VectorLMul) >= 512 ? Aligned64 + : ((VectorLength * VectorLMul) >= 256 ? Aligned32 : Aligned16)) + }; }; typedef vbool64_t PacketMask64; @@ -47,11 +51,11 @@ typedef vbool4_t PacketMask4; typedef vint32m1_t PacketXi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vuint32m1_t PacketXu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vint32m2_t PacketMul2Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); -typedef vuint32m2_t PacketMul2Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vint32m2_t PacketMul2Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint32m2_t PacketMul2Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); -typedef vint32m4_t PacketMul4Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); -typedef vuint32m4_t PacketMul4Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vint32m4_t PacketMul4Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint32m4_t PacketMul4Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); template <> struct packet_traits : default_packet_traits { @@ -298,17 +302,19 @@ EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b template <> EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) { - return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) { - return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { - return __riscv_vreinterpret_i32m1(__riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i32m1( + __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); } template @@ -329,7 +335,8 @@ EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) { template <> EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) { PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, 
unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); } @@ -369,21 +376,22 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) { template <> EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { - PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) { PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits::size); - return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, - __riscv_vmv_v_x_i32m1(0, unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -393,19 +401,19 @@ EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { PacketXi half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits::size); prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits::size); prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits::size); prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); } // Last reduction - half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits::size); prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); // The reduction is done to the first element. 
@@ -414,17 +422,16 @@ EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { template <> EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), - unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template @@ -436,7 +443,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -558,17 +566,19 @@ EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const template <> EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { - return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { - return __riscv_vreinterpret_i32m4(__riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i32m4( + __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); } template @@ -589,7 +599,8 @@ EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* fro template <> EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); } @@ -612,13 +623,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, + Index stride) { return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, - Index stride) { + Index stride) { __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } @@ -629,45 +641,46 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) template <> EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { - PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); - return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(a, - __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { - PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), - __riscv_vget_v_i32m4_i32m1(a, 1), unpacket_traits::size); - PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), - __riscv_vget_v_i32m4_i32m1(a, 3), unpacket_traits::size); + PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), + unpacket_traits::size); + PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -679,7 +692,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& 
kernel) { __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -801,17 +815,19 @@ EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const template <> EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { - return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { - return __riscv_vreinterpret_i32m2(__riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i32m2( + __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); } template @@ -832,7 +848,8 @@ EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* fro template <> EIGEN_STRONG_INLINE PacketMul2Xi ploaddup(const numext::int32_t* from) { PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
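On the plogical_shift_right hunks above: vsra on a signed vector type is an arithmetic shift, so the logical variant round-trips through the unsigned type with vsrl and reinterprets back, which generates no extra instructions. A standalone sketch of the m1 case (illustrative name):

#include <riscv_vector.h>

// Logical (zero-filling) right shift of a signed vector: reinterpret
// to unsigned, vsrl, reinterpret back. The reinterprets are free.
template <int N>
static inline vint32m1_t logical_shift_right_i32m1(vint32m1_t a) {
  size_t vl = __riscv_vsetvlmax_e32m1();
  return __riscv_vreinterpret_v_u32m1_i32m1(
      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_v_i32m1_u32m1(a), N, vl));
}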
return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); } @@ -855,13 +872,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, + Index stride) { return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from, - Index stride) { + Index stride) { __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); } @@ -872,42 +890,43 @@ EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul2Xi& a) template <> EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) { - PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); - return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(a, - __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& a) { - return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), - __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(a, - __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -919,29 +938,34 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], 
unpacket_traits::size); } } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketMul2Xi>::type predux_half_dowto4(const PacketMul4Xi& a) { - return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), - __riscv_vget_v_i32m4_i32m2(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xi>::type + predux_half_dowto4(const PacketMul4Xi& a) { + return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXi>::type predux_half_dowto4(const PacketMul2Xi& a) { - return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), - __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXi>::type + predux_half_dowto4(const PacketMul2Xi& a) { + return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size); } /********************************* float32 ************************************/ typedef vfloat32m1_t PacketXf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vfloat32m2_t PacketMul2Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); -typedef vfloat32m4_t PacketMul4Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vfloat32m2_t PacketMul2Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vfloat32m4_t PacketMul4Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); template <> struct packet_traits : default_packet_traits { @@ -1069,7 +1093,7 @@ struct unpacket_traits { typedef PacketXf half; // Half not yet implemented typedef PacketXi integer_packet; typedef numext::uint8_t mask_t; - + enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, @@ -1140,7 +1164,8 @@ EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { template <> EIGEN_STRONG_INLINE PacketXf plset(const float& a) { - PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(unpacket_traits::size), unpacket_traits::size); + PacketXf idx = + __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(unpacket_traits::size), unpacket_traits::size); return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); } @@ -1199,7 +1224,7 @@ EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f32m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1219,7 +1244,7 @@ EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); 
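The mask computation being re-wrapped in these pmin/pmax overloads implements the NaN-propagating variants: a == a is false exactly on NaN lanes, so anding the two self-comparisons yields a mask of lanes where both inputs are numbers. The _tum (tail and masked-off lanes undisturbed) form of vfmin/vfmax then writes the result only in those lanes and leaves the quiet-NaN destination everywhere else. A self-contained sketch of the f32 m1 min (illustrative name):

#include <riscv_vector.h>
#include <limits>

// min(a, b) that returns NaN wherever a or b is NaN: build the
// "both are numbers" mask, then vfmin with the tail-undisturbed /
// mask-undisturbed policy over a destination pre-filled with NaNs.
static inline vfloat32m1_t min_propagate_nan_f32m1(vfloat32m1_t a, vfloat32m1_t b) {
  size_t vl = __riscv_vsetvlmax_e32m1();
  vfloat32m1_t nans = __riscv_vfmv_v_f_f32m1(std::numeric_limits<float>::quiet_NaN(), vl);
  vbool32_t a_ok = __riscv_vmfeq_vv_f32m1_b32(a, a, vl);  // false on NaN lanes
  vbool32_t b_ok = __riscv_vmfeq_vv_f32m1_b32(b, b, vl);
  vbool32_t ok = __riscv_vmand_mm_b32(a_ok, b_ok, vl);
  return __riscv_vfmin_vv_f32m1_tum(ok, nans, a, b, vl);
}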
return __riscv_vfmax_vv_f32m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1261,23 +1286,28 @@ EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const P // Logical Operations are not supported for float, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a), - __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -1293,7 +1323,8 @@ EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); } @@ -1301,7 +1332,7 @@ template <> EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); } template <> @@ -1360,7 +1391,8 @@ EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { template <> EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { - PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); } @@ -1371,9 +1403,8 @@ EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& expon template 
<> EIGEN_STRONG_INLINE float predux(const PacketXf& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(a, - __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), unpacket_traits::size)); } template <> @@ -1383,19 +1414,19 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { PacketXf half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); } // Last reduction - half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); // The reduction is done to the first element. @@ -1404,16 +1435,16 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(a, - __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template @@ -1426,7 +1457,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -1464,7 +1496,8 @@ EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t fr template <> EIGEN_STRONG_INLINE PacketMul4Xf plset(const float& a) { - PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vid_v_i32m4(unpacket_traits::size), unpacket_traits::size); + PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vid_v_i32m4(unpacket_traits::size), + unpacket_traits::size); return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); } @@ -1520,10 +1553,11 @@ EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4X template <> EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMul4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul4Xf nans = + 
__riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f32m4_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1540,10 +1574,11 @@ EIGEN_STRONG_INLINE PacketMul4Xf pmin(const Pack template <> EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMul4Xf nans = __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f32m4_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1561,19 +1596,22 @@ EIGEN_STRONG_INLINE PacketMul4Xf pmax(const Pack template <> EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) { PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) { PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) { PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> @@ -1585,23 +1623,31 @@ EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf // Logical Operations are not supported for float, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return 
__riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), __riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -1617,7 +1663,8 @@ EIGEN_STRONG_INLINE PacketMul4Xf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE PacketMul4Xf ploaddup(const float* from) { PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); } @@ -1625,7 +1672,7 @@ template <> EIGEN_STRONG_INLINE PacketMul4Xf ploadquad(const float* from) { PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); } template <> @@ -1684,7 +1731,9 @@ EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { template <> EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { - PacketMul4Xu idx = __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); } @@ -1695,32 +1744,31 @@ EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, Pac template <> EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1(a, - __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { - PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), - __riscv_vget_v_f32m4_f32m1(a, 1), unpacket_traits::size); - PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), - __riscv_vget_v_f32m4_f32m1(a, 3), unpacket_traits::size); + PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), + unpacket_traits::size); + PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), + unpacket_traits::size); return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { - return 
__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1(a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1(a, - __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -1733,7 +1781,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -1771,7 +1820,8 @@ EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits(numext::uint32_t fr template <> EIGEN_STRONG_INLINE PacketMul2Xf plset(const float& a) { - PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vid_v_i32m2(unpacket_traits::size), unpacket_traits::size); + PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vid_v_i32m2(unpacket_traits::size), + unpacket_traits::size); return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); } @@ -1827,10 +1877,11 @@ EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2X template <> EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMul2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f32m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1847,10 +1898,11 @@ EIGEN_STRONG_INLINE PacketMul2Xf pmin(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMul2Xf nans = __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f32m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -1868,19 +1920,22 @@ EIGEN_STRONG_INLINE PacketMul2Xf pmax(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) { PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const 
PacketMul2Xf& a, const PacketMul2Xf& b) { PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) { PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> @@ -1892,23 +1947,31 @@ EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf // Logical Operations are not supported for float, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), - __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -1924,7 +1987,8 @@ EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { template <> EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); } @@ -1932,7 +1996,7 @@ template <> EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); + return 
__riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); } template <> @@ -1991,7 +2055,9 @@ EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { template <> EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { - PacketMul2Xu idx = __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); } @@ -2002,29 +2068,28 @@ EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, Pac template <> EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1(a, - __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { - return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), - __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1(a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1(a, - __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -2037,7 +2102,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -2047,17 +2113,21 @@ EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, con } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketMul2Xf>::type predux_half_dowto4(const PacketMul4Xf& a) { - return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), - __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xf>::type + predux_half_dowto4(const PacketMul4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXf>::type predux_half_dowto4(const PacketMul2Xf& a) { - return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), - __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename 
std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXf>::type + predux_half_dowto4(const PacketMul2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size); } /********************************* int64 **************************************/ @@ -2065,11 +2135,11 @@ PacketXf>::type predux_half_dowto4(const PacketMul2Xf& a) { typedef vint64m1_t PacketXl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vuint64m1_t PacketXul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vint64m2_t PacketMul2Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); -typedef vuint64m2_t PacketMul2Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vint64m2_t PacketMul2Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint64m2_t PacketMul2Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); -typedef vint64m4_t PacketMul4Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); -typedef vuint64m4_t PacketMul4Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vint64m4_t PacketMul4Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint64m4_t PacketMul4Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); template <> struct packet_traits : default_packet_traits { @@ -2316,17 +2386,19 @@ EIGEN_STRONG_INLINE PacketXl pxor(const PacketXl& a, const PacketXl& b template <> EIGEN_STRONG_INLINE PacketXl pandnot(const PacketXl& a, const PacketXl& b) { - return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) { - return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) { - return __riscv_vreinterpret_i64m1(__riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i64m1( + __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); } template @@ -2347,7 +2419,8 @@ EIGEN_STRONG_INLINE PacketXl ploadu(const numext::int64_t* from) { template <> EIGEN_STRONG_INLINE PacketXl ploaddup(const numext::int64_t* from) { PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
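Note the shift count in the 64-bit ploaddup hunks: the lane id is masked with ~1 exactly as in the 32-bit case, but shifted left by 2 rather than 1, because the byte offset is now (i/2)*sizeof(int64_t), giving 0, 0, 8, 8, 16, 16, ... A sketch mirroring the 32-bit one earlier (illustrative name):

#include <riscv_vector.h>
#include <stdint.h>

// 64-bit ploaddup: byte offset (i & ~1) << 2 == (i / 2) * sizeof(int64_t).
static inline vint64m1_t loaddup_i64m1(const int64_t* from) {
  size_t vl = __riscv_vsetvlmax_e64m1();
  vuint64m1_t idx = __riscv_vid_v_u64m1(vl);
  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, vl), 2, vl);
  return __riscv_vloxei64_v_i64m1(from, idx, vl);
}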
return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); } @@ -2355,7 +2428,9 @@ EIGEN_STRONG_INLINE PacketXl ploaddup(const numext::int64_t* from) { template <> EIGEN_STRONG_INLINE PacketXl ploadquad(const numext::int64_t* from) { PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size);; + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + ; return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); } @@ -2387,21 +2462,22 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketXl& a) { template <> EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) { - PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) { PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketXl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, - __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -2411,15 +2487,15 @@ EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) { PacketXl half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } @@ -2429,17 +2505,16 @@ EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) { template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketXl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), - unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketXl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( + a, 
__riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template @@ -2451,7 +2526,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -2573,17 +2649,19 @@ EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const template <> EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { - return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { - return __riscv_vreinterpret_i64m4(__riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i64m4( + __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); } template @@ -2604,7 +2682,8 @@ EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* fro template <> EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
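The ptranspose kernels re-wrapped throughout this patch all share one scheme: store packet i with an element stride of N starting at buffer[i], which lays the block down column-major, then reload the rows contiguously. A compilable sketch; it uses the same riscv_rvv_vector_bits attribute as the patch so packets can live in an array, hard-codes a 128-bit VLEN (build with e.g. -mrvv-vector-bits=128), and all names are illustrative:

#include <riscv_vector.h>
#include <stdint.h>

// Fixed-length RVV type, as in the patch, so it can be an array element.
typedef vint64m1_t fixed_i64m1 __attribute__((riscv_rvv_vector_bits(128)));

// In-place transpose of N packets via a scratch buffer: strided stores
// write the block column by column, contiguous loads read it back row
// by row (a square transpose when N equals the lane count).
template <int N>
static inline void transpose_i64m1(fixed_i64m1 (&p)[N]) {
  const int lanes = 128 / 64;  // e64m1 lanes at VLEN = 128
  int64_t buffer[N * lanes];
  for (int i = 0; i < N; i++)
    __riscv_vsse64_v_i64m1(&buffer[i], N * sizeof(int64_t), p[i], lanes);
  for (int i = 0; i < N; i++)
    p[i] = __riscv_vle64_v_i64m1(&buffer[i * lanes], lanes);
}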
return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); } @@ -2612,7 +2691,8 @@ EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* f template <> EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); } @@ -2627,13 +2707,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, + Index stride) { return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, - Index stride) { + Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } @@ -2644,45 +2725,46 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) template <> EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { - PacketMul4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1(a, - __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { - PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), - __riscv_vget_v_i64m4_i64m1(a, 1), unpacket_traits::size); - PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), - __riscv_vget_v_i64m4_i64m1(a, 3), unpacket_traits::size); + PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), + unpacket_traits::size); + PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( + a, 
__riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -2694,7 +2776,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -2816,17 +2899,19 @@ EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const template <> EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { - return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { - return __riscv_vreinterpret_i64m2(__riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i64m2( + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); } template @@ -2847,7 +2932,8 @@ EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* fro template <> EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
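Unlike predux_mul, the sum/min/max reductions map to single instructions (vredsum/vredmin/vredmax). One subtlety visible in the hunks above: the second operand is an m1 vector whose lane 0 supplies the identity element, and only that lane participates, so the vl used to materialize it merely has to be at least 1, which is why the loose size / 4 works for both the m2 and m4 variants. A standalone sketch (illustrative name):

#include <riscv_vector.h>
#include <stdint.h>

// Sum of all lanes of an i64 m2 vector: vredsum folds the vl source
// lanes plus lane 0 of the identity operand into lane 0 of an m1
// result, which vmv.x.s then reads out.
static inline int64_t sum_i64m2(vint64m2_t a) {
  size_t vl = __riscv_vsetvlmax_e64m2();
  vint64m1_t identity = __riscv_vmv_v_x_i64m1(0, 1);  // only lane 0 matters
  return __riscv_vmv_x_s_i64m1_i64(__riscv_vredsum_vs_i64m2_i64m1(a, identity, vl));
}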
return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } @@ -2855,7 +2941,8 @@ EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* f template <> EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } @@ -2870,13 +2957,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, + Index stride) { return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, - Index stride) { + Index stride) { __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } @@ -2887,44 +2975,43 @@ EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) template <> EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { - PacketMul2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), - mask, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1(a, - __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { - return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), - __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1(a, - __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return 
__riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -2936,29 +3023,34 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketMul2Xl>::type predux_half_dowto4(const PacketMul4Xl& a) { - return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), - __riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xl>::type + predux_half_dowto4(const PacketMul4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXl>::type predux_half_dowto4(const PacketMul2Xl& a) { - return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), - __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXl>::type + predux_half_dowto4(const PacketMul2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size); } /********************************* double ************************************/ typedef vfloat64m1_t PacketXd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vfloat64m2_t PacketMul2Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); -typedef vfloat64m4_t PacketMul4Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vfloat64m2_t PacketMul2Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vfloat64m4_t PacketMul4Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); template <> struct packet_traits : default_packet_traits { @@ -3074,7 +3166,7 @@ struct unpacket_traits { typedef PacketXd half; // Half not yet implemented typedef PacketXl integer_packet; typedef numext::uint8_t mask_t; - + enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, @@ -3145,7 +3237,8 @@ EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { template <> EIGEN_STRONG_INLINE PacketXd plset(const double& a) { - PacketXd idx = __riscv_vfcvt_f_x_v_f64m1(__riscv_vid_v_i64m1(unpacket_traits::size), unpacket_traits::size); + PacketXd idx = + __riscv_vfcvt_f_x_v_f64m1(__riscv_vid_v_i64m1(unpacket_traits::size), unpacket_traits::size); return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); } @@ -3204,7 +3297,7 @@ EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, 
unpacket_traits::size); return __riscv_vfmin_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3224,7 +3317,7 @@ EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3266,23 +3359,28 @@ EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan(const PacketXd& a, const P // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketXd pand(const PacketXd& a, const PacketXd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd por(const PacketXd& a, const PacketXd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd pxor(const PacketXd& a, const PacketXd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXd pandnot(const PacketXd& a, const PacketXd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(a), - __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -3298,15 +3396,18 @@ EIGEN_STRONG_INLINE PacketXd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd ploadquad(const double* from) { PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size);; - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); + idx = 
__riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1,
+                              unpacket_traits::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size);
 }
 
 template <>
@@ -3365,7 +3466,8 @@ EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
-  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size);
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size),
+                                         unpacket_traits::size - 1, unpacket_traits::size);
   return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size);
 }
 
@@ -3376,9 +3478,8 @@ EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& expon
 
 template <>
 EIGEN_STRONG_INLINE double predux(const PacketXd& a) {
-  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(a,
-      __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size),
-      unpacket_traits::size));
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size));
 }
 
 template <>
@@ -3388,15 +3489,15 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) {
   PacketXd half_prod;
 
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod =  __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size);
     prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod =  __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size);
     prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod =  __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size);
     prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
   }
 
@@ -3406,16 +3507,16 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) {
 
 template <>
 EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) {
-  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(a,
-      __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size),
-      unpacket_traits::size));
+  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
 }
 
 template <>
 EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) {
-  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(a,
-      __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size),
-      unpacket_traits::size));
+  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
 }
 
 template
@@ -3428,7 +3529,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) {
   }
 
   for (i = 0; i < N; i++) {
-    kernel.packet[i] = __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size);
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size);
   }
 }
 
@@ -3466,7 +3568,8 @@ EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t fr
 
 template <>
 EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) {
-  PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vid_v_i64m4(unpacket_traits::size), unpacket_traits::size);
+  PacketMul4Xd idx = 
__riscv_vfcvt_f_x_v_f64m4(__riscv_vid_v_i64m4(unpacket_traits::size), + unpacket_traits::size); return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); } @@ -3522,10 +3625,11 @@ EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4X template <> EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMul4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3542,10 +3646,11 @@ EIGEN_STRONG_INLINE PacketMul4Xd pmin(const Pack template <> EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMul4Xd nans = __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3563,19 +3668,22 @@ EIGEN_STRONG_INLINE PacketMul4Xd pmax(const Pack template <> EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) { PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) { PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) { PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> @@ -3587,23 +3695,31 @@ EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketMul4Xd pand(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd por(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return 
__riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), __riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -3619,15 +3735,17 @@ EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); - return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); } template <> @@ -3686,7 +3804,9 @@ EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { template <> EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { - PacketMul4Xul idx = __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); } @@ -3697,32 +3817,31 @@ EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, Pac template <> EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1(a, - __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { - PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), 
- __riscv_vget_v_f64m4_f64m1(a, 1), unpacket_traits::size); - PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), - __riscv_vget_v_f64m4_f64m1(a, 3), unpacket_traits::size); + PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), + unpacket_traits::size); + PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), + unpacket_traits::size); return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1(a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1(a, - __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -3735,7 +3854,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -3773,7 +3893,8 @@ EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t fr template <> EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { - PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vid_v_i64m2(unpacket_traits::size), unpacket_traits::size); + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vid_v_i64m2(unpacket_traits::size), + unpacket_traits::size); return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); } @@ -3829,10 +3950,11 @@ EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2X template <> EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3849,10 +3971,11 @@ EIGEN_STRONG_INLINE PacketMul2Xd pmin(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, 
unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -3870,19 +3993,22 @@ EIGEN_STRONG_INLINE PacketMul2Xd pmax(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> @@ -3894,23 +4020,31 @@ EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd // Logical Operations are not supported for double, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -3926,15 +4060,17 @@ EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { template <> EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { PacketMul2Xul idx = 
__riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); } template <> @@ -3993,7 +4129,9 @@ EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { template <> EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { - PacketMul2Xul idx = __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); } @@ -4004,29 +4142,28 @@ EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, Pac template <> EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1(a, - __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { - return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), - __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1(a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1(a, - __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -4039,7 +4176,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -4049,17 +4187,21 @@ EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, con } template -EIGEN_STRONG_INLINE typename 
std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketMul2Xd>::type predux_half_dowto4(const PacketMul4Xd& a) { - return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), - __riscv_vget_v_f64m4_f64m2(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xd>::type + predux_half_dowto4(const PacketMul4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXd>::type predux_half_dowto4(const PacketMul2Xd& a) { - return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), - __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXd>::type + predux_half_dowto4(const PacketMul2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size); } /********************************* short **************************************/ @@ -4067,11 +4209,11 @@ PacketXd>::type predux_half_dowto4(const PacketMul2Xd& a) { typedef vint16m1_t PacketXs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vuint16m1_t PacketXsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vint16m2_t PacketMul2Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); -typedef vuint16m2_t PacketMul2Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vint16m2_t PacketMul2Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint16m2_t PacketMul2Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); -typedef vint16m4_t PacketMul4Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); -typedef vuint16m4_t PacketMul4Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*4))); +typedef vint16m4_t PacketMul4Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint16m4_t PacketMul4Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); template <> struct packet_traits : default_packet_traits { @@ -4318,17 +4460,19 @@ EIGEN_STRONG_INLINE PacketXs pxor(const PacketXs& a, const PacketXs& b template <> EIGEN_STRONG_INLINE PacketXs pandnot(const PacketXs& a, const PacketXs& b) { - return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) { - return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) { - return __riscv_vreinterpret_i16m1(__riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i16m1( + __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); } template @@ -4357,7 +4501,8 @@ EIGEN_STRONG_INLINE PacketXs ploaddup(const numext::int16_t* from) { template <> EIGEN_STRONG_INLINE PacketXs ploadquad(const numext::int16_t* from) { PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, 
unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); } @@ -4389,21 +4534,22 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketXs& a) { template <> EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) { - PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) { PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int16_t predux(const PacketXs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, - __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> @@ -4413,22 +4559,22 @@ EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketXs& a) { PacketXs half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); } // Last reduction - half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); - half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); // The reduction is done to the first element. 
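// A scalar sketch (illustrative only: `reduce_mul_model` and the
// power-of-two length assumption are ours, not part of the patch) of the
// slide-down scheme the predux_mul specializations above rely on: each step
// multiplies the low half of the active elements by the high half slid down
// on top of it, until the product lands in element 0.
template <typename T>
T reduce_mul_model(T* v, int n) {  // n: packet size, assumed a power of two
  for (int half = n / 2; half > 0; half /= 2)
    // models: half_prod = vslidedown(prod, half); prod = vmul(prod, half_prod)
    for (int i = 0; i < half; ++i) v[i] *= v[i + half];
  return v[0];  // the reduction ends in element 0, read out with __riscv_vmv_x
}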
@@ -4437,17 +4583,16 @@ EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketXs& a) { template <> EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketXs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(a, - __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), - unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketXs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(a, - __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template @@ -4459,7 +4604,8 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } @@ -4581,17 +4727,19 @@ EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const template <> EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { - return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { - return __riscv_vreinterpret_i16m4(__riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i16m4( + __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); } template @@ -4620,7 +4768,8 @@ EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* f template <> EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); } @@ -4635,13 +4784,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, + Index stride) { return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, - Index stride) { + Index stride) { __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } @@ -4652,45 +4802,46 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) template <> EIGEN_STRONG_INLINE 
PacketMul4Xs preverse(const PacketMul4Xs& a) { - PacketMul4Xsu idx = __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul4Xsu idx = + __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(a, - __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { - PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), - __riscv_vget_v_i16m4_i16m1(a, 1), unpacket_traits::size); - PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), - __riscv_vget_v_i16m4_i16m1(a, 3), unpacket_traits::size); + PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), + unpacket_traits::size); + PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), + unpacket_traits::size); return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(a, - __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(a, - __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -4702,11 +4853,11 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } - /********************************* PacketMul2Xs ************************************/ template <> @@ -4825,17 +4976,19 @@ EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const template <> EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), unpacket_traits::size); + return __riscv_vand_vv_i16m2(a, 
__riscv_vnot_v_i16m2(b, unpacket_traits::size), + unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) { - return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); + return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); } template EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) { - return __riscv_vreinterpret_i16m2(__riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); + return __riscv_vreinterpret_i16m2( + __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); } template @@ -4864,7 +5017,8 @@ EIGEN_STRONG_INLINE PacketMul2Xs ploaddup(const numext::int16_t* f template <> EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); } @@ -4879,13 +5033,14 @@ EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Pac } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, Index stride) { +EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, + Index stride) { return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, - Index stride) { + Index stride) { __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } @@ -4896,42 +5051,43 @@ EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) template <> EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { - PacketMul2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(a, - __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { - return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), - __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1(a, - 
__riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), - unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(a, - __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); } template @@ -4943,22 +5099,27 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i] = + __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketMul2Xs>::type predux_half_dowto4(const PacketMul4Xs& a) { - return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), - __riscv_vget_v_i16m4_i16m2(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xs>::type + predux_half_dowto4(const PacketMul4Xs& a) { + return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXs>::type predux_half_dowto4(const PacketMul2Xs& a) { - return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), - __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXs>::type + predux_half_dowto4(const PacketMul2Xs& a) { + return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size); } } // namespace internal diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index c1401bdaa..8b8c4ab38 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); -typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL*2))); +typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); template <> struct packet_traits : default_packet_traits { @@ -105,7 +105,7 @@ struct unpacket_traits { typedef PacketXh half; // Half not yet implemented typedef PacketXs integer_packet; typedef numext::uint8_t mask_t; - + enum { size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, @@ -160,7 +160,8 @@ EIGEN_STRONG_INLINE PacketXh pset1frombits(numext::uint16_t from) { template <> EIGEN_STRONG_INLINE PacketXh plset(const Eigen::half& a) { - PacketXh idx = __riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); + PacketXh idx = + 
__riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); return __riscv_vfadd_vf_f16m1(idx, a, unpacket_traits::size); } @@ -216,10 +217,11 @@ EIGEN_STRONG_INLINE PacketXh pnmsub(const PacketXh& a, const PacketXh& b, const template <> EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { - PacketXh nans = __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -236,10 +238,11 @@ EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, template <> EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { - PacketXh nans = __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); } @@ -281,33 +284,40 @@ EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const P // Logical Operations are not supported for half, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketXh pand(const PacketXh& a, const PacketXh& b) { - return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXh por(const PacketXh& a, const PacketXh& b) { - return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXh pxor(const PacketXh& a, const PacketXh& b) { - return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXh pandnot(const PacketXh& a, const PacketXh& b) { - return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(a), - __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), + 
__riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXh pload(const Eigen::half* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXh ploadu(const Eigen::half* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); } template <> @@ -320,23 +330,27 @@ EIGEN_STRONG_INLINE PacketXh ploaddup(const Eigen::half* from) { template <> EIGEN_STRONG_INLINE PacketXh ploadquad(const Eigen::half* from) { PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); - return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketXh& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketXh& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketXh pgather(const Eigen::half* from, Index stride) { - return __riscv_vlse16_v_f16m1(reinterpret_cast(from), stride * sizeof(Eigen::half), unpacket_traits::size); + return __riscv_vlse16_v_f16m1(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); } template <> @@ -380,15 +394,15 @@ EIGEN_STRONG_INLINE PacketXh pfloor(const PacketXh& a) { template <> EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) { - PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Eigen::half predux(const PacketXh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1(a, - __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size), unpacket_traits::size))); } template <> @@ -398,22 +412,22 @@ EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketXh& a) { PacketXh half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits::size); prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); } 
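  // Each guarded step folds the active part of `prod` in half: vslidedown
  // moves the upper elements down by the given offset and vfmul multiplies
  // them into the lower half, so wider fixed-VLEN builds only add extra
  // halving steps before the final slides by 2 and 1.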
if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits::size); prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits::size); prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); } // Last reduction - half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits::size); prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); - half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits::size); prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits::size); // The reduction is done to the first element. @@ -422,16 +436,16 @@ EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketXh& a) { template <> EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketXh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1(a, - __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); } template <> EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketXh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1(a, - __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); } template @@ -440,16 +454,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], unpacket_traits::size); + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), unpacket_traits::size); + kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); } } EIGEN_STRONG_INLINE PacketMul2Xf half2float(const PacketXh& a) { - return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); + return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); } EIGEN_STRONG_INLINE PacketXh float2half(const PacketMul2Xf& a) { @@ -485,7 +501,8 @@ EIGEN_STRONG_INLINE PacketMul2Xh pset1frombits(numext::uint16_t fr template <> EIGEN_STRONG_INLINE PacketMul2Xh plset(const Eigen::half& a) { - PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), unpacket_traits::size); + PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), + unpacket_traits::size); return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits::size); } @@ -541,10 +558,11 @@ EIGEN_STRONG_INLINE PacketMul2Xh pnmsub(const PacketMul2Xh& a, const PacketMul2X template <> EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, 
const PacketMul2Xh& b) { - PacketMul2Xh nans = __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -561,10 +579,11 @@ EIGEN_STRONG_INLINE PacketMul2Xh pmin(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { - PacketMul2Xh nans = __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); } @@ -582,19 +601,22 @@ EIGEN_STRONG_INLINE PacketMul2Xh pmax(const Pack template <> EIGEN_STRONG_INLINE PacketMul2Xh pcmp_le(const PacketMul2Xh& a, const PacketMul2Xh& b) { PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt(const PacketMul2Xh& a, const PacketMul2Xh& b) { PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, const PacketMul2Xh& b) { PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> @@ -606,33 +628,43 @@ EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh // Logical Operations are not supported for half, so reinterpret casts template <> EIGEN_STRONG_INLINE PacketMul2Xh pand(const PacketMul2Xh& a, const PacketMul2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xh por(const PacketMul2Xh& a, const PacketMul2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xh 
pxor(const PacketMul2Xh& a, const PacketMul2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), __riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xh pandnot(const PacketMul2Xh& a, const PacketMul2Xh& b) { - return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), - __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), unpacket_traits::size)); + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xh pload(const Eigen::half* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xh ploadu(const Eigen::half* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); } template <> @@ -645,28 +677,34 @@ EIGEN_STRONG_INLINE PacketMul2Xh ploaddup(const Eigen::half* from) template <> EIGEN_STRONG_INLINE PacketMul2Xh ploadquad(const Eigen::half* from) { PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, unpacket_traits::size); - return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketMul2Xh& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketMul2Xh& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketMul2Xh pgather(const Eigen::half* from, Index stride) { - return __riscv_vlse16_v_f16m2(reinterpret_cast(from), stride * sizeof(Eigen::half), unpacket_traits::size); + return __riscv_vlse16_v_f16m2(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketMul2Xh& from, Index stride) { - __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketMul2Xh& from, + Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, 
+ unpacket_traits::size); } template <> @@ -705,35 +743,36 @@ EIGEN_STRONG_INLINE PacketMul2Xh pfloor(const PacketMul2Xh& a) { template <> EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) { - PacketMul2Xsu idx = __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), unpacket_traits::size-1, unpacket_traits::size); + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE Eigen::half predux(const PacketMul2Xh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1(a, - __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size / 4), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size / 4), unpacket_traits::size))); } template <> EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketMul2Xh& a) { - return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), - __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size)); + return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketMul2Xh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1(a, - __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); } template <> EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketMul2Xh& a) { - return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1(a, - __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size))); + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); } template @@ -742,16 +781,19 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], unpacket_traits::size); + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); } for (i = 0; i < N; i++) { - kernel.packet[i] = __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), unpacket_traits::size); + kernel.packet[i] = + __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); } } EIGEN_STRONG_INLINE PacketMul4Xf half2float(const PacketMul2Xh& a) { - return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits::size); + return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits::size); } EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) { @@ -759,10 +801,12 @@ EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) { } template -EIGEN_STRONG_INLINE typename std::enable_if::value && (unpacket_traits::size % 8) == 0, -PacketXh>::type predux_half_dowto4(const PacketMul2Xh& a) { - return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), - __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size); +EIGEN_STRONG_INLINE + 
typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXh>::type + predux_half_dowto4(const PacketMul2Xh& a) { + return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size); } F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pcos) @@ -840,10 +884,10 @@ EIGEN_STRONG_INLINE PacketMul2Xs preinterpret(const } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXh& a, const PacketXh& b, - const PacketXh& c, const PacketXh& d) { - PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0, - __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXh& a, const PacketXh& b, const PacketXh& c, + const PacketXh& d) { + PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0, + __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size)); res = __riscv_vset_v_i16m1_i16m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size)); res = __riscv_vset_v_i16m1_i16m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits::size)); res = __riscv_vset_v_i16m1_i16m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits::size)); @@ -852,8 +896,8 @@ EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXh& a template <> EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXs& a, const PacketXs& b) { - PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0, - __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size)); + PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0, + __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size)); res = __riscv_vset_v_f16m1_f16m2(res, 1, __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits::size)); return res; } @@ -867,8 +911,8 @@ EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXh& a template <> EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXh& a, const PacketXh& b) { - PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0, - __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size)); + PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0, + __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size)); res = __riscv_vset_v_i16m1_i16m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size)); return res; } diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h index 3508f9617..08b93ceaa 100644 --- a/Eigen/src/Core/arch/RVV10/TypeCasting.h +++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h @@ -89,8 +89,8 @@ EIGEN_STRONG_INLINE PacketMul2Xi preinterpret(const } template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, - const PacketXi& c, const PacketXi& d) { +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, a); res = __riscv_vset_v_i32m1_i32m4(res, 1, b); res = __riscv_vset_v_i32m1_i32m4(res, 2, c); @@ -99,10 +99,10 @@ EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, - const PacketXi& c, const PacketXi& d) { - PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, - __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { + PacketMul4Xf res = 
__riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, + __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); res = __riscv_vset_v_f32m1_f32m4(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); res = __riscv_vset_v_f32m1_f32m4(res, 2, __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size)); res = __riscv_vset_v_f32m1_f32m4(res, 3, __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); @@ -110,8 +110,8 @@ EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, - const PacketXf& c, const PacketXf& d) { +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, a); res = __riscv_vset_v_f32m1_f32m4(res, 1, b); res = __riscv_vset_v_f32m1_f32m4(res, 2, c); @@ -120,10 +120,10 @@ EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, - const PacketXf& c, const PacketXf& d) { - PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, - __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { + PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, + __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); res = __riscv_vset_v_i32m1_i32m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); res = __riscv_vset_v_i32m1_i32m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size)); res = __riscv_vset_v_i32m1_i32m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); @@ -139,8 +139,8 @@ EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXi& a template <> EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXi& a, const PacketXi& b) { - PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, - __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); + PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, + __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); res = __riscv_vset_v_f32m1_f32m2(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); return res; } @@ -154,8 +154,8 @@ EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXf& a template <> EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXf& a, const PacketXf& b) { - PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0, - __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); + PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0, + __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); res = __riscv_vset_v_i32m1_i32m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); return res; } @@ -233,8 +233,8 @@ EIGEN_STRONG_INLINE PacketMul2Xl preinterpret(const } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, - const PacketXl& c, const PacketXl& d) { +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, a); res = __riscv_vset_v_i64m1_i64m4(res, 1, b); res = __riscv_vset_v_i64m1_i64m4(res, 2, c); @@ -243,10 +243,10 @@ EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcast(const 
PacketXl& a, const PacketXl& b, - const PacketXl& c, const PacketXl& d) { - PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, - __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { + PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, + __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); res = __riscv_vset_v_f64m1_f64m4(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); res = __riscv_vset_v_f64m1_f64m4(res, 2, __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size)); res = __riscv_vset_v_f64m1_f64m4(res, 3, __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); @@ -254,8 +254,8 @@ EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a, const PacketXd& b, - const PacketXd& c, const PacketXd& d) { +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, a); res = __riscv_vset_v_f64m1_f64m4(res, 1, b); res = __riscv_vset_v_f64m1_f64m4(res, 2, c); @@ -264,10 +264,10 @@ EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, - const PacketXd& c, const PacketXd& d) { - PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, - __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { + PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, + __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); res = __riscv_vset_v_i64m1_i64m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); res = __riscv_vset_v_i64m1_i64m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size)); res = __riscv_vset_v_i64m1_i64m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); @@ -283,8 +283,8 @@ EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXl& a template <> EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXl& a, const PacketXl& b) { - PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0, - __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); + PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0, + __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); res = __riscv_vset_v_f64m1_f64m2(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); return res; } @@ -298,8 +298,8 @@ EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXd& a template <> EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXd& a, const PacketXd& b) { - PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0, - __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); + PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0, + __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); res = __riscv_vset_v_i64m1_i64m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); return res; } @@ -314,8 +314,8 @@ EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXs& a } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, - const PacketXs& c, const PacketXs& d) { +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, 
const PacketXs& c, + const PacketXs& d) { PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0, a); res = __riscv_vset_v_i16m1_i16m4(res, 1, b); res = __riscv_vset_v_i16m1_i16m4(res, 2, c); diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 463f2bc17..a691d092e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -39,12 +39,14 @@ class gemv_traits { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; #ifdef EIGEN_RISCV64_USE_RVV10 -#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ - typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, \ - typename packet_traits::type, \ - typename gemv_packet_cond< \ - packet_size, typename packet_traits::type, typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type> name##Packet##postfix +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename std::conditional_t< \ + NumTraits::IsComplex || NumTraits::IsComplex, \ + typename packet_traits::type, \ + typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type> \ + name##Packet##postfix #else #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename gemv_packet_cond< \ diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 0766073c3..8a20280c1 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -452,7 +452,7 @@ extern "C" { #error "The Eigen::Half vectorization requires Zfh and Zvfh extensions." #endif -#endif // defined(EIGEN_ARCH_RISCV) +#endif // defined(EIGEN_ARCH_RISCV) #elif (defined __s390x__ && defined __VEC__) diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index ffa66980c..9bc9b1099 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -302,8 +302,14 @@ struct apply_rotation_in_the_plane_selector { static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) { #ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, typename packet_traits::type, typename packet_traits::type> Packet; - typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, typename packet_traits::type, typename packet_traits::type> OtherPacket; + typedef + typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, typename packet_traits::type> + Packet; + typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, + typename packet_traits::type> + OtherPacket; constexpr Index PacketSize = unpacket_traits::size; #else @@ -313,7 +319,7 @@ struct apply_rotation_in_the_plane_selector::size; #endif constexpr int RequiredAlignment = - (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); + (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); /*** dynamic-size vectorized paths ***/ if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) { -- GitLab From 974d201d2b51c2fa8ef8f90bb1a2a8897f806284 Mon Sep 17 00:00:00 2001 From: "kseniya.zaytseva" Date: Tue, 29 Apr 2025 16:21:52 +0300 Subject: [PATCH 4/7] vcreate instead of vset --- Eigen/src/Core/arch/RVV10/TypeCasting.h | 115 ++++++++---------------- 1 
file changed, 35 insertions(+), 80 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h index 08b93ceaa..67bc99d0b 100644 --- a/Eigen/src/Core/arch/RVV10/TypeCasting.h +++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h @@ -91,73 +91,53 @@ EIGEN_STRONG_INLINE PacketMul2Xi preinterpret(const template <> EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, const PacketXi& d) { - PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, a); - res = __riscv_vset_v_i32m1_i32m4(res, 1, b); - res = __riscv_vset_v_i32m1_i32m4(res, 2, c); - res = __riscv_vset_v_i32m1_i32m4(res, 3, d); - return res; + return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, const PacketXi& d) { - PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, - __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); - res = __riscv_vset_v_f32m1_f32m4(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); - res = __riscv_vset_v_f32m1_f32m4(res, 2, __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size)); - res = __riscv_vset_v_f32m1_f32m4(res, 3, __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, const PacketXf& d) { - PacketMul4Xf res = __riscv_vset_v_f32m1_f32m4(__riscv_vundefined_f32m4(), 0, a); - res = __riscv_vset_v_f32m1_f32m4(res, 1, b); - res = __riscv_vset_v_f32m1_f32m4(res, 2, c); - res = __riscv_vset_v_f32m1_f32m4(res, 3, d); - return res; + return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d); } template <> EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, const PacketXf& d) { - PacketMul4Xi res = __riscv_vset_v_i32m1_i32m4(__riscv_vundefined_i32m4(), 0, - __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); - res = __riscv_vset_v_i32m1_i32m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); - res = __riscv_vset_v_i32m1_i32m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size)); - res = __riscv_vset_v_i32m1_i32m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXi& a, const PacketXi& b) { - PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0, a); - res = __riscv_vset_v_i32m1_i32m2(res, 1, b); - return res; + return __riscv_vcreate_v_i32m1_i32m2(a, b); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXi& a, const PacketXi& b) { - PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, - __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size)); - res = __riscv_vset_v_f32m1_f32m2(res, 1, __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); - return res; + return 
__riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXf& a, const PacketXf& b) { - PacketMul2Xf res = __riscv_vset_v_f32m1_f32m2(__riscv_vundefined_f32m2(), 0, a); - res = __riscv_vset_v_f32m1_f32m2(res, 1, b); - return res; + return __riscv_vcreate_v_f32m1_f32m2(a, b); } template <> EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXf& a, const PacketXf& b) { - PacketMul2Xi res = __riscv_vset_v_i32m1_i32m2(__riscv_vundefined_i32m2(), 0, - __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size)); - res = __riscv_vset_v_i32m1_i32m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); } /********************************* 64 bits ************************************/ @@ -235,92 +215,67 @@ EIGEN_STRONG_INLINE PacketMul2Xl preinterpret(const template <> EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, const PacketXl& d) { - PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, a); - res = __riscv_vset_v_i64m1_i64m4(res, 1, b); - res = __riscv_vset_v_i64m1_i64m4(res, 2, c); - res = __riscv_vset_v_i64m1_i64m4(res, 3, d); - return res; + return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, const PacketXl& d) { - PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, - __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); - res = __riscv_vset_v_f64m1_f64m4(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); - res = __riscv_vset_v_f64m1_f64m4(res, 2, __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size)); - res = __riscv_vset_v_f64m1_f64m4(res, 3, __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, const PacketXd& d) { - PacketMul4Xd res = __riscv_vset_v_f64m1_f64m4(__riscv_vundefined_f64m4(), 0, a); - res = __riscv_vset_v_f64m1_f64m4(res, 1, b); - res = __riscv_vset_v_f64m1_f64m4(res, 2, c); - res = __riscv_vset_v_f64m1_f64m4(res, 3, d); - return res; + return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d); } template <> EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, const PacketXd& d) { - PacketMul4Xl res = __riscv_vset_v_i64m1_i64m4(__riscv_vundefined_i64m4(), 0, - __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); - res = __riscv_vset_v_i64m1_i64m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); - res = __riscv_vset_v_i64m1_i64m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size)); - res = __riscv_vset_v_i64m1_i64m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(c, 
unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXl& a, const PacketXl& b) { - PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0, a); - res = __riscv_vset_v_i64m1_i64m2(res, 1, b); - return res; + return __riscv_vcreate_v_i64m1_i64m2(a, b); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXl& a, const PacketXl& b) { - PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0, - __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size)); - res = __riscv_vset_v_f64m1_f64m2(res, 1, __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXd& a, const PacketXd& b) { - PacketMul2Xd res = __riscv_vset_v_f64m1_f64m2(__riscv_vundefined_f64m2(), 0, a); - res = __riscv_vset_v_f64m1_f64m2(res, 1, b); - return res; + return __riscv_vcreate_v_f64m1_f64m2(a, b); } template <> EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXd& a, const PacketXd& b) { - PacketMul2Xl res = __riscv_vset_v_i64m1_i64m2(__riscv_vundefined_i64m2(), 0, - __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size)); - res = __riscv_vset_v_i64m1_i64m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); - return res; + return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); } /********************************* 16 bits ************************************/ template <> EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXs& a, const PacketXs& b) { - PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0, a); - res = __riscv_vset_v_i16m1_i16m2(res, 1, b); - return res; + return __riscv_vcreate_v_i16m1_i16m2(a, b); } template <> EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, const PacketXs& c, const PacketXs& d) { - PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0, a); - res = __riscv_vset_v_i16m1_i16m4(res, 1, b); - res = __riscv_vset_v_i16m1_i16m4(res, 2, c); - res = __riscv_vset_v_i16m1_i16m4(res, 3, d); - return res; + return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d); } } // namespace internal -- GitLab From 7b2fbe823b613a2ce0f8e95cba95c3b6ccd59b27 Mon Sep 17 00:00:00 2001 From: "kseniya.zaytseva" Date: Tue, 29 Apr 2025 16:23:42 +0300 Subject: [PATCH 5/7] vmerge reduction in RVV wrappers --- Eigen/src/Core/arch/RVV10/Complex.h | 67 ++++--- Eigen/src/Core/arch/RVV10/PacketMath.h | 193 +++++++++++++-------- Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 88 +++++----- 3 files changed, 194 insertions(+), 154 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h index 968a4cc40..73ef50cc5 100644 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -74,10 +74,7 @@ EIGEN_STRONG_INLINE PacketXcf pcast(const PacketMul2Xf& template <> EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& a) { - PacketMul2Xf res = __riscv_vundefined_f32m2(); - res = __riscv_vset_v_f32m1_f32m2(res, 0, a.real); - res = __riscv_vset_v_f32m1_f32m2(res, 1, a.imag); - return res; + return __riscv_vcreate_v_f32m1_f32m2(a.real, a.imag); } template <> @@ -129,10 +126,9 @@ EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const 
PacketX template <> EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { - PacketXf eq_real = pcmp_eq(a.real, b.real); - PacketXf eq_imag = pcmp_eq(a.imag, b.imag); - PacketXf eq_both = pand(eq_real, eq_imag); - return PacketXcf(eq_both, eq_both); + PacketMask32 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcf(res, res); } template <> @@ -275,6 +271,7 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { typedef typename unpacket_traits::type Scalar; typedef typename Scalar::value_type RealScalar; typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; // Computes the principal sqrt of the complex numbers in the input. // @@ -317,11 +314,12 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { RealPacket a_max = pmax(a_abs.real, a_abs.imag); RealPacket a_min = pmin(a_abs.real, a_abs.imag); - RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); - RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max)); + PacketMask a_min_zero_mask = pcmp_eq_mask(a_min, pzero(a_min)); + PacketMask a_max_zero_mask = pcmp_eq_mask(a_max, pzero(a_max)); RealPacket r = pdiv(a_min, a_max); const RealPacket cst_one = pset1(RealScalar(1)); + const RealPacket cst_true = ptrue(cst_one); RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); // Set l to a_max if a_min is zero. l = pselect(a_min_zero_mask, a_max, l); @@ -335,7 +333,7 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { // Step 3. Compute [rho0, rho1, eta0, eta1], where // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2. // set eta = 0 of input is 0 + i0. - RealPacket eta = pandnot(pmul(cst_half, pdiv(a.imag, rho)), a_max_zero_mask); + RealPacket eta = pselect(a_max_zero_mask, pzero(cst_one), pmul(cst_half, pdiv(a.imag, rho))); // Compute result for inputs with positive real part. Packet positive_real_result = Packet(rho, eta); @@ -343,12 +341,12 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { // [|eta0| |eta1|, sign(y0)*rho0, sign(y1)*rho1] const RealPacket cst_imag_sign_mask = pset1(RealScalar(-0.0)); RealPacket imag_signs = pand(a.imag, cst_imag_sign_mask); - Packet negative_real_result = Packet(pabs(eta), por(positive_real_result.real, imag_signs)); + Packet negative_real_result = Packet(pabs(eta), por(rho, imag_signs)); // Step 5. Select solution branch based on the sign of the real parts. - RealPacket negative_real_mask_half = pcmp_lt(a.real, pzero(a.real)); - Packet negative_real_mask = Packet(negative_real_mask_half, negative_real_mask_half); - Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result); + PacketMask negative_real_mask_half = pcmp_lt_mask(a.real, pzero(a.real)); + Packet result = Packet(pselect(negative_real_mask_half, negative_real_result.real, positive_real_result.real), + pselect(negative_real_mask_half, negative_real_result.imag, positive_real_result.imag)); // Step 6. 
Handle special cases for infinities: // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN @@ -356,20 +354,22 @@ EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); - RealPacket is_real_inf = pcmp_eq(a_abs.real, cst_pos_inf); + PacketMask is_real_inf = pcmp_eq_mask(a_abs.real, cst_pos_inf); // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. const Packet cst_one_zero = pset1(Scalar(RealScalar(1.0), RealScalar(0.0))); Packet real_inf_result = Packet(pmul(a_abs.real, cst_one_zero.real), pmul(a_abs.imag, cst_one_zero.imag)); - real_inf_result = pselect(negative_real_mask, pcplxflip(real_inf_result), real_inf_result); + real_inf_result = Packet(pselect(negative_real_mask_half, real_inf_result.imag, real_inf_result.real), + pselect(negative_real_mask_half, real_inf_result.real, real_inf_result.imag)); // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. - RealPacket is_imag_inf = pcmp_eq(a_abs.imag, cst_pos_inf); - Packet imag_inf_result = Packet(cst_pos_inf, a.imag); + PacketMask is_imag_inf = pcmp_eq_mask(a_abs.imag, cst_pos_inf); // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan - Packet result_is_nan = pisnan(result); - result = por(result_is_nan, result); + result = Packet(pselect(pcmp_eq_mask(result.real, result.real), result.real, cst_true), + pselect(pcmp_eq_mask(result.imag, result.imag), result.imag, cst_true)); - return pselect(Packet(is_imag_inf, is_imag_inf), imag_inf_result, - pselect(Packet(is_real_inf, is_real_inf), real_inf_result, result)); + result = Packet(pselect(is_real_inf, real_inf_result.real, result.real), + pselect(is_real_inf, real_inf_result.imag, result.imag)); + + return Packet(pselect(is_imag_inf, cst_pos_inf, result.real), pselect(is_imag_inf, a.imag, result.imag)); } template @@ -377,17 +377,18 @@ EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { typedef typename unpacket_traits::type Scalar; typedef typename Scalar::value_type RealScalar; typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; // log(sqrt(a^2 + b^2)), atan2(b, a) - RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); + RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); RealPacket ximg = patan2(x.imag, x.real); const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); RealPacket r_abs = pabs(x.real); RealPacket i_abs = pabs(x.imag); - RealPacket is_r_pos_inf = pcmp_eq(r_abs, cst_pos_inf); - RealPacket is_i_pos_inf = pcmp_eq(i_abs, cst_pos_inf); - RealPacket is_any_inf = por(is_r_pos_inf, is_i_pos_inf); + PacketMask is_r_pos_inf = pcmp_eq_mask(r_abs, cst_pos_inf); + PacketMask is_i_pos_inf = pcmp_eq_mask(i_abs, cst_pos_inf); + PacketMask is_any_inf = por(is_r_pos_inf, is_i_pos_inf); RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); return Packet(xreal, ximg); @@ -480,10 +481,7 @@ EIGEN_STRONG_INLINE PacketXcd pcast(const PacketMul2Xd& template <> EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& a) { - PacketMul2Xd res = __riscv_vundefined_f64m2(); - res = __riscv_vset_v_f64m1_f64m2(res, 0, a.real); - res = __riscv_vset_v_f64m1_f64m2(res, 1, a.imag); - return res; + return 
__riscv_vcreate_v_f64m1_f64m2(a.real, a.imag); } template <> @@ -535,10 +533,9 @@ EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketX template <> EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { - PacketXd eq_real = pcmp_eq(a.real, b.real); - PacketXd eq_imag = pcmp_eq(a.imag, b.imag); - PacketXd eq_both = pand(eq_real, eq_imag); - return PacketXcd(eq_both, eq_both); + PacketMask64 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcd(res, res); } template <> diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index e4571e7a9..4daa0084d 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -1093,6 +1093,7 @@ struct unpacket_traits { typedef PacketXf half; // Half not yet implemented typedef PacketXi integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -1109,6 +1110,7 @@ struct unpacket_traits { typedef PacketXf half; typedef PacketMul2Xi integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -1125,6 +1127,7 @@ struct unpacket_traits { typedef PacketMul2Xf half; typedef PacketMul4Xi integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask8 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -1367,26 +1370,25 @@ EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) { template <> EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) { - // Adds and subtracts signum(a) * 2^23 to force rounding. const PacketXf limit = pset1(static_cast(1 << 23)); const PacketXf abs_a = pabs(a); - PacketXf r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. - r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); + const PacketXf x = __riscv_vfadd_vv_f32m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); + PacketXf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { - const PacketXf cst_1 = pset1(1.0f); PacketXf tmp = print(a); // If greater, subtract one. 
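Note: the pfloor rewrite below drops the arithmetic mask trick (pand with a constant 1.0) in favor of a real RVV mask register plus a masked vfsub. A scalar sketch of the per-lane identity both versions implement, assuming round-to-nearest print semantics:

```cpp
#include <cmath>

// Reference model of the vectorized pfloor: round to nearest first, then
// subtract 1 in every lane where rounding overshot the input (a < rint(a)).
// NaN lanes fall through unchanged because the comparison is false for NaN.
float pfloor_scalar(float a) {
  float r = std::nearbyint(a);    // stands in for print(a)
  return (a < r) ? r - 1.0f : r;  // the masked __riscv_vfsub_vf_*_tum step
}
```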
- PacketXf mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m1_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> @@ -1467,6 +1469,28 @@ EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& return pldexp_generic(a, exponent); } +template <> +EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmor_mm_b32(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmand_mm_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXf pselect(const PacketMask32& mask, const PacketXf& a, const PacketXf& b) { + return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); +} + /********************************* PacketMul4Xf ************************************/ template <> @@ -1707,26 +1731,25 @@ EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { template <> EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) { - // Adds and subtracts signum(a) * 2^23 to force rounding. const PacketMul4Xf limit = pset1(static_cast(1 << 23)); const PacketMul4Xf abs_a = pabs(a); - PacketMul4Xf r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. - r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); + const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); + PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { - const PacketMul4Xf cst_1 = pset1(1.0f); PacketMul4Xf tmp = print(a); // If greater, subtract one. - PacketMul4Xf mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m4_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> @@ -2031,26 +2054,25 @@ EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { template <> EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { - // Adds and subtracts signum(a) * 2^23 to force rounding. const PacketMul2Xf limit = pset1(static_cast(1 << 23)); const PacketMul2Xf abs_a = pabs(a); - PacketMul2Xf r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. 
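Note: the new print implementations all share one pattern, sketched below in scalar form. Lanes with |a| >= 2^23 (infinities and NaNs included) pass through, other lanes round via an integer round-trip, and vfsgnj restores the sign so -0.0 survives; the masked vfadd quiets NaNs by computing a + a only in NaN lanes. This is a reference model, not the shipped code:

```cpp
#include <cmath>
#include <cstdint>

// Scalar model of the masked print() above (float case).
float print_scalar(float a) {
  const float limit = static_cast<float>(1 << 23);
  float x = std::isnan(a) ? a + a : a;    // masked vfadd: quiet NaN lanes only
  if (!(std::fabs(a) < limit)) return x;  // large, infinite, or NaN lanes
  float r = static_cast<float>(static_cast<int32_t>(std::nearbyint(a)));
  return std::copysign(r, a);             // vfsgnj: keep the sign of a
}
```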
- r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { - const PacketMul2Xf cst_1 = pset1(1.0f); PacketMul2Xf tmp = print(a); // If greater, subtract one. - PacketMul2Xf mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> @@ -3166,6 +3188,7 @@ struct unpacket_traits { typedef PacketXd half; // Half not yet implemented typedef PacketXl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask64 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -3182,6 +3205,7 @@ struct unpacket_traits { typedef PacketXd half; typedef PacketMul2Xl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -3198,6 +3222,7 @@ struct unpacket_traits { typedef PacketMul2Xd half; typedef PacketMul4Xl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; enum { size = rvv_packet_size_selector::size, @@ -3442,26 +3467,25 @@ EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) { template <> EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) { - // Adds and subtracts signum(a) * 2^52 to force rounding. const PacketXd limit = pset1(static_cast(1ull << 52)); const PacketXd abs_a = pabs(a); - PacketXd r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. - r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); + const PacketXd x = __riscv_vfadd_vv_f64m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); + PacketXd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) { - const PacketXd cst_1 = pset1(1.0); PacketXd tmp = print(a); // If greater, subtract one. 
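Note: the packet_mask typedefs added in this patch follow the RVV rule that the mask type paired with a vector type is vbool<N>_t with N = SEW / LMUL, which is why the same scalar type gets a different mask type at each LMUL. A compile-time sketch of the mapping assumed here:

```cpp
// vbool<N>_t with N = SEW / LMUL:
//   f32m1 -> b32   f32m2 -> b16   f32m4 -> b8
//   f64m1 -> b64   f64m2 -> b32   f64m4 -> b16
constexpr int rvv_mask_ratio(int sew, int lmul) { return sew / lmul; }
static_assert(rvv_mask_ratio(32, 4) == 8, "f32m4 pairs with vbool8_t");
static_assert(rvv_mask_ratio(64, 2) == 32, "f64m2 pairs with vbool32_t");
```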
- PacketXd mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m1_tum(mask, tmp, tmp, 1.0, unpacket_traits::size); } template <> @@ -3539,6 +3563,33 @@ EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& return pldexp_generic(a, exponent); } +template <> +EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmor_mm_b64(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmandn_mm_b64(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmand_mm_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketXd& a, const PacketXd& b) { + return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketXd& a, const PacketXd& b) { + return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXd pselect(const PacketMask64& mask, const PacketXd& a, const PacketXd& b) { + return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); +} + /********************************* PacketMul4Xd ************************************/ template <> @@ -3780,26 +3831,25 @@ EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { template <> EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { - // Adds and subtracts signum(a) * 2^52 to force rounding. const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); const PacketMul4Xd abs_a = pabs(a); - PacketMul4Xd r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. 
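Note: the mask-register helpers above map one-to-one onto RVV mask instructions; pandnot in particular must compute a & ~b, which is vmandn.mm rather than vmor.mm. A per-lane reference model:

```cpp
// Per-lane semantics of the vbool64_t mask operations above.
bool por_lane(bool a, bool b) { return a || b; }       // __riscv_vmor_mm_b64
bool pand_lane(bool a, bool b) { return a && b; }      // __riscv_vmand_mm_b64
bool pandnot_lane(bool a, bool b) { return a && !b; }  // __riscv_vmandn_mm_b64
```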
- EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. - r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); + const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); + PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { - const PacketMul2Xd cst_1 = pset1(1.0); PacketMul2Xd tmp = print(a); // If greater, subtract one. - PacketMul2Xd mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m2_tum(mask, tmp, tmp, 1.0, unpacket_traits::size); } template <> diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index 8b8c4ab38..085952fcd 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -140,7 +140,7 @@ EIGEN_STRONG_INLINE PacketXh ptrue(const PacketXh& /*a*/) { template <> EIGEN_STRONG_INLINE PacketXh pzero(const PacketXh& /*a*/) { - return __riscv_vfmv_v_f_f16m1(0.0f, unpacket_traits::size); + return __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size); } template <> @@ -278,7 +278,8 @@ EIGEN_STRONG_INLINE PacketXh pcmp_eq(const PacketXh& a, const PacketXh template <> EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const PacketXh& b) { PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f16m1(ptrue(a), 0.0f, mask, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m1(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); } // Logical Operations are not supported for half, so reinterpret casts @@ -370,26 +371,25 @@ EIGEN_STRONG_INLINE PacketXh psqrt(const PacketXh& a) { template <> EIGEN_STRONG_INLINE PacketXh print(const PacketXh& a) { - // Adds and subtracts signum(a) * 2^10 to force rounding. const PacketXh limit = pset1(static_cast(1 << 10)); const PacketXh abs_a = pabs(a); - PacketXh r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. 
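Note: several intrinsics in these hunks carry the _tum (tail undisturbed, mask undisturbed) policy suffix, meaning inactive lanes keep the value of the destination operand instead of being overwritten. A loop model of the vfadd used for NaN quieting, assuming that policy:

```cpp
// Model of vd = __riscv_vfadd_vv_f64m2_tum(mask, vd, a, a, vl): active lanes
// compute a + a, inactive lanes keep vd. In print() the mask is (a != a), so
// only NaN lanes are rewritten, which quiets signaling NaNs.
void vfadd_tum_model(const bool* mask, double* vd, const double* a, int vl) {
  for (int i = 0; i < vl; ++i) {
    if (mask[i]) vd[i] = a[i] + a[i];
  }
}
```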
- r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a); - return r; + + PacketMask16 mask = __riscv_vmfne_vv_f16m1_b16(a, a, unpacket_traits::size); + const PacketXh x = __riscv_vfadd_vv_f16m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXh new_x = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f16m1_b16(abs_a, limit, unpacket_traits::size); + PacketXh signed_x = __riscv_vfsgnj_vv_f16m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(x, signed_x, mask, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE PacketXh pfloor(const PacketXh& a) { - const PacketXh cst_1 = pset1(static_cast(1.0)); PacketXh tmp = print(a); // If greater, subtract one. - PacketXh mask = pcmp_lt(a, tmp); - mask = pand(mask, cst_1); - return psub(tmp, mask); + PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f16m1_tum(mask, tmp, tmp, static_cast(1.0), unpacket_traits::size); } template <> @@ -402,7 +402,8 @@ EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) { template <> EIGEN_STRONG_INLINE Eigen::half predux(const PacketXh& a) { return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1( - a, __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits::size), unpacket_traits::size))); + a, __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size), + unpacket_traits::size))); } template <> @@ -481,7 +482,7 @@ EIGEN_STRONG_INLINE PacketMul2Xh ptrue(const PacketMul2Xh& /*a*/) template <> EIGEN_STRONG_INLINE PacketMul2Xh pzero(const PacketMul2Xh& /*a*/) { - return __riscv_vfmv_v_f_f16m2(0.0f, unpacket_traits::size); + return __riscv_vfmv_v_f_f16m2(static_cast(0.0), unpacket_traits::size); } template <> @@ -622,7 +623,8 @@ EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, co template <> EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh& a, const PacketMul2Xh& b) { PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f16m2(ptrue(a), 0.0f, mask, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m2(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); } // Logical Operations are not supported for half, so reinterpret casts @@ -719,26 +721,25 @@ EIGEN_STRONG_INLINE PacketMul2Xh psqrt(const PacketMul2Xh& a) { template <> EIGEN_STRONG_INLINE PacketMul2Xh print(const PacketMul2Xh& a) { - // Adds and subtracts signum(a) * 2^10 to force rounding. const PacketMul2Xh limit = pset1(static_cast(1 << 10)); const PacketMul2Xh abs_a = pabs(a); - PacketMul2Xh r = padd(abs_a, limit); - // Don't compile-away addition and subtraction. - EIGEN_OPTIMIZATION_BARRIER(r); - r = psub(r, limit); - // If greater than limit, simply return a. Otherwise, account for sign. 
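Note: for half the pass-through threshold drops from 2^23 to 2^10 because binary16 carries an 11-bit significand, so every representable value at or above 2^10 is already an integer. The general rule, assuming IEEE-754 formats:

```cpp
// Threshold above which all finite values of a format are integers:
// 2^(p - 1) for a p-bit significand (half p=11, float p=24, double p=53).
constexpr double integer_threshold(int p) { return static_cast<double>(1ull << (p - 1)); }
static_assert(integer_threshold(11) == 1024.0, "half:   2^10");
static_assert(integer_threshold(24) == 8388608.0, "float:  2^23");
static_assert(integer_threshold(53) == 4503599627370496.0, "double: 2^52");
```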
@@ -719,26 +721,25 @@ EIGEN_STRONG_INLINE PacketMul2Xh psqrt<PacketMul2Xh>(const PacketMul2Xh& a) {
 
 template <>
 EIGEN_STRONG_INLINE PacketMul2Xh print<PacketMul2Xh>(const PacketMul2Xh& a) {
-  // Adds and subtracts signum(a) * 2^10 to force rounding.
   const PacketMul2Xh limit = pset1<PacketMul2Xh>(static_cast<Eigen::half>(1 << 10));
   const PacketMul2Xh abs_a = pabs(a);
-  PacketMul2Xh r = padd(abs_a, limit);
-  // Don't compile-away addition and subtraction.
-  EIGEN_OPTIMIZATION_BARRIER(r);
-  r = psub(r, limit);
-  // If greater than limit, simply return a.  Otherwise, account for sign.
-  r = pselect(pcmp_lt(abs_a, limit), pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
-  return r;
+
+  PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits<PacketMul2Xh>::size);
+  const PacketMul2Xh x = __riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits<PacketMul2Xh>::size);
+  const PacketMul2Xh new_x = __riscv_vfcvt_f_x_v_f16m2(
+      __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits<PacketMul2Xh>::size), unpacket_traits<PacketMul2Xh>::size);
+
+  mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits<PacketMul2Xh>::size);
+  PacketMul2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits<PacketMul2Xh>::size);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMul2Xh pfloor<PacketMul2Xh>(const PacketMul2Xh& a) {
-  const PacketMul2Xh cst_1 = pset1<PacketMul2Xh>(static_cast<Eigen::half>(1.0));
   PacketMul2Xh tmp = print<PacketMul2Xh>(a);
   // If greater, subtract one.
-  PacketMul2Xh mask = pcmp_lt(a, tmp);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
+  PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<PacketMul2Xh>::size);
 }
 
 template <>
@@ -752,7 +753,8 @@ EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) {
 template <>
 EIGEN_STRONG_INLINE Eigen::half predux<PacketMul2Xh>(const PacketMul2Xh& a) {
   return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1(
-      a, __riscv_vfmv_v_f_f16m1(0.0, unpacket_traits<PacketMul2Xh>::size / 4), unpacket_traits<PacketMul2Xh>::size)));
+      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<PacketMul2Xh>::size / 4),
+      unpacket_traits<PacketMul2Xh>::size)));
 }
 
 template <>
@@ -886,35 +888,27 @@ EIGEN_STRONG_INLINE PacketMul2Xs preinterpret<PacketMul2Xs, PacketMul2Xh>(const
 template <>
 EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketXh, PacketMul4Xs>(const PacketXh& a, const PacketXh& b,
                                                                const PacketXh& c, const PacketXh& d) {
-  PacketMul4Xs res = __riscv_vset_v_i16m1_i16m4(__riscv_vundefined_i16m4(), 0,
-                                                __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size));
-  res = __riscv_vset_v_i16m1_i16m4(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
-  res = __riscv_vset_v_i16m1_i16m4(res, 2, __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits<PacketXh>::size));
-  res = __riscv_vset_v_i16m1_i16m4(res, 3, __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits<PacketXh>::size));
-  return res;
+  return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits<PacketXh>::size));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXs, PacketMul2Xh>(const PacketXs& a, const PacketXs& b) {
-  PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0,
-                                                __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXs>::size));
-  res = __riscv_vset_v_f16m1_f16m2(res, 1, __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits<PacketXs>::size));
-  return res;
+  return __riscv_vcreate_v_f16m1_f16m2(__riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXs>::size),
+                                       __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits<PacketXs>::size));
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXh, PacketMul2Xh>(const PacketXh& a, const PacketXh& b) {
-  PacketMul2Xh res = __riscv_vset_v_f16m1_f16m2(__riscv_vundefined_f16m2(), 0, a);
-  res = __riscv_vset_v_f16m1_f16m2(res, 1, b);
-  return res;
+  return __riscv_vcreate_v_f16m1_f16m2(a, b);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketXh, PacketMul2Xs>(const PacketXh& a, const PacketXh& b) {
-  PacketMul2Xs res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0,
-                                                __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size));
-  res = __riscv_vset_v_i16m1_i16m2(res, 1, __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
-  return res;
+  return __riscv_vcreate_v_i16m1_i16m2(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
 }
 
 }  // namespace internal
--
GitLab
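
The pcast hunks above swap the vundefined-plus-repeated-vset idiom for a single vcreate call when assembling an LMUL=2 or LMUL=4 register group from m1 parts. Outside Eigen, the same simplification looks like this (a sketch; the helper names are hypothetical, and __riscv_vcreate_* requires a toolchain implementing a recent revision of the RVV C intrinsics):

  #include <riscv_vector.h>

  // Old idiom: start from an undefined m2 group and insert each m1 part
  // by index.
  vint16m2_t join_old(vint16m1_t lo, vint16m1_t hi) {
    vint16m2_t res = __riscv_vset_v_i16m1_i16m2(__riscv_vundefined_i16m2(), 0, lo);
    return __riscv_vset_v_i16m1_i16m2(res, 1, hi);
  }

  // New idiom: build the whole group in one expression, with no
  // placeholder value for the compiler to track.
  vint16m2_t join_new(vint16m1_t lo, vint16m1_t hi) {
    return __riscv_vcreate_v_i16m1_i16m2(lo, hi);
  }
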
From 29cf8ef898342823b6609ffc43aa949ec14e8edc Mon Sep 17 00:00:00 2001
From: "kseniya.zaytseva"
Date: Tue, 29 Apr 2025 19:23:39 +0300
Subject: [PATCH 6/7] config fixes

---
 Eigen/src/Core/util/ConfigureVectorization.h | 12 ++++--------
 Eigen/src/Core/util/Macros.h                 |  2 +-
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 8a20280c1..3821f41e7 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -408,8 +408,8 @@ extern "C" {
 #define EIGEN_VECTORIZE_SVE
 #include <arm_sve.h>
 
-// Since we depend on knowing SVE vector lengths at compile-time, we need
-// to ensure a fixed lengths is set
+// Since we depend on knowing SVE vector length at compile-time, we need
+// to ensure a fixed length is set
 #if defined __ARM_FEATURE_SVE_BITS
 #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
 #else
@@ -430,8 +430,8 @@ extern "C" {
 #define EIGEN_VECTORIZE_RVV10
 #include <riscv_vector.h>
 
-// Since we depend on knowing RVV vector lengths at compile-time, we need
-// to ensure a fixed lengths is set
+// Since we depend on knowing RVV vector length at compile-time, we need
+// to ensure a fixed length is set
 #if defined(__riscv_v_fixed_vlen)
 #define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
 #if __riscv_v_fixed_vlen >= 256
@@ -439,11 +439,7 @@ extern "C" {
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #endif
 #else
-#ifdef __GNUC__
 #error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
-#else
-#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=N is not set."
-#endif
 #endif
 
 #if defined(__riscv_zvfh) && defined(__riscv_zfh)
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index ad91668cb..5f29a9c72 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -983,7 +983,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
 
 #if !defined(EIGEN_ASM_COMMENT)
-#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
+#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_RISCV)
 #define EIGEN_ASM_COMMENT(X) __asm__("#" X)
 #else
 #define EIGEN_ASM_COMMENT(X)
--
GitLab

From af7559d1c3d1f7153ef649a596abf86ea592dfaa Mon Sep 17 00:00:00 2001
From: "kseniya.zaytseva"
Date: Tue, 29 Apr 2025 22:54:33 +0300
Subject: [PATCH 7/7] use EIGEN_HAS_BUILTIN_FLOAT16

---
 Eigen/src/Core/util/ConfigureVectorization.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 3821f41e7..5be3e8028 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -419,7 +419,7 @@ extern "C" {
 #elif defined(EIGEN_ARCH_RISCV)
 
 #if defined(__riscv_zfh)
-#define EIGEN_HAS_RISCV64_FP16_SCALAR_ARITHMETIC
+#define EIGEN_HAS_BUILTIN_FLOAT16
 #endif
 
 // We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV and
--
GitLab
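
As the config patches require, Eigen's RVV10 path only compiles when the toolchain pins the vector length so that __riscv_v_fixed_vlen is defined. A hypothetical probe (not part of the series) showing what the #error check above expects; build along the lines of clang++ -march=rv64gcv_zvl256b -mrvv-vector-bits=zvl probe.cpp:

  #include <cstdio>

  int main() {
  #if defined(__riscv_v_fixed_vlen)
    // With -mrvv-vector-bits=zvl this expands to the Zvl*b bound from
    // -march, e.g. 256; Eigen adopts it as EIGEN_RISCV64_RVV_VL.
    std::printf("fixed RVV VLEN: %d bits\n", __riscv_v_fixed_vlen);
  #else
    std::printf("no fixed VLEN: Eigen's RVV10 path would hit the #error above\n");
  #endif
  #if defined(__riscv_zvfh) && defined(__riscv_zfh)
    std::printf("zvfh+zfh present: FP16 packets enabled\n");
  #endif
    return 0;
  }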