From dbdb94da81644e5089ce6eb819c626b23d5e8f29 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 10 Jul 2025 12:18:13 -0400 Subject: [PATCH 01/21] Initial RVV_Chip --- Eigen/Core | 12 + Eigen/src/Core/AssignEvaluator.h | 5 + Eigen/src/Core/CoreEvaluators.h | 4 + Eigen/src/Core/GenericPacketMath.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 5 + Eigen/src/Core/Redux.h | 4 + Eigen/src/Core/arch/RVV10/Complex.h | 713 +++ .../Core/arch/RVV10/GeneralBlockPanelKernel.h | 491 ++ Eigen/src/Core/arch/RVV10/MathFunctions.h | 30 + Eigen/src/Core/arch/RVV10/PacketMath.h | 5180 +++++++++++++++++ Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 917 +++ Eigen/src/Core/arch/RVV10/TypeCasting.h | 284 + Eigen/src/Core/functors/UnaryFunctors.h | 4 + Eigen/src/Core/products/GeneralMatrixVector.h | 11 + Eigen/src/Core/util/ConfigureVectorization.h | 42 +- Eigen/src/Core/util/Constants.h | 3 + Eigen/src/Core/util/Macros.h | 9 +- Eigen/src/Core/util/XprHelper.h | 9 +- Eigen/src/Eigenvalues/Tridiagonalization.h | 2 +- Eigen/src/Jacobi/Jacobi.h | 17 +- test/packetmath.cpp | 13 + test/vectorization_logic.cpp | 4 + 22 files changed, 7750 insertions(+), 11 deletions(-) create mode 100644 Eigen/src/Core/arch/RVV10/Complex.h create mode 100644 Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h create mode 100644 Eigen/src/Core/arch/RVV10/MathFunctions.h create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath.h create mode 100644 Eigen/src/Core/arch/RVV10/PacketMathFP16.h create mode 100644 Eigen/src/Core/arch/RVV10/TypeCasting.h diff --git a/Eigen/Core b/Eigen/Core index 6ae069a92..3a238407f 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -249,6 +249,14 @@ using std::ptrdiff_t; #include "src/Core/arch/SVE/PacketMath.h" #include "src/Core/arch/SVE/TypeCasting.h" #include "src/Core/arch/SVE/MathFunctions.h" +#elif defined EIGEN_VECTORIZE_RVV10 +#include "src/Core/arch/RVV10/PacketMath.h" +#include "src/Core/arch/RVV10/TypeCasting.h" +#include "src/Core/arch/RVV10/MathFunctions.h" +#include "src/Core/arch/RVV10/Complex.h" +#if defined EIGEN_VECTORIZE_RVV10FP16 +#include "src/Core/arch/RVV10/PacketMathFP16.h" +#endif #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" @@ -396,6 +404,10 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX512/GemmKernel.h" #endif +#if defined(EIGEN_VECTORIZE_RVV10) +#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h" +#endif + #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" #include "src/Core/PartialReduxEvaluator.h" diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 36f0a9d74..093ceb435 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -64,8 +64,13 @@ struct copy_using_evaluator_traits { static constexpr int OuterStride = outer_stride_at_compile_time::ret; // TODO distinguish between linear traversal and inner-traversals +#ifdef EIGEN_RISCV64_USE_RVV10 + using LinearPacketType = typename find_best_packet::type; + using InnerPacketType = typename find_best_packet::type; +#else using LinearPacketType = typename find_best_packet::type; using InnerPacketType = typename find_best_packet::type; +#endif static constexpr int LinearPacketSize = unpacket_traits::size; static constexpr int InnerPacketSize = unpacket_traits::size; diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index e3af2d202..9fa8e4286 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1367,7 
+1367,11 @@ struct evaluator> typedef Block XprType; typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename packet_traits::type PacketScalar; +#else typedef typename packet_traits::type PacketScalar; +#endif enum { CoeffReadCost = evaluator::CoeffReadCost, diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index d45cb4bf4..e42baf75d 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -105,7 +105,7 @@ struct default_packet_traits { }; }; -template +template struct packet_traits : default_packet_traits { typedef T type; typedef T half; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index ce8d954bf..db820ba6d 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -533,8 +533,13 @@ struct product_evaluator, ProductTag, DenseShape, MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime }; +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename find_best_packet::type LhsVecPacketType; + typedef typename find_best_packet::type RhsVecPacketType; +#else typedef typename find_best_packet::type LhsVecPacketType; typedef typename find_best_packet::type RhsVecPacketType; +#endif enum { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 4e9ab0e4f..716a7c00e 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -29,7 +29,11 @@ namespace internal { template struct redux_traits { public: +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename find_best_packet::type PacketType; +#else typedef typename find_best_packet::type PacketType; +#endif enum { PacketSize = unpacket_traits::size, InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime : Evaluator::MaxRowsAtCompileTime, diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h new file mode 100644 index 000000000..73ef50cc5 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -0,0 +1,713 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
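+
+// The complex packet types below keep real and imaginary parts in two
+// separate vector registers and convert to/from interleaved std::complex
+// storage with the segmented load/store intrinsics (vlseg2/vsseg2).
+// Illustrative layout (an assumption, for a register holding 4 floats):
+//
+//   memory:          r0 i0 r1 i1 r2 i2 r3 i3
+//   PacketXcf.real = r0 r1 r2 r3   (segment field 0)
+//   PacketXcf.imag = i0 i1 i2 i3   (segment field 1)
+//
+// Arithmetic then follows the usual decomposition, e.g. pmul computes
+// (a.real*b.real - a.imag*b.imag) + i*(a.real*b.imag + a.imag*b.real).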
+ +#ifndef EIGEN_COMPLEX_RVV10_H +#define EIGEN_COMPLEX_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +/********************************* float32 ************************************/ + +struct PacketXcf { + EIGEN_STRONG_INLINE PacketXcf() {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) + : real(__riscv_vget_v_f32m2_f32m1(a, 0)), imag(__riscv_vget_v_f32m2_f32m1(a, 1)) {} + PacketXf real; + PacketXf imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + typedef PacketXcf type; + typedef PacketXcf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcf half; + typedef PacketMul2Xf as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcf pcast(const PacketMul2Xf& a) { + return PacketXcf(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& a) { + return __riscv_vcreate_v_f32m1_f32m2(a.real, a.imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { + PacketXf real = pset1(from.real()); + PacketXf imag = pset1(from.imag()); + return PacketXcf(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf padd(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psub(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pnegate(const PacketXcf& a) { + return PacketXcf(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf( + a.real, __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), + 0x80000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& a, const PacketXcf& b) { + PacketXf v1 = pmul(a.real, b.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmul(a.real, b.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const PacketXcf& b, const PacketXcf& c) { + PacketXf v1 = pmadd(a.real, b.real, c.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmadd(a.real, b.imag, c.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { + PacketMask32 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcf(res, res); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pand(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pand(a.real, b.real), 
pand(a.imag, b.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf por(const PacketXcf& a, const PacketXcf& b) {
+  return PacketXcf(por(a.real, b.real), por(a.imag, b.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXcf& b) {
+  return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) {
+  return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) {
+  vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size);
+  EIGEN_DEBUG_ALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) {
+  vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size);
+  EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0),
+                                              __riscv_vget_v_f32m1x2_f32m1(res, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) {
+  PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size);
+  real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2,
+                                   unpacket_traits::size);
+  PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size);
+  // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ...
+  return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size),
+                   __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) {
+  PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size);
+  real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1,
+                                   unpacket_traits::size);
+  PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size);
+  // real_idx = 0 0 0 0 2*sizeof(float) 2*sizeof(float) 2*sizeof(float) 2*sizeof(float) ...
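+  // Worked example of the quad indexing, assuming 8 f32 lanes per m1
+  // register (VLEN = 256): vid = {0,1,2,3,4,5,6,7}; masking with 0xfffffffc
+  // gives {0,0,0,0,4,4,4,4}; << 1 yields byte offsets {0,0,0,0,8,8,8,8}.
+  // One std::complex<float> occupies 8 bytes, so lanes 0-3 replicate from[0]
+  // and lanes 4-7 replicate from[1], as ploadquad requires.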
+ return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, + Index stride) { + vfloat32m1x2_t res = + __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); + return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcf>(std::complex* to, const PacketXcf& from, + Index stride) { + vfloat32m1x2_t from_rvv_type = __riscv_vundefined_f32m1x2(); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcf& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + return PacketXcf(preverse(a.real), preverse(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(a.imag, a.real); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { + return std::complex(predux(a.real), predux(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pdiv(const PacketXcf& a, const PacketXcf& b) { + PacketXcf b_conj = pconj(b); + PacketXcf dividend = pmul(a, b_conj); + PacketXf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer_real[unpacket_traits::size * N]; + float buffer_imag[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse32(&buffer_imag[i], N * sizeof(float), kernel.packet[i].imag, unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i].real = + __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; + + // Computes the 
principal sqrt of the complex numbers in the input.
+  //
+  // For example, for packets containing 2 complex numbers stored in
+  // [real0, real1, imag0, imag1] format
+  //    a = [a0, a1] = [x0, x1, y0, y1],
+  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+  //    b = [b0, b1] = [u0, u1, v0, v1],
+  // such that b0^2 = a0, b1^2 = a1.
+  //
+  // To derive the formula for the complex square roots, let's consider the equation for
+  // a single complex square root of the number x + i*y. We want to find real numbers
+  // u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*y.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = 0.5 * (y / u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
+  //
+  // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+  //    l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2).
+
+  // In the following, without loss of generality, we have annotated the code, assuming
+  // that the input is a packet of 2 complex numbers.
+  //
+  // Step 1. Compute l = [l0, l1], where
+  //    l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2)
+  // To avoid over- and underflow, we use the stable formula for each hypotenuse
+  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+  Packet a_abs = Packet(pabs(a.real), pabs(a.imag));
+  RealPacket a_max = pmax(a_abs.real, a_abs.imag);
+  RealPacket a_min = pmin(a_abs.real, a_abs.imag);
+
+  PacketMask a_min_zero_mask = pcmp_eq_mask(a_min, pzero(a_min));
+  PacketMask a_max_zero_mask = pcmp_eq_mask(a_max, pzero(a_max));
+  RealPacket r = pdiv(a_min, a_max);
+
+  const RealPacket cst_one = pset1(RealScalar(1));
+  const RealPacket cst_true = ptrue(cst_one);
+  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));
+  // Set l to a_max if a_min is zero.
+  l = pselect(a_min_zero_mask, a_max, l);
+
+  // Step 2. Compute [rho0, rho1], where
+  //    rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|))
+  // We don't care about the imaginary parts computed here. They will be overwritten later.
+  const RealPacket cst_half = pset1(RealScalar(0.5));
+  RealPacket rho = psqrt(pmul(cst_half, padd(a_abs.real, l)));
+
+  // Step 3. Compute [rho0, rho1, eta0, eta1], where
+  //    eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2.
+  // Set eta = 0 if the input is 0 + i0.
+  RealPacket eta = pselect(a_max_zero_mask, pzero(cst_one), pmul(cst_half, pdiv(a.imag, rho)));
+  // Compute result for inputs with positive real part.
+  Packet positive_real_result = Packet(rho, eta);
+
+  // Step 4. Compute solution for inputs with negative real part:
+  //    [|eta0|, |eta1|, sign(y0)*rho0, sign(y1)*rho1]
+  const RealPacket cst_imag_sign_mask = pset1(RealScalar(-0.0));
+  RealPacket imag_signs = pand(a.imag, cst_imag_sign_mask);
+  Packet negative_real_result = Packet(pabs(eta), por(rho, imag_signs));
+
+  // Step 5. Select solution branch based on the sign of the real parts.
+  PacketMask negative_real_mask_half = pcmp_lt_mask(a.real, pzero(a.real));
+  Packet result = Packet(pselect(negative_real_mask_half, negative_real_result.real, positive_real_result.real),
+                         pselect(negative_real_mask_half, negative_real_result.imag, positive_real_result.imag));
+
+  // Step 6.
Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + PacketMask is_real_inf = pcmp_eq_mask(a_abs.real, cst_pos_inf); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + const Packet cst_one_zero = pset1(Scalar(RealScalar(1.0), RealScalar(0.0))); + Packet real_inf_result = Packet(pmul(a_abs.real, cst_one_zero.real), pmul(a_abs.imag, cst_one_zero.imag)); + real_inf_result = Packet(pselect(negative_real_mask_half, real_inf_result.imag, real_inf_result.real), + pselect(negative_real_mask_half, real_inf_result.real, real_inf_result.imag)); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. + PacketMask is_imag_inf = pcmp_eq_mask(a_abs.imag, cst_pos_inf); + // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan + result = Packet(pselect(pcmp_eq_mask(result.real, result.real), result.real, cst_true), + pselect(pcmp_eq_mask(result.imag, result.imag), result.imag, cst_true)); + + result = Packet(pselect(is_real_inf, real_inf_result.real, result.real), + pselect(is_real_inf, real_inf_result.imag, result.imag)); + + return Packet(pselect(is_imag_inf, cst_pos_inf, result.real), pselect(is_imag_inf, a.imag, result.imag)); +} + +template +EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; + + // log(sqrt(a^2 + b^2)), atan2(b, a) + RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); + RealPacket ximg = patan2(x.imag, x.real); + + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + RealPacket r_abs = pabs(x.real); + RealPacket i_abs = pabs(x.imag); + PacketMask is_r_pos_inf = pcmp_eq_mask(r_abs, cst_pos_inf); + PacketMask is_i_pos_inf = pcmp_eq_mask(i_abs, cst_pos_inf); + PacketMask is_any_inf = por(is_r_pos_inf, is_i_pos_inf); + RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); + + return Packet(xreal, ximg); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) { + return psqrt_complex_rvv(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) { + return plog_complex_rvv(a); +} + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { + return PacketXcf(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { + return PacketXcf(Eigen::internal::pmul(pcast(x), y)); + } +}; + +/********************************* double ************************************/ + +struct PacketXcd { + EIGEN_STRONG_INLINE PacketXcd() {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, 
const PacketXd& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) + : real(__riscv_vget_v_f64m2_f64m1(a, 0)), imag(__riscv_vget_v_f64m2_f64m1(a, 1)) {} + PacketXd real; + PacketXd imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + typedef PacketXcd type; + typedef PacketXcd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcd half; + typedef PacketMul2Xd as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcd pcast(const PacketMul2Xd& a) { + return PacketXcd(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& a) { + return __riscv_vcreate_v_f64m1_f64m2(a.real, a.imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { + PacketXd real = pset1(from.real()); + PacketXd imag = pset1(from.imag()); + return PacketXcd(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd padd(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd psub(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { + return PacketXcd(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd( + a.real, __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vx_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& a, const PacketXcd& b) { + PacketXd v1 = pmul(a.real, b.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmul(a.real, b.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketXcd& b, const PacketXcd& c) { + PacketXd v1 = pmadd(a.real, b.real, c.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmadd(a.real, b.imag, c.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { + PacketMask64 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcd(res, res); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pand(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd por(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); +} + +template <> 
+EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) {
+  return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* from) {
+  vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size);
+  EIGEN_DEBUG_ALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) {
+  vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size);
+  EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0),
+                                              __riscv_vget_v_f64m1x2_f64m1(res, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) {
+  PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  real_idx =
+      __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3,
+                            unpacket_traits::size);
+  PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size);
+  // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ...
+  return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size),
+                   __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) {
+  PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  real_idx =
+      __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2,
+                            unpacket_traits::size);
+  PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size);
+  // real_idx = 0 0 0 0 2*sizeof(double) 2*sizeof(double) 2*sizeof(double) 2*sizeof(double) ...
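+  // As in the float path: indices are rounded down to a multiple of 4 and
+  // scaled to byte offsets (<< 2); one std::complex<double> is 16 bytes, so
+  // each group of four lanes replicates one complex element.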
+ return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, + Index stride) { + vfloat64m1x2_t res = + __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); + return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::complex* to, const PacketXcd& from, + Index stride) { + vfloat64m1x2_t from_rvv_type = __riscv_vundefined_f64m1x2(); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcd& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { + return PacketXcd(preverse(a.real), preverse(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { + return PacketXcd(a.imag, a.real); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { + return std::complex(predux(a.real), predux(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pdiv(const PacketXcd& a, const PacketXcd& b) { + PacketXcd b_conj = pconj(b); + PacketXcd dividend = pmul(a, b_conj); + PacketXd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer_real[unpacket_traits::size * N]; + double buffer_imag[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i].real = + __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXcd psqrt(const PacketXcd& a) { + return psqrt_complex_rvv(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd plog(const PacketXcd& a) { + return plog_complex_rvv(a); +} + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const 
PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { + return PacketXcd(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { + return PacketXcd(Eigen::internal::pmul(pcast(x), y)); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h new file mode 100644 index 000000000..212df434f --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h @@ -0,0 +1,491 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* real ************************************/ + +template <> +struct gebp_traits + : gebp_traits { + typedef float RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +template <> +struct gebp_traits + : gebp_traits { + typedef double RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, 
const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#if defined(EIGEN_VECTORIZE_RVV10FP16) + +template <> +struct gebp_traits + : gebp_traits { + typedef half RhsPacket; + typedef PacketXh LhsPacket; + typedef PacketXh AccPacket; + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f16m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f16m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#endif + +/********************************* complex ************************************/ + +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type name##Packet##postfix + +#define RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type ScalarPacket + +template +struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, + PacketSize_> : gebp_traits, std::complex, ConjLhs_, ConjRhs_, + Architecture::Generic, PacketSize_> { + typedef std::complex Scalar; + typedef std::complex LhsScalar; + typedef std::complex RhsScalar; + typedef std::complex ResScalar; + typedef typename packet_traits>::type RealPacket; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(PacketSize_); +#undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR + + enum { + ConjLhs = ConjLhs_, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? 
unpacket_traits::size : 1, + + nr = 4, + mr = ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef DoublePacket DoublePacketType; + + typedef std::conditional_t LhsPacket4Packing; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t, Scalar> RhsPacket; + typedef std::conditional_t ResPacket; + typedef std::conditional_t AccPacket; + + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { + p.first = pset1(RealScalar(0)); + p.second = pset1(RealScalar(0)); + } + + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1(*b); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacket& dest) const { + loadQuadToDoublePacket(b, dest); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { + dest = pload((const typename unpacket_traits::type*)(a)); + } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v4 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v4); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v4 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v4); + } + + template + EIGEN_STRONG_INLINE std::enable_if_t::value> madd(const LhsPacketType& a, + const RhsPacketType& b, + DoublePacket& c, + TmpType& /*tmp*/, + const LaneIdType&) const { + c.first = pmadd_scalar(a, b.first, c.first); + c.second = pmadd_scalar(a, b.second, c.second); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, + const LaneIdType&) const { + c = cj.pmadd(a, b, c); + } + + protected: + conj_helper cj; +}; + +#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename 
unpacket_traits::half>::half>::type ScalarPacket##postfix + +template +class gebp_traits, false, ConjRhs_, Architecture::RVV10, PacketSize_> + : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> { + public: + typedef std::complex Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_); + PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_); +#undef PACKET_DECL_COND_SCALAR_POSTFIX + + enum { + ConjLhs = false, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXf& a, std::complex b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a, b.real(), c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXd& a, std::complex b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a, b.real(), c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c 
+= a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(alpha, c, r); + } +}; + +template +class gebp_traits, RealScalar, ConjLhs_, false, Architecture::RVV10, PacketSize_> + : public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { + public: + typedef std::complex LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); +#undef PACKET_DECL_COND_POSTFIX + + enum { + ConjLhs = ConjLhs_, + ConjRhs = false, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + + nr = 4, + mr = 3 * LhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b, dest, std::conditional_t()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { + // FIXME we can do better! 
+ // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { + eigen_internal_assert(RhsPacketSize <= 8); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu(a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c += a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(c, alpha, r); + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h new file mode 100644 index 000000000..a77496540 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h @@ -0,0 +1,30 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
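+
+// No hand-written RVV math kernels are needed in this file: the
+// EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_{FLOAT,DOUBLE} macros below
+// instantiate Eigen's generic packet implementations of the transcendental
+// ops (plog, pexp, psin, pcos, ptanh, etc.) once for each LMUL variant
+// (m1, m2, m4) of the float and double packet types.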
+ +#ifndef EIGEN_MATH_FUNCTIONS_RVV10_H +#define EIGEN_MATH_FUNCTIONS_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul2Xf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul4Xf) + +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul2Xd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul4Xd) + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h new file mode 100644 index 000000000..e59d198d6 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -0,0 +1,5180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_RVV10_H +#define EIGEN_PACKET_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 + +template +struct rvv_packet_size_selector { + enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) }; +}; + +template +struct rvv_packet_alignment_selector { + enum { + alignment = + (VectorLength * VectorLMul) >= 1024 + ? Aligned128 + : ((VectorLength * VectorLMul) >= 512 ? Aligned64 + : ((VectorLength * VectorLMul) >= 256 ? 
Aligned32 : Aligned16)) + }; +}; + +typedef vbool64_t PacketMask64; +typedef vbool32_t PacketMask32; +typedef vbool16_t PacketMask16; +typedef vbool8_t PacketMask8; +typedef vbool4_t PacketMask4; + +/********************************* int32 **************************************/ +typedef eigen_packet_wrapper PacketXi; +typedef eigen_packet_wrapper PacketXu; + +typedef eigen_packet_wrapper PacketMul2Xi; +typedef eigen_packet_wrapper PacketMul2Xu; + +typedef eigen_packet_wrapper PacketMul4Xi; +typedef eigen_packet_wrapper PacketMul4Xu; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXi type; + typedef PacketXi half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xi type; + typedef PacketXi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xi type; + typedef PacketMul2Xi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketMul2Xi half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) { + PacketXi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)); + return __riscv_vadd_vx_i32m1(idx, a, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi padd(const PacketXi& a, const PacketXi& b) { + return __riscv_vadd_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) { + return __riscv_vor_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& a, const PacketXi& b) { + return __riscv_vxor_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), + 
unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) { + return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { + return __riscv_vreinterpret_i32m1( + __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) { + return __riscv_vsll_vx_i32m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) { + return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) { + return __riscv_vmv_x_s_i32m1_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) { + PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketXi& a) { + // Multiply the vector by its reverse + PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits::size); + PacketXi half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, 
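+  // After the multiply-by-reverse above, lane i holds a[i] * a[n-1-i], so only
+  // the low half of the vector still matters; each guarded slidedown/multiply
+  // step halves that live prefix (8 -> 4 -> 2 -> 1), and the
+  // EIGEN_RISCV64_RVV_VL checks keep just the steps the configured vector
+  // length actually needs, giving an O(log2 n) product reduction.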
unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi plset(const numext::int32_t& a) { + PacketMul4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)); + return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> 
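+// pnmadd(a, b, c) must evaluate c - a*b. vnmsub computes -(vd * vs1) + vs2 with
+// the accumulator in its first operand, which is exactly that contraction in a
+// single fused op; pnmsub below reuses it with c negated to get -a*b - c.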
+EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { + return __riscv_vreinterpret_i32m4( + __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) { + return __riscv_vsll_vx_i32m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) 
sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadquad(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) { + return __riscv_vmv_x_s_i32m4_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { + PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { + PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), + unpacket_traits::size); + PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* 
PacketMul2Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi plset(const numext::int32_t& a) { + PacketMul2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)); + return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pzero(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi padd(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi psub(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmul(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pdiv(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmin(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmax(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ptrue(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); +} + +template <> 
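+// The pcmp_* helpers above materialize their masks as all-ones / all-zero
+// integer lanes, so these plain bitwise ops double as lane selects in generic
+// code: select(m, a, b) == (a & m) | (b & ~m).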
+EIGEN_STRONG_INLINE PacketMul2Xi pand(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi por(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { + return __riscv_vreinterpret_i32m2( + __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) { + return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploaddup(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
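+  // i.e. vid() = {0,1,2,3,...} is masked to {0,0,2,2,...} and shifted left by
+  // one, giving the byte offsets {0,0,4,4,8,8,...}; vloxei32 gathers with
+  // byte-granular indices, so every source element lands in two adjacent
+  // lanes, which is exactly ploaddup's duplicate-each-entry contract.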
+ return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadquad(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul2Xi& a) { + return __riscv_vmv_x_s_i32m2_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) { + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { + PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& a) { + return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xi>::type + predux_half_dowto4(const PacketMul4Xi& a) { + return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), + unpacket_traits::size); +} + 
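+// Editorial sketch (illustrative only, not part of the kernel): the halving
+// reductions above compose with the vredsum-based predux. Assuming a separate
+// translation unit built with RVV 1.0 intrinsics enabled, the hypothetical
+// helper below sums the 2*vl lanes of an LMUL=2 register the same way:
+//
+//   #include <riscv_vector.h>
+//   #include <cstdint>
+//
+//   int32_t sum_i32m2(vint32m2_t v, size_t vl) {
+//     vint32m1_t lo = __riscv_vget_v_i32m2_i32m1(v, 0);   // lanes [0, vl)
+//     vint32m1_t hi = __riscv_vget_v_i32m2_i32m1(v, 1);   // lanes [vl, 2*vl)
+//     vint32m1_t s  = __riscv_vadd_vv_i32m1(lo, hi, vl);  // pairwise halving
+//     vint32m1_t z  = __riscv_vmv_v_x_i32m1(0, vl);       // zero accumulator
+//     return __riscv_vmv_x_s_i32m1_i32(                   // scalar extract
+//         __riscv_vredsum_vs_i32m1_i32m1(s, z, vl));      // horizontal sum
+//   }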
+template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXi>::type + predux_half_dowto4(const PacketMul2Xi& a) { + return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size); +} + +/********************************* float32 ************************************/ + +typedef eigen_packet_wrapper PacketXf; +typedef eigen_packet_wrapper PacketMul2Xf; +typedef eigen_packet_wrapper PacketMul4Xf; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xf type; + typedef PacketMul2Xf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; + typedef PacketMul2Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketMul2Xf half; + typedef PacketMul4Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask8 packet_mask; + + enum { + size = rvv_packet_size_selector::size, 
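+    // The lane count scales with both the configured EIGEN_RISCV64_RVV_VL and
+    // this packet's LMUL grouping; the selector below picks the strongest
+    // Aligned* bucket that vector length guarantees.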
+ alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) { + return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) { + return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + PacketXf idx = + __riscv_vfcvt_f_x_v_f32m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)), unpacket_traits::size); + return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) { + return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) { + return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) { + return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) { + return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + 
return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploaddup(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = 
__riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) { + return __riscv_vfmv_f_s_f32m1_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) { + return __riscv_vfsqrt_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) { + const PacketXf limit = pset1(static_cast(1 << 23)); + const PacketXf abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); + const PacketXf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits::size); + const PacketXf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); + PacketXf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { + PacketXf tmp = print(a); + // If greater, subtract one. 
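+  // print() rounds to nearest-even, so tmp may land above a; the comparison
+  // selects exactly those lanes (a < tmp), and the _tumu form subtracts 1.0f
+  // only where the mask is set, leaving every other lane of tmp undisturbed.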
+ PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + // Multiply the vector by its reverse + PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits::size); + PacketXf half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. 
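+  // As in the integer variant, each guarded slidedown/multiply halves the live
+  // prefix, so the full product reaches lane 0 after O(log2 n) multiplies
+  // rather than a serial n-1 element walk.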
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) { + return pldexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmor_mm_b32(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmand_mm_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXf pselect(const PacketMask32& mask, const PacketXf& a, const PacketXf& b) { + return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); +} + +/********************************* PacketMul4Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ptrue(const PacketMul4Xf& /*a*/) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pzero(const PacketMul4Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) { + return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf plset(const float& a) { + PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf padd(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psub(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) { + return __riscv_vfneg_v_f32m4(a, unpacket_traits::size); +} + +template <> 
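+// Conjugation is the identity for real-valued packets; the complex
+// specializations live in RVV10/Complex.h.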
+EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmul(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pdiv(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMul4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMul4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); + return 
__riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploaddup(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploadquad(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) { + return __riscv_vfmv_f_s_f32m4_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { + return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); +} + +template <> 
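+// Round-to-integral via the int round trip: lanes with |a| >= 2^23 are already
+// exact integers (float has a 24-bit significand) and pass through unchanged,
+// NaNs are quieted by the masked a + a, and vfsgnj restores the sign so that,
+// e.g., -0.4 rounds to -0.0 rather than +0.0.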
+EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) { + const PacketMul4Xf limit = pset1(static_cast(1 << 23)); + const PacketMul4Xf abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); + const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); + PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { + PacketMul4Xf tmp = print(a); + // If greater, subtract one. + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, PacketMul4Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { + PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), + unpacket_traits::size); + PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul2Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ptrue(const PacketMul2Xf& /*a*/) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pzero(const PacketMul2Xf& /*a*/) { + return 
__riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) { + return __riscv_vfabs_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf plset(const float& a) { + PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf padd(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psub(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) { + return __riscv_vfneg_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmul(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pdiv(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = 
__riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul2Xf& a) { + return __riscv_vfmv_f_s_f32m2_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { + const PacketMul2Xf limit = pset1(static_cast(1 << 23)); + const PacketMul2Xf abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { + PacketMul2Xf tmp = print(a); + // If greater, subtract one. 
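+  // (print() rounds to nearest, ties to even under the default rounding mode, so it can
+  // round up past a; the masked subtract below pulls exactly those lanes back down by 1,
+  // while inactive lanes keep tmp unchanged.)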
+ PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xf>::type + predux_half_dowto4(const PacketMul4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXf>::type + predux_half_dowto4(const PacketMul2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size); +} + +/********************************* int64 **************************************/ + +typedef eigen_packet_wrapper PacketXl; +typedef eigen_packet_wrapper PacketXul; + +typedef eigen_packet_wrapper PacketMul2Xl; +typedef eigen_packet_wrapper PacketMul2Xul; + +typedef eigen_packet_wrapper PacketMul4Xl; +typedef eigen_packet_wrapper PacketMul4Xul; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXl type; + typedef PacketXl half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : 
default_packet_traits { + typedef PacketMul2Xl type; + typedef PacketXl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xl type; + typedef PacketMul2Xl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul2Xl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl plset(const numext::int64_t& a) { + PacketXl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)); + return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pzero(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl padd(const PacketXl& a, const PacketXl& b) { + return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl psub(const PacketXl& a, const PacketXl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnegate(const PacketXl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pconj(const PacketXl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmul(const PacketXl& a, const PacketXl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pdiv(const PacketXl& a, const PacketXl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmadd(const PacketXl& a, const PacketXl& 
b, const PacketXl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmin(const PacketXl& a, const PacketXl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmax(const PacketXl& a, const PacketXl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_le(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_lt(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_eq(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ptrue(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pand(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl por(const PacketXl& a, const PacketXl& b) { + return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pxor(const PacketXl& a, const PacketXl& b) { + return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pandnot(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) { + return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) { + return __riscv_vreinterpret_i64m1( + __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_left(PacketXl a) { + return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploaddup(const numext::int64_t* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = 
      __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2,
+                            unpacket_traits::size);
+  // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl ploadquad(const numext::int64_t* from) {
+  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1,
+                              unpacket_traits::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketXl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketXl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXl pgather(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketXl& from,
+                                       Index stride) {
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketXl& a) {
+  return __riscv_vmv_x_s_i64m1_i64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) {
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size),
+                                         unpacket_traits::size - 1, unpacket_traits::size);
+  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) {
+  PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size);
+  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask,
+                               unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux(const PacketXl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size),
+                                                      unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) {
+  // Multiply the vector by its reverse
+  PacketXl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size);
+  PacketXl half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size);
+  }
+
+  // The reduction is done to the first element.
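+  // (EIGEN_RISCV64_RVV_VL is a compile-time VLEN bound, so the guards above fold away;
+  // each slide-down-and-multiply step halves the number of live partial products until
+  // lane 0 holds the product of all elements.)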
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { + PacketMul4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template 
<> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { + return __riscv_vreinterpret_i64m4( + __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { + return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
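+  // (vloxei64 indices are byte offsets, hence the extra shift: (i & ~1) << 2 == (i / 2) * sizeof(int64_t).)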
+ return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { + return __riscv_vmv_x_s_i64m4_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { + PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { + PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), + unpacket_traits::size); + PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + 
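+// Note: the ptranspose kernels in this file go through a stack buffer: strided stores
+// (vsse) lay each packet down as a column, then unit-stride loads (vle) read the
+// transposed rows back. This avoids per-row vrgather index setup at the cost of a
+// round trip through memory.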
+/********************************* PacketMul2Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { + PacketMul2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { + return 
__riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { + return __riscv_vreinterpret_i64m2( + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
+ return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { + return __riscv_vmv_x_s_i64m2_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { + PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xl>::type + predux_half_dowto4(const PacketMul4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), 
__riscv_vget_v_i64m4_i64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXl>::type + predux_half_dowto4(const PacketMul2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size); +} + +/********************************* double ************************************/ + +typedef eigen_packet_wrapper PacketXd; +typedef eigen_packet_wrapper PacketMul2Xd; +typedef eigen_packet_wrapper PacketMul4Xd; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xd type; + typedef PacketMul2Xd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; // Half not yet implemented + typedef PacketXl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask64 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; + typedef PacketMul2Xl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketMul2Xd half; + typedef PacketMul4Xl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXd ************************************/ + +template <> 
+EIGEN_STRONG_INLINE PacketXd ptrue(const PacketXd& /*a*/) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pzero(const PacketXd& /*a*/) { + return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pabs(const PacketXd& a) { + return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + PacketXd idx = + __riscv_vfcvt_f_x_v_f64m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), unpacket_traits::size); + return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd padd(const PacketXd& a, const PacketXd& b) { + return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd psub(const PacketXd& a, const PacketXd& b) { + return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnegate(const PacketXd& a) { + return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pconj(const PacketXd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmul(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pdiv(const PacketXd& a, const PacketXd& b) { + return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), 
unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_le(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_eq(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXd pand(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd por(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pxor(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pandnot(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadquad(const 
    double* from) {
+  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1,
+                              unpacket_traits::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(double* to, const PacketXd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketXd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXd pgather(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketXd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst(const PacketXd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) {
+  const PacketXd limit = pset1(static_cast(1ull << 52));
+  const PacketXd abs_a = pabs(a);
+
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size);
+  const PacketXd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits::size);
+  const PacketXd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size),
+                                                   unpacket_traits::size);
+
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size);
+  PacketXd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size);
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) {
+  PacketXd tmp = print(a);
+  // If greater, subtract one.
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size);
+  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size),
+                                         unpacket_traits::size - 1, unpacket_traits::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) {
+  // Multiply the vector by its reverse
+  PacketXd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size);
+  PacketXd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) {
+  double buffer[unpacket_traits::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
+  // a & ~b on mask registers; vmor here would compute por instead.
+  return __riscv_vmandn_mm_b64(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketXd& a, const PacketXd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketXd& a, const PacketXd& b) {
+  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketXd pselect(const PacketMask64& mask, const PacketXd& a, const PacketXd& b) {
+  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size);
+}
+
+/********************************* PacketMul4Xd ************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) {
+  return __riscv_vfabs_v_f64m4(a, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) {
+  return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) {
+  PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)),
+                                               unpacket_traits::size);
+  return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) {
+  return __riscv_vfneg_v_f64m4(a, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
+  return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
+  return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
+  return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
+  return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMul4Xd nans =
+      __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size);
+
+  return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b,
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMul4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pand(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd por(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pload(const double* from) { + 
EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul4Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul4Xd& a) { + return __riscv_vfmv_f_s_f64m4_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { + return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { + const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul4Xd abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); + const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); + PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { + PacketMul4Xd tmp = print(a); + // If greater, subtract one. 
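+  // (print rounds to nearest; wherever the rounded tmp exceeds a, tmp - 1.0 is floor(a).)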
+ PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, PacketMul4Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { + PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), + unpacket_traits::size); + PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pldexp(const PacketMul4Xd& a, const PacketMul4Xd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul2Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketMul2Xd padd(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pdiv(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + 
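The comparison and min/max kernels above all share one pattern: a vmf* compare produces a lane mask, which is then either merged into a full-width all-ones/all-zeros packet (the pcmp_* family) or used to keep a preloaded NaN in lanes whose inputs failed the self-equality test (the plain pmin/pmax). A minimal scalar sketch of the element-wise contract these kernels target follows; it is illustrative only, not part of the patch, and the ref_* helper names are invented for this note.

#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>

// pcmp_le: a full-width per-lane bitmask, all ones when a <= b (which is
// false whenever either operand is NaN), otherwise all zeros.
static double ref_pcmp_le(double a, double b) {
  const std::uint64_t bits = (a <= b) ? 0xffffffffffffffffull : 0ull;
  double r;
  std::memcpy(&r, &bits, sizeof r);
  return r;
}

// Plain pmin and pmin<PropagateNaN>: NaN wins if either operand is NaN.
// This matches the vfmin_tumu(mask, nans, a, b, vl) idiom: lanes where
// either input fails a == a stay masked off and keep the quiet NaN.
static double ref_pmin_propagate_nan(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<double>::quiet_NaN();
  return b < a ? b : a;
}

// pmin<PropagateNumbers>: maps to a bare vfmin, which behaves like
// std::fmin and prefers the non-NaN operand.
static double ref_pmin_propagate_numbers(double a, double b) {
  return std::fmin(a, b);
}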
+template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather(const double* from, Index stride) { + return 
__riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul2Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) { + return __riscv_vfmv_f_s_f64m2_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) { + return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) { + const PacketMul2Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul2Xd abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); + const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); + PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { + PacketMul2Xd tmp = print(a); + // If greater, subtract one. + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE + typename 
std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xd>::type + predux_half_dowto4(const PacketMul4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXd>::type + predux_half_dowto4(const PacketMul2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size); +} + +/********************************* short **************************************/ + +typedef eigen_packet_wrapper PacketXs; +typedef eigen_packet_wrapper PacketXsu; + +typedef eigen_packet_wrapper PacketMul2Xs; +typedef eigen_packet_wrapper PacketMul2Xsu; + +typedef eigen_packet_wrapper PacketMul4Xs; +typedef eigen_packet_wrapper PacketMul4Xsu; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXs type; + typedef PacketXs half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xs type; + typedef PacketXs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xs type; + typedef PacketMul2Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul2Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXs ************************************/ + +template <> 
+EIGEN_STRONG_INLINE PacketXs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs plset(const numext::int16_t& a) { + PacketXs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)); + return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pzero(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs padd(const PacketXs& a, const PacketXs& b) { + return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs psub(const PacketXs& a, const PacketXs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnegate(const PacketXs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pconj(const PacketXs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmul(const PacketXs& a, const PacketXs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pdiv(const PacketXs& a, const PacketXs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmin(const PacketXs& a, const PacketXs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmax(const PacketXs& a, const PacketXs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_le(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_lt(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_eq(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ptrue(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pand(const PacketXs& a, const PacketXs& b) { + return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs por(const PacketXs& a, const PacketXs& b) { + return 
__riscv_vor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pxor(const PacketXs& a, const PacketXs& b) { + return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pandnot(const PacketXs& a, const PacketXs& b) { + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) { + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) { + return __riscv_vreinterpret_i16m1( + __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_left(PacketXs a) { + return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploaddup(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadquad(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketXs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketXs& a) { + return __riscv_vmv_x_s_i16m1_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) { + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) { + PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, 
unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketXs& a) { + // Multiply the vector by its reverse + PacketXs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); + PacketXs half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs plset(const numext::int16_t& a) { + PacketMul4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits::size)); + return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs padd(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs psub(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmul(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketMul4Xs pdiv(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmin(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmax(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ptrue(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pand(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs por(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vxor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { + return __riscv_vreinterpret_i16m4( + __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) { + return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD 
return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) { + return __riscv_vmv_x_s_i16m4_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) { + PacketMul4Xsu idx = + __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { + PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { + PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), + unpacket_traits::size); + PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1( + a, 
__riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul2Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs plset(const numext::int16_t& a) { + PacketMul2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits::size)); + return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pzero(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs padd(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vadd_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs psub(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmul(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pdiv(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmin(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmax(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = 
__riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ptrue(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pand(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs por(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vxor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) { + return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) { + return __riscv_vreinterpret_i16m2( + __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) { + return __riscv_vsll_vx_i16m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploaddup(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... 
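+  // vloxei16 indices are byte offsets, so 0,0,2,2,4,4,... selects elements 0,0,1,1,2,2,...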
+ return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) { + return __riscv_vmv_x_s_i16m2_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { + PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xs>::type + predux_half_dowto4(const PacketMul4Xs& a) { + return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), 
__riscv_vget_v_i16m4_i16m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXs>::type + predux_half_dowto4(const PacketMul2Xs& a) { + return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h new file mode 100644 index 000000000..085952fcd --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -0,0 +1,917 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_RVV10_H +#define EIGEN_PACKET_MATH_FP16_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; // Half not yet implemented + typedef PacketXs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; + typedef PacketMul2Xs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXh ************************************/ + +template <> 
+EIGEN_STRONG_INLINE PacketXh ptrue(const PacketXh& /*a*/) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pzero(const PacketXh& /*a*/) { + return __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pabs(const PacketXh& a) { + return __riscv_vfabs_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh plset(const Eigen::half& a) { + PacketXh idx = + __riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh padd(const PacketXh& a, const PacketXh& b) { + return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psub(const PacketXh& a, const PacketXh& b) { + return __riscv_vfsub_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnegate(const PacketXh& a) { + return __riscv_vfneg_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pconj(const PacketXh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmul(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmul_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pdiv(const PacketXh& a, const PacketXh& b) { + return __riscv_vfdiv_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmin_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), 
unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmax_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_le(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfle_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_eq(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m1(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXh pand(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh por(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pxor(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pandnot(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), + __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploaddup(const Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketXh ploadquad<PacketXh>(const Eigen::half* from) {
+  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXh>::size);
+  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketXh>::size), 1,
+                              unpacket_traits<PacketXh>::size);
+  return __riscv_vloxei16_v_f16m1(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const PacketXh& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from,
+                                                  unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const PacketXh& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from,
+                                                    unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXh pgather<Eigen::half, PacketXh>(const Eigen::half* from, Index stride) {
+  return __riscv_vlse16_v_f16m1(reinterpret_cast<const _Float16*>(from), stride * sizeof(Eigen::half),
+                                unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, PacketXh>(Eigen::half* to, const PacketXh& from, Index stride) {
+  __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<PacketXh>(const PacketXh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f_s_f16m1_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh psqrt<PacketXh>(const PacketXh& a) {
+  return __riscv_vfsqrt_v_f16m1(a, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh print<PacketXh>(const PacketXh& a) {
+  const PacketXh limit = pset1<PacketXh>(static_cast<Eigen::half>(1 << 10));
+  const PacketXh abs_a = pabs(a);
+
+  PacketMask16 mask = __riscv_vmfne_vv_f16m1_b16(a, a, unpacket_traits<PacketXh>::size);
+  const PacketXh x = __riscv_vfadd_vv_f16m1_tum(mask, a, a, a, unpacket_traits<PacketXh>::size);
+  const PacketXh new_x = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                                   unpacket_traits<PacketXh>::size);
+
+  mask = __riscv_vmflt_vv_f16m1_b16(abs_a, limit, unpacket_traits<PacketXh>::size);
+  PacketXh signed_x = __riscv_vfsgnj_vv_f16m1(new_x, x, unpacket_traits<PacketXh>::size);
+  return __riscv_vmerge_vvm_f16m1(x, signed_x, mask, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh pfloor<PacketXh>(const PacketXh& a) {
+  PacketXh tmp = print<PacketXh>(a);
+  // If greater, subtract one.
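+  // (print() rounds to the nearest integer, so lanes that rounded up must be
+  // pulled back down by one to get floor semantics.)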
+  PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, tmp, unpacket_traits<PacketXh>::size);
+  return __riscv_vfsub_vf_f16m1_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) {
+  PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXh>::size),
+                                         unpacket_traits<PacketXh>::size - 1, unpacket_traits<PacketXh>::size);
+  return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<PacketXh>(const PacketXh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1(
+      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<PacketXh>::size),
+      unpacket_traits<PacketXh>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<PacketXh>(const PacketXh& a) {
+  // Multiply the vector by its reverse
+  PacketXh prod = __riscv_vfmul_vv_f16m1(preverse(a), a, unpacket_traits<PacketXh>::size);
+  PacketXh half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  // The reduction is done to the first element.
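+  // (Each slide-down halves the number of lanes still in play, so lane 0 now
+  // holds the product of every element.)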
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE PacketMul2Xf half2float(const PacketXh& a) { + return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXh float2half(const PacketMul2Xf& a) { + return __riscv_vfncvt_f_f_w_f16m1(a, unpacket_traits::size); +} + +/********************************* PacketMul2Xh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ptrue(const PacketMul2Xh& /*a*/) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pzero(const PacketMul2Xh& /*a*/) { + return __riscv_vfmv_v_f_f16m2(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pabs(const PacketMul2Xh& a) { + return __riscv_vfabs_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh plset(const Eigen::half& a) { + PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh padd(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh psub(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnegate(const PacketMul2Xh& a) { + return __riscv_vfneg_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pconj(const PacketMul2Xh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmul(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pdiv(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmadd_vv_f16m2(a, b, c, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_le(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m2(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pand(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return 
__riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh por(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pxor(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pandnot(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploaddup(const Eigen::half* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadquad(const Eigen::half* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketMul2Xh& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketMul2Xh& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xh pgather(const Eigen::half* from, Index stride) { + return __riscv_vlse16_v_f16m2(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketMul2Xh& from, + Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f_s_f16m2_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh psqrt(const PacketMul2Xh& a) { + return __riscv_vfsqrt_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh print(const PacketMul2Xh& a) { + const PacketMul2Xh limit = pset1(static_cast(1 << 10)); + const PacketMul2Xh abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits::size); + const PacketMul2Xh x = 
__riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xh new_x = __riscv_vfcvt_f_x_v_f16m2( + __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits::size); + PacketMul2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pfloor(const PacketMul2Xh& a) { + PacketMul2Xh tmp = print(a); + // If greater, subtract one. + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast(1.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) { + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketMul2Xh& a) { + return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE PacketMul4Xf half2float(const PacketMul2Xh& a) { + return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) { + return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXh>::type + predux_half_dowto4(const PacketMul2Xh& a) { + return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size); +} + +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pcos) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexp) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexpm1) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog1p) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog2) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, preciprocal) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, prsqrt) 
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, psin) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, ptanh) + +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pcos) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexp) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexpm1) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog1p) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog2) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, preciprocal) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, prsqrt) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, psin) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, ptanh) + +/********************************* casting ************************************/ + +template <> +struct type_casting_traits<_Float16, numext::int16_t> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXh pcast(const PacketXs& a) { + return __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcast(const PacketXh& a) { + return __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh preinterpret(const PacketXs& a) { + return __riscv_vreinterpret_v_i16m1_f16m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXs preinterpret(const PacketXh& a) { + return __riscv_vreinterpret_v_f16m1_i16m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketMul2Xs& a) { + return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketMul2Xh& a) { + return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh preinterpret(const PacketMul2Xs& a) { + return __riscv_vreinterpret_v_i16m2_f16m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preinterpret(const PacketMul2Xh& a) { + return __riscv_vreinterpret_v_f16m2_i16m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXh& a, const PacketXh& b, const PacketXh& c, + const PacketXh& d) { + return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXs& a, const PacketXs& b) { + return __riscv_vcreate_v_f16m1_f16m2(__riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXh& a, const PacketXh& b) { + return __riscv_vcreate_v_f16m1_f16m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXh& a, const PacketXh& b) { + return __riscv_vcreate_v_i16m1_i16m2(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size)); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_FP16_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h new file mode 100644 index 000000000..67bc99d0b --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h @@ -0,0 +1,284 @@ +// This file is part of Eigen, a lightweight C++ template 
library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_RVV10_H +#define EIGEN_TYPE_CASTING_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* 32 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { + return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { + return __riscv_vreinterpret_v_i32m1_f32m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { + return __riscv_vreinterpret_v_f32m1_i32m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketMul4Xi& a) { + return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketMul4Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preinterpret(const PacketMul4Xi& a) { + return __riscv_vreinterpret_v_i32m4_f32m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi preinterpret(const PacketMul4Xf& a) { + return __riscv_vreinterpret_v_f32m4_i32m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketMul2Xi& a) { + return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketMul2Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preinterpret(const PacketMul2Xi& a) { + return __riscv_vreinterpret_v_i32m2_f32m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi preinterpret(const PacketMul2Xf& a) { + return __riscv_vreinterpret_v_f32m2_i32m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { + return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { + return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { + return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { + return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), + 
__riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXi& a, const PacketXi& b) { + return __riscv_vcreate_v_i32m1_i32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXi& a, const PacketXi& b) { + return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXf& a, const PacketXf& b) { + return __riscv_vcreate_v_f32m1_f32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXf& a, const PacketXf& b) { + return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); +} + +/********************************* 64 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXl& a) { + return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd preinterpret(const PacketXl& a) { + return __riscv_vreinterpret_v_i64m1_f64m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXl preinterpret(const PacketXd& a) { + return __riscv_vreinterpret_v_f64m1_i64m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketMul4Xl& a) { + return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketMul4Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preinterpret(const PacketMul4Xl& a) { + return __riscv_vreinterpret_v_i64m4_f64m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preinterpret(const PacketMul4Xd& a) { + return __riscv_vreinterpret_v_f64m4_i64m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketMul2Xl& a) { + return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketMul2Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preinterpret(const PacketMul2Xl& a) { + return __riscv_vreinterpret_v_i64m2_f64m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preinterpret(const PacketMul2Xd& a) { + return __riscv_vreinterpret_v_f64m2_i64m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { + return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d); + ; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { + return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd 
pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { + return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { + return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXl& a, const PacketXl& b) { + return __riscv_vcreate_v_i64m1_i64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXl& a, const PacketXl& b) { + return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXd& a, const PacketXd& b) { + return __riscv_vcreate_v_f64m1_f64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXd& a, const PacketXd& b) { + return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); +} + +/********************************* 16 bits ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXs& a, const PacketXs& b) { + return __riscv_vcreate_v_i16m1_i16m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, const PacketXs& c, + const PacketXs& d) { + return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_RVV10_H diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index ba7d97a03..326c6eab3 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -111,7 +111,11 @@ struct squared_norm_functor { } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { +#if defined EIGEN_VECTORIZE_RVV10 + return Packet(pmul(a.real, a.real), pmul(a.imag, a.imag)); +#else return Packet(pmul(a.v, a.v)); +#endif } }; template diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index ba72a8a4f..a691d092e 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -38,10 +38,21 @@ template ::ReturnType ResScalar; +#ifdef EIGEN_RISCV64_USE_RVV10 +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename std::conditional_t< \ + NumTraits::IsComplex || NumTraits::IsComplex, \ + typename packet_traits::type, \ + typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type> \ + name##Packet##postfix +#else #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename gemv_packet_cond< \ packet_size, typename packet_traits::type, typename packet_traits::half, \ typename unpacket_traits::half>::half>::type name##Packet##postfix +#endif PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 49f307c73..5be3e8028 100644 --- 
a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -68,6 +68,8 @@
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
+#elif defined(EIGEN_RISCV64_USE_RVV10)
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #endif
@@ -104,7 +106,7 @@
 // Only static alignment is really problematic (relies on nonstandard compiler extensions),
 // try to keep heap alignment even when we have to disable static alignment.
 #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
-                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64 || EIGEN_ARCH_RISCV)
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@@ -406,14 +408,48 @@ extern "C" {
 #define EIGEN_VECTORIZE_SVE
 #include <arm_sve.h>
 
-// Since we depend on knowing SVE vector lengths at compile-time, we need
-// to ensure a fixed lengths is set
+// Since we depend on knowing the SVE vector length at compile-time, we need
+// to ensure a fixed length is set
 #if defined __ARM_FEATURE_SVE_BITS
 #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
 #else
 #error "Eigen requires a fixed SVE vector length but EIGEN_ARM64_SVE_VL is not set."
 #endif
 
+#elif defined(EIGEN_ARCH_RISCV)
+
+#if defined(__riscv_zfh)
+#define EIGEN_HAS_BUILTIN_FLOAT16
+#endif
+
+// We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV10 and
+// will not select the backend automatically
+#if (defined EIGEN_RISCV64_USE_RVV10)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_RVV10
+#include <riscv_vector.h>
+
+// Since we depend on knowing the RVV vector length at compile-time, we need
+// to ensure a fixed length is set
+#if defined(__riscv_v_fixed_vlen)
+#define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#if __riscv_v_fixed_vlen >= 256
+#undef EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#endif
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#endif
+
+#if defined(__riscv_zvfh) && defined(__riscv_zfh)
+#define EIGEN_VECTORIZE_RVV10FP16
+#elif defined(__riscv_zvfh)
+#error "The Eigen::Half vectorization requires both the Zfh and Zvfh extensions."
+#endif
+
+#endif  // defined(EIGEN_RISCV64_USE_RVV10)
+
 #elif (defined __s390x__ && defined __VEC__)
 
 #define EIGEN_VECTORIZE
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index fcc2db822..8aba62b75 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -475,6 +475,7 @@ enum Type {
   SVE = 0x6,
   HVX = 0x7,
   LSX = 0x8,
+  RVV10 = 0x9,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -491,6 +492,8 @@ enum Type {
   Target = HVX
 #elif defined EIGEN_VECTORIZE_LSX
   Target = LSX
+#elif defined EIGEN_VECTORIZE_RVV10
+  Target = RVV10
 #else
   Target = Generic
 #endif
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 00d55577d..5f29a9c72 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -404,6 +404,13 @@
 #define EIGEN_ARCH_PPC 0
 #endif
 
+/// \internal EIGEN_ARCH_RISCV set to 1 if the architecture is RISC-V.
+#if defined(__riscv) +#define EIGEN_ARCH_RISCV 1 +#else +#define EIGEN_ARCH_RISCV 0 +#endif + //------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* //------------------------------------------------------------------------------------------ @@ -976,7 +983,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) -#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) +#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_RISCV) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a0e160eba..e91a14e9d 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -264,7 +264,7 @@ struct functor_cost { static constexpr Index Cost = plain_enum_max(nested_functor_cost::Cost, 1); }; -template +template struct packet_traits; template @@ -285,9 +285,12 @@ struct find_best_packet_helper { typedef typename find_best_packet_helper::half>::type type; }; -template +template +struct find_best_packet; + +template struct find_best_packet { - typedef typename find_best_packet_helper::type>::type type; + typedef typename find_best_packet_helper::type>::type type; }; template () * diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 2686a5237..9bc9b1099 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -301,12 +301,25 @@ template { static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) { +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef + typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, typename packet_traits::type> + Packet; + typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, + typename packet_traits::type> + OtherPacket; + + constexpr Index PacketSize = unpacket_traits::size; +#else typedef typename packet_traits::type Packet; typedef typename packet_traits::type OtherPacket; - constexpr int RequiredAlignment = - (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); constexpr Index PacketSize = packet_traits::size; +#endif + constexpr int RequiredAlignment = + (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); /*** dynamic-size vectorized paths ***/ if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 76475923f..4259b61b1 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -1816,6 +1816,19 @@ EIGEN_DECLARE_TEST(packetmath) { CALL_SUBTEST_14((packetmath::type>())); CALL_SUBTEST_14((packetmath_scatter_gather::type>())); CALL_SUBTEST_15(test::runner::run()); +#ifdef EIGEN_RISCV64_USE_RVV10 + CALL_SUBTEST_16((test::runner::type>::run())); + CALL_SUBTEST_17((test::runner::type>::run())); + CALL_SUBTEST_18((test::runner::type>::run())); + CALL_SUBTEST_19((test::runner::type>::run())); + CALL_SUBTEST_20((test::runner::type>::run())); + CALL_SUBTEST_21((test::runner::type>::run())); + CALL_SUBTEST_22((test::runner::type>::run())); + CALL_SUBTEST_23((test::runner::type>::run())); + CALL_SUBTEST_24((test::runner::type>::run())); + 
CALL_SUBTEST_25((test::runner::type>::run())); + CALL_SUBTEST_26((test::runner::type>::run())); +#endif g_first_pass = false; } } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 724fa40ba..02d46cef8 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -107,7 +107,11 @@ template ::Vector struct vectorization_logic { typedef internal::packet_traits PacketTraits; +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename internal::packet_traits::type PacketType; +#else typedef typename internal::packet_traits::type PacketType; +#endif typedef typename internal::unpacket_traits::half HalfPacketType; enum { PacketSize = internal::unpacket_traits::size, -- GitLab From 98d433188548d6442bfffd525db0839238faa1b0 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 15 Jul 2025 20:37:51 +0000 Subject: [PATCH 02/21] Fix size for PacketMul2X predux functions. --- Eigen/src/Core/arch/RVV10/PacketMath.h | 33 ++++++++++++-------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index e59d198d6..4fd3eb75f 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -906,7 +906,7 @@ EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { template <> EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> @@ -918,14 +918,14 @@ EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& template <> EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 2), unpacket_traits::size)); } @@ -2091,7 +2091,7 @@ EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, Pac template <> EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> @@ -2103,14 +2103,14 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } @@ 
-3013,7 +3013,7 @@ EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { template <> EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> @@ -3025,14 +3025,14 @@ EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& template <> EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), unpacket_traits::size)); } @@ -3654,13 +3654,10 @@ EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); } -#if 1 template <> EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { -// return padd(pmul(a, b), c); return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); } -#endif template <> EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { @@ -4195,7 +4192,7 @@ EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, Pac template <> EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> @@ -4207,14 +4204,14 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } @@ -5119,7 +5116,7 @@ EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { template <> EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> @@ -5131,14 +5128,14 @@ EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& template <> EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + a, 
__riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 2), unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 2), unpacket_traits::size)); } -- GitLab From 2489bc0d558bbc25ca02e9437201d01fbb704428 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 16 Jul 2025 15:19:49 +0000 Subject: [PATCH 03/21] Fix NaN propagation in predux_min and max. --- Eigen/src/Core/arch/RVV10/PacketMath.h | 72 +++++++++++++------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index 4fd3eb75f..957f5d9c3 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -1437,16 +1437,16 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( - a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template @@ -1782,16 +1782,16 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( - a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template @@ -2102,16 +2102,16 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), 
(std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template @@ -3531,16 +3531,16 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template @@ -3882,16 +3882,16 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( - a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template @@ -4203,16 +4203,16 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), + unpacket_traits::size)), -(std::numeric_limits::max)()); } 
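+
+// Seeding these ordered min/max reductions with quiet_NaN() (instead of the
+// old +/-max identity) keeps an all-NaN input from reducing to the seed
+// value; the surrounding (std::min)/(std::max) clamp leaves finite results
+// unchanged.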
template -- GitLab From 322e087690e482766ddf2781d37d3a66d3e3f2cd Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 16 Jul 2025 15:22:38 +0000 Subject: [PATCH 04/21] Add EIGEN_RISCV64_DEFAULT_LMUL to control the default LMul for vectors. --- Eigen/src/Core/AssignEvaluator.h | 4 ++-- Eigen/src/Core/CoreEvaluators.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 4 ++-- Eigen/src/Core/Redux.h | 2 +- Eigen/src/Core/util/ConfigureVectorization.h | 7 +++++++ test/vectorization_logic.cpp | 2 +- 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 093ceb435..467347546 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -65,8 +65,8 @@ struct copy_using_evaluator_traits { // TODO distinguish between linear traversal and inner-traversals #ifdef EIGEN_RISCV64_USE_RVV10 - using LinearPacketType = typename find_best_packet::type; - using InnerPacketType = typename find_best_packet::type; + using LinearPacketType = typename find_best_packet::type; + using InnerPacketType = typename find_best_packet::type; #else using LinearPacketType = typename find_best_packet::type; using InnerPacketType = typename find_best_packet::type; diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 87df2b3f4..162175edc 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1370,7 +1370,7 @@ struct evaluator> typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types #ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename packet_traits::type PacketScalar; + typedef typename packet_traits::type PacketScalar; #else typedef typename packet_traits::type PacketScalar; #endif diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 8da7e0372..1051e77bf 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -534,8 +534,8 @@ struct product_evaluator, ProductTag, DenseShape, }; #ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename find_best_packet::type LhsVecPacketType; - typedef typename find_best_packet::type RhsVecPacketType; + typedef typename find_best_packet::type LhsVecPacketType; + typedef typename find_best_packet::type RhsVecPacketType; #else typedef typename find_best_packet::type LhsVecPacketType; typedef typename find_best_packet::type RhsVecPacketType; diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 716a7c00e..b4a3d8f6d 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -30,7 +30,7 @@ template struct redux_traits { public: #ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; #else typedef typename find_best_packet::type PacketType; #endif diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 5be3e8028..b8e63e40e 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -534,6 +534,13 @@ extern "C" { #include #endif +#if defined(__riscv) +// Defines the default LMUL for RISC-V +#ifndef EIGEN_RISCV64_DEFAULT_LMUL +#define EIGEN_RISCV64_DEFAULT_LMUL 4 +#endif +#endif + /** \brief Namespace containing all symbols from the %Eigen library. 
*/ // IWYU pragma: private #include "../InternalHeaderCheck.h" diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 02d46cef8..7f346814b 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -108,7 +108,7 @@ struct vectorization_logic { typedef internal::packet_traits PacketTraits; #ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename internal::packet_traits::type PacketType; + typedef typename internal::packet_traits::type PacketType; #else typedef typename internal::packet_traits::type PacketType; #endif -- GitLab From 6ddcc1b97539a95d3c23b141d3bf749e67fefe8b Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 16:18:41 +0000 Subject: [PATCH 05/21] Remove HasRint, HasCeil, HasFloor and replace with HasRound --- Eigen/src/Core/arch/RVV10/PacketMath.h | 18 ++++++------------ Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 6 ++---- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index 957f5d9c3..b1a7d01cc 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -994,8 +994,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1034,8 +1033,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1074,8 +1072,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -3101,8 +3098,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasLog = 1, HasExp = 1, @@ -3137,8 +3133,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasLog = 1, HasExp = 1, @@ -3173,8 +3168,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasLog = 1, HasExp = 1, diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index 085952fcd..d551c78e1 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -46,8 +46,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -86,8 +85,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasFloor = 1, - HasRint = 1, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -- GitLab From 0cf675b5643a082b3fafd836f2c4fce318c909ae Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 16:22:32 +0000 Subject: [PATCH 06/21] Since there is no fround function in RVV, turn flag off. 
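Note on the flag's effect: when packet_traits<T>::HasRound is 0, Eigen is expected to fall back to the scalar path (numext::round per coefficient) instead of a vectorized pround. The distinction matters because round() must break ties away from zero, while RVV's float-to-integer conversions follow the selected vector rounding mode (typically RNE, i.e. rint()-style ties-to-even), so the two are not interchangeable. A minimal scalar illustration of the difference (standard C++, not Eigen code):

    #include <cassert>
    #include <cmath>

    int main() {
      // std::round breaks halfway cases away from zero.
      assert(std::round(2.5) == 3.0);
      assert(std::round(-2.5) == -3.0);
      // std::rint follows the current FP rounding mode; the default
      // FE_TONEAREST mode breaks ties to even, so 2.5 rounds to 2.
      assert(std::rint(2.5) == 2.0);
      assert(std::rint(3.5) == 4.0);
      return 0;
    }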
--- Eigen/src/Core/arch/RVV10/PacketMath.h | 12 ++++++------ Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index b1a7d01cc..c9ea9bb8c 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -994,7 +994,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1033,7 +1033,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1072,7 +1072,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -3098,7 +3098,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasLog = 1, HasExp = 1, @@ -3133,7 +3133,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasLog = 1, HasExp = 1, @@ -3168,7 +3168,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasLog = 1, HasExp = 1, diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index d551c78e1..9322394bf 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -46,7 +46,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -85,7 +85,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 1, + HasRound = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -- GitLab From ede26a80644745c5fee9219095c66810e3a67375 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 16:38:11 +0000 Subject: [PATCH 07/21] Fix clang format issue. --- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index e3d335cdc..cf2bb063e 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -537,7 +537,7 @@ extern "C" { #if defined(__riscv) // Defines the default LMUL for RISC-V #ifndef EIGEN_RISCV64_DEFAULT_LMUL -#define EIGEN_RISCV64_DEFAULT_LMUL 4 +#define EIGEN_RISCV64_DEFAULT_LMUL 4 #endif #endif -- GitLab From bc57183c4993ab3af6851fb76e37e23f16c8ffb3 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 16:53:48 +0000 Subject: [PATCH 08/21] Revert previous HasRound change. clang-format fixes. 
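For reference, the reformatted reductions below keep the NaN-aware shape introduced earlier: the vector reduction is seeded with quiet_NaN and the scalar result is then passed through (std::min) or (std::max) against the type's +/-max. That outer call is an identity for every finite reduction result and hands a NaN through unchanged, since no ordering comparison holds for NaN. A scalar sketch of that behaviour (plain C++, illustration only):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
      const float mx = (std::numeric_limits<float>::max)();
      const float nan = std::numeric_limits<float>::quiet_NaN();
      // A finite reduction result passes through unchanged...
      assert((std::min)(1.5f, mx) == 1.5f);
      // ...while a NaN first argument is returned as-is, because
      // std::min(a, b) yields a whenever !(b < a).
      assert(std::isnan((std::min)(nan, mx)));
      return 0;
    }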
--- Eigen/src/Core/arch/RVV10/PacketMath.h | 242 +++++++++++++-------- Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 10 +- 2 files changed, 155 insertions(+), 97 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index c9ea9bb8c..fcea6c08f 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -51,11 +51,15 @@ typedef vbool4_t PacketMask4; typedef eigen_packet_wrapper PacketXi; typedef eigen_packet_wrapper PacketXu; -typedef eigen_packet_wrapper PacketMul2Xi; -typedef eigen_packet_wrapper PacketMul2Xu; +typedef eigen_packet_wrapper + PacketMul2Xi; +typedef eigen_packet_wrapper + PacketMul2Xu; -typedef eigen_packet_wrapper PacketMul4Xi; -typedef eigen_packet_wrapper PacketMul4Xu; +typedef eigen_packet_wrapper + PacketMul4Xi; +typedef eigen_packet_wrapper + PacketMul4Xu; template <> struct packet_traits : default_packet_traits { @@ -945,18 +949,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xi>::type - predux_half_dowto4(const PacketMul4Xi& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xi>::type +predux_half_dowto4(const PacketMul4Xi& a) { return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXi>::type - predux_half_dowto4(const PacketMul2Xi& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXi>::type +predux_half_dowto4(const PacketMul2Xi& a) { return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), unpacket_traits::size); } @@ -964,8 +968,10 @@ EIGEN_STRONG_INLINE /********************************* float32 ************************************/ typedef eigen_packet_wrapper PacketXf; -typedef eigen_packet_wrapper PacketMul2Xf; -typedef eigen_packet_wrapper PacketMul4Xf; +typedef eigen_packet_wrapper + PacketMul2Xf; +typedef eigen_packet_wrapper + PacketMul4Xf; template <> struct packet_traits : default_packet_traits { @@ -994,7 +1000,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1033,7 +1039,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1072,7 +1078,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -1164,8 +1170,9 @@ EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { template <> EIGEN_STRONG_INLINE PacketXf plset(const float& a) { - PacketXf idx = - __riscv_vfcvt_f_x_v_f32m1(__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)), unpacket_traits::size); + PacketXf idx = __riscv_vfcvt_f_x_v_f32m1( + __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); } @@ -1434,16 +1441,22 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( - a, 
__riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), (std::numeric_limits::max)()); + return ( + std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), -(std::numeric_limits::max)()); + return ( + std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1517,8 +1530,9 @@ EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t fr template <> EIGEN_STRONG_INLINE PacketMul4Xf plset(const float& a) { - PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)), - unpacket_traits::size); + PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); } @@ -1780,15 +1794,21 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), - unpacket_traits::size)), (std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), - unpacket_traits::size)), -(std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -1840,8 +1860,9 @@ EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits(numext::uint32_t fr template <> EIGEN_STRONG_INLINE PacketMul2Xf plset(const float& a) { - PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)), - unpacket_traits::size); + PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); } @@ -2100,15 +2121,21 @@ EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { template <> EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), - unpacket_traits::size)), (std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { return 
(std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), - unpacket_traits::size)), -(std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -2133,18 +2160,18 @@ EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, con template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xf>::type - predux_half_dowto4(const PacketMul4Xf& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xf>::type +predux_half_dowto4(const PacketMul4Xf& a) { return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXf>::type - predux_half_dowto4(const PacketMul2Xf& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXf>::type +predux_half_dowto4(const PacketMul2Xf& a) { return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), unpacket_traits::size); } @@ -2154,11 +2181,15 @@ EIGEN_STRONG_INLINE typedef eigen_packet_wrapper PacketXl; typedef eigen_packet_wrapper PacketXul; -typedef eigen_packet_wrapper PacketMul2Xl; -typedef eigen_packet_wrapper PacketMul2Xul; +typedef eigen_packet_wrapper + PacketMul2Xl; +typedef eigen_packet_wrapper + PacketMul2Xul; -typedef eigen_packet_wrapper PacketMul4Xl; -typedef eigen_packet_wrapper PacketMul4Xul; +typedef eigen_packet_wrapper + PacketMul4Xl; +typedef eigen_packet_wrapper + PacketMul4Xul; template <> struct packet_traits : default_packet_traits { @@ -3049,18 +3080,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xl>::type - predux_half_dowto4(const PacketMul4Xl& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xl>::type +predux_half_dowto4(const PacketMul4Xl& a) { return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXl>::type - predux_half_dowto4(const PacketMul2Xl& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXl>::type +predux_half_dowto4(const PacketMul2Xl& a) { return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), unpacket_traits::size); } @@ -3068,8 +3099,10 @@ EIGEN_STRONG_INLINE /********************************* double ************************************/ typedef eigen_packet_wrapper PacketXd; -typedef eigen_packet_wrapper PacketMul2Xd; -typedef eigen_packet_wrapper PacketMul4Xd; +typedef eigen_packet_wrapper + PacketMul2Xd; +typedef eigen_packet_wrapper + PacketMul4Xd; template <> struct packet_traits : default_packet_traits { @@ -3098,7 +3131,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasLog = 1, HasExp = 1, @@ -3133,7 +3166,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasLog = 1, HasExp = 1, @@ -3168,7 +3201,7 @@ struct packet_traits : 
default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasLog = 1, HasExp = 1, @@ -3256,8 +3289,9 @@ EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { template <> EIGEN_STRONG_INLINE PacketXd plset(const double& a) { - PacketXd idx = - __riscv_vfcvt_f_x_v_f64m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), unpacket_traits::size); + PacketXd idx = __riscv_vfcvt_f_x_v_f64m1( + __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); } @@ -3525,16 +3559,22 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), (std::numeric_limits::max)()); + return ( + std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), -(std::numeric_limits::max)()); + return ( + std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -3613,8 +3653,9 @@ EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t fr template <> EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { - PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), - unpacket_traits::size); + PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); } @@ -3877,15 +3918,21 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), - unpacket_traits::size)), (std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 4), - unpacket_traits::size)), -(std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -3937,8 +3984,9 @@ EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t fr template <> EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { - PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), - 
unpacket_traits::size); + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), + unpacket_traits::size); return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); } @@ -4198,15 +4246,21 @@ EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { template <> EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), - unpacket_traits::size)), (std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size / 2), - unpacket_traits::size)), -(std::numeric_limits::max)()); + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template @@ -4231,18 +4285,18 @@ EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, con template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xd>::type - predux_half_dowto4(const PacketMul4Xd& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xd>::type +predux_half_dowto4(const PacketMul4Xd& a) { return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXd>::type - predux_half_dowto4(const PacketMul2Xd& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXd>::type +predux_half_dowto4(const PacketMul2Xd& a) { return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), unpacket_traits::size); } @@ -4252,11 +4306,15 @@ EIGEN_STRONG_INLINE typedef eigen_packet_wrapper PacketXs; typedef eigen_packet_wrapper PacketXsu; -typedef eigen_packet_wrapper PacketMul2Xs; -typedef eigen_packet_wrapper PacketMul2Xsu; +typedef eigen_packet_wrapper + PacketMul2Xs; +typedef eigen_packet_wrapper + PacketMul2Xsu; -typedef eigen_packet_wrapper PacketMul4Xs; -typedef eigen_packet_wrapper PacketMul4Xsu; +typedef eigen_packet_wrapper + PacketMul4Xs; +typedef eigen_packet_wrapper + PacketMul4Xsu; template <> struct packet_traits : default_packet_traits { @@ -5149,18 +5207,18 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xs>::type - predux_half_dowto4(const PacketMul4Xs& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xs>::type +predux_half_dowto4(const PacketMul4Xs& a) { return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1), unpacket_traits::size); } template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXs>::type - predux_half_dowto4(const PacketMul2Xs& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXs>::type +predux_half_dowto4(const PacketMul2Xs& a) { return 
__riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), unpacket_traits::size); } diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index 9322394bf..9b6564f00 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -46,7 +46,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -85,7 +85,7 @@ struct packet_traits : default_packet_traits { HasCmp = 1, HasDiv = 1, - HasRound = 0, + HasRound = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, @@ -802,9 +802,9 @@ EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) { template EIGEN_STRONG_INLINE - typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketXh>::type - predux_half_dowto4(const PacketMul2Xh& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXh>::type +predux_half_dowto4(const PacketMul2Xh& a) { return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), unpacket_traits::size); } -- GitLab From c63c974dd025f70d66e89d9e0a0a6f005241b289 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 19:29:09 +0000 Subject: [PATCH 09/21] Add pmsub to complex forms in RVV to fix compilation issue. --- Eigen/src/Core/arch/RVV10/Complex.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h index 73ef50cc5..e8a782ed1 100644 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -409,6 +409,9 @@ struct conj_helper { EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { return padd(c, this->pmul(x, y)); } + EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { + return psub(c, this->pmul(x, y)); + } EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { return PacketXcf(Eigen::internal::pmul(x, pcast(y))); } @@ -419,6 +422,9 @@ struct conj_helper { EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { return padd(c, this->pmul(x, y)); } + EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { + return psub(c, this->pmul(x, y)); + } EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { return PacketXcf(Eigen::internal::pmul(pcast(x), y)); } @@ -691,6 +697,9 @@ struct conj_helper { EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { return padd(c, this->pmul(x, y)); } + EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { + return psub(c, this->pmul(x, y)); + } EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { return PacketXcd(Eigen::internal::pmul(x, pcast(y))); } @@ -701,6 +710,9 @@ struct conj_helper { EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { return padd(c, this->pmul(x, y)); } + EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { + return psub(c, this->pmul(x, y)); + } EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& 
y) const { return PacketXcd(Eigen::internal::pmul(pcast(x), y)); } -- GitLab From a08e4179240c8e4f701c640e21ea9f681ee34ffe Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 20:16:27 +0000 Subject: [PATCH 10/21] Fix inverted pmsub parameters. --- Eigen/src/Core/arch/RVV10/Complex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h index e8a782ed1..a17ac016c 100644 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -423,7 +423,7 @@ struct conj_helper { return padd(c, this->pmul(x, y)); } EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { - return psub(c, this->pmul(x, y)); + return psub(this->pmul(x, y), c); } EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { return PacketXcf(Eigen::internal::pmul(pcast(x), y)); @@ -698,7 +698,7 @@ struct conj_helper { return padd(c, this->pmul(x, y)); } EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { - return psub(c, this->pmul(x, y)); + return psub(this->pmul(x, y), c); } EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { return PacketXcd(Eigen::internal::pmul(x, pcast(y))); @@ -711,7 +711,7 @@ struct conj_helper { return padd(c, this->pmul(x, y)); } EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { - return psub(c, this->pmul(x, y)); + return psub(this->pmul(x, y), c); } EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { return PacketXcd(Eigen::internal::pmul(pcast(x), y)); -- GitLab From 1a69fa82cd0c55a8ff270ce07541099a17035de5 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 17 Oct 2025 20:17:19 +0000 Subject: [PATCH 11/21] Missed one case in previous change. --- Eigen/src/Core/arch/RVV10/Complex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h index a17ac016c..2840542f0 100644 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -410,7 +410,7 @@ struct conj_helper { return padd(c, this->pmul(x, y)); } EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { - return psub(c, this->pmul(x, y)); + return psub(this->pmul(x, y), c); } EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { return PacketXcf(Eigen::internal::pmul(x, pcast(y))); -- GitLab From f1298d8b1cfc5cbefe7904f12fb7934de11621df Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 6 Nov 2025 20:32:04 +0000 Subject: [PATCH 12/21] Initial attempt at a single fixed-size packet.
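The idea: keep the LMUL-specific wrappers (PacketMul1Xi, PacketMul2Xi, PacketMul4Xi and their float/double equivalents) but make PacketXi, PacketXu, etc. plain aliases for whichever of them EIGEN_RISCV64_DEFAULT_LMUL selects, so the RVV-specific find_best_packet/packet_traits branches can be dropped from the core headers. Because ConfigureVectorization.h only defines the macro when it is unset (defaulting to 4), a client can choose the width itself; a hypothetical usage sketch:

    // Hypothetical client translation unit: select the m2-register
    // packets as the default before any Eigen header is included
    // (equivalently, compile with -DEIGEN_RISCV64_DEFAULT_LMUL=2).
    #define EIGEN_RISCV64_DEFAULT_LMUL 2  // PacketXi becomes PacketMul2Xi
    #include <Eigen/Core>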
--- Eigen/Core | 4 +- Eigen/src/Core/AssignEvaluator.h | 5 - Eigen/src/Core/CoreEvaluators.h | 4 - Eigen/src/Core/GenericPacketMath.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 5 - Eigen/src/Core/Redux.h | 4 - Eigen/src/Core/arch/RVV10/Complex.h | 216 +-- Eigen/src/Core/arch/RVV10/MathFunctions.h | 4 +- Eigen/src/Core/arch/RVV10/PacketMath.h | 1521 +++++++++-------- Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 5 +- Eigen/src/Core/arch/RVV10/TypeCasting.h | 126 +- Eigen/src/Core/products/GeneralMatrixVector.h | 11 - Eigen/src/Core/util/XprHelper.h | 9 +- Eigen/src/Jacobi/Jacobi.h | 15 +- test/packetmath.cpp | 13 - test/vectorization_logic.cpp | 4 - 16 files changed, 974 insertions(+), 974 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index e1e89f985..358af27d3 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -274,7 +274,7 @@ using std::ptrdiff_t; #include "src/Core/arch/RVV10/PacketMath.h" #include "src/Core/arch/RVV10/TypeCasting.h" #include "src/Core/arch/RVV10/MathFunctions.h" -#include "src/Core/arch/RVV10/Complex.h" +//#include "src/Core/arch/RVV10/Complex.h" #if defined EIGEN_VECTORIZE_RVV10FP16 #include "src/Core/arch/RVV10/PacketMathFP16.h" #endif @@ -428,7 +428,7 @@ using std::ptrdiff_t; #endif #if defined(EIGEN_VECTORIZE_RVV10) -#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h" +//#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h" #endif #include "src/Core/Select.h" diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index 467347546..36f0a9d74 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -64,13 +64,8 @@ struct copy_using_evaluator_traits { static constexpr int OuterStride = outer_stride_at_compile_time::ret; // TODO distinguish between linear traversal and inner-traversals -#ifdef EIGEN_RISCV64_USE_RVV10 - using LinearPacketType = typename find_best_packet::type; - using InnerPacketType = typename find_best_packet::type; -#else using LinearPacketType = typename find_best_packet::type; using InnerPacketType = typename find_best_packet::type; -#endif static constexpr int LinearPacketSize = unpacket_traits::size; static constexpr int InnerPacketSize = unpacket_traits::size; diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 4efe49476..60857e2cc 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -1380,11 +1380,7 @@ struct evaluator> typedef Block XprType; typedef typename XprType::Scalar Scalar; // TODO: should check for smaller packet types once we can handle multi-sized packet types -#ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename packet_traits::type PacketScalar; -#else typedef typename packet_traits::type PacketScalar; -#endif enum { CoeffReadCost = evaluator::CoeffReadCost, diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 6c5370b44..64e11231e 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -105,7 +105,7 @@ struct default_packet_traits { }; }; -template +template struct packet_traits : default_packet_traits { typedef T type; typedef T half; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 034603816..be55be5e8 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -533,13 +533,8 @@ struct product_evaluator, ProductTag, DenseShape, MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime }; -#ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename find_best_packet::type 
LhsVecPacketType; - typedef typename find_best_packet::type RhsVecPacketType; -#else typedef typename find_best_packet::type LhsVecPacketType; typedef typename find_best_packet::type RhsVecPacketType; -#endif enum { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index b4a3d8f6d..4e9ab0e4f 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -29,11 +29,7 @@ namespace internal { template struct redux_traits { public: -#ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename find_best_packet::type PacketType; -#else typedef typename find_best_packet::type PacketType; -#endif enum { PacketSize = unpacket_traits::size, InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime : Evaluator::MaxRowsAtCompileTime, diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h index 2840542f0..b330ca4f8 100644 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -21,15 +21,15 @@ namespace internal { struct PacketXcf { EIGEN_STRONG_INLINE PacketXcf() {} - EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul1Xf& _real, const PacketMul1Xf& _imag) : real(_real), imag(_imag) {} EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) : real(__riscv_vget_v_f32m2_f32m1(a, 0)), imag(__riscv_vget_v_f32m2_f32m1(a, 1)) {} - PacketXf real; - PacketXf imag; + PacketMul1Xf real; + PacketMul1Xf imag; }; -template -struct packet_traits, LMul> : default_packet_traits { +template <> +struct packet_traits> : default_packet_traits { typedef PacketXcf type; typedef PacketXcf half; enum { @@ -79,111 +79,111 @@ EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& template <> EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { - PacketXf real = pset1(from.real()); - PacketXf imag = pset1(from.imag()); + PacketMul1Xf real = pset1(from.real()); + PacketMul1Xf imag = pset1(from.imag()); return PacketXcf(real, imag); } template <> EIGEN_STRONG_INLINE PacketXcf padd(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); + return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf psub(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); + return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf pnegate(const PacketXcf& a) { - return PacketXcf(pnegate(a.real), pnegate(a.imag)); + return PacketXcf(pnegate(a.real), pnegate(a.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { return PacketXcf( a.real, __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), - 0x80000000, unpacket_traits::size))); + 0x80000000, unpacket_traits::size))); } template <> EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& a, const PacketXcf& b) { - PacketXf v1 = pmul(a.real, b.real); - PacketXf v2 = pmul(a.imag, b.imag); - PacketXf v3 = pmul(a.real, b.imag); - PacketXf v4 = pmul(a.imag, b.real); - return PacketXcf(psub(v1, v2), padd(v3, v4)); + PacketMul1Xf v1 = pmul(a.real, b.real); + PacketMul1Xf v2 = pmul(a.imag, b.imag); + PacketMul1Xf v3 = pmul(a.real, b.imag); + PacketMul1Xf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); } template <> EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const PacketXcf& b, const 
PacketXcf& c) { - PacketXf v1 = pmadd(a.real, b.real, c.real); - PacketXf v2 = pmul(a.imag, b.imag); - PacketXf v3 = pmadd(a.real, b.imag, c.imag); - PacketXf v4 = pmul(a.imag, b.real); - return PacketXcf(psub(v1, v2), padd(v3, v4)); + PacketMul1Xf v1 = pmadd(a.real, b.real, c.real); + PacketMul1Xf v2 = pmul(a.imag, b.imag); + PacketMul1Xf v3 = pmadd(a.real, b.imag, c.imag); + PacketMul1Xf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); } template <> EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { PacketMask32 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); - PacketXf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + PacketMul1Xf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); return PacketXcf(res, res); } template <> EIGEN_STRONG_INLINE PacketXcf pand(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pand(a.real, b.real), pand(a.imag, b.imag)); + return PacketXcf(pand(a.real, b.real), pand(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf por(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(por(a.real, b.real), por(a.imag, b.imag)); + return PacketXcf(por(a.real, b.real), por(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag)); + return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); + return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) { - vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); EIGEN_DEBUG_ALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) { - vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { - PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + PacketMul1Xu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + PacketMul1Xu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... 
- return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { - PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + PacketMul1Xu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + PacketMul1Xu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... - return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); } template <> @@ -199,14 +199,14 @@ EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, Index stride) { vfloat32m1x2_t res = - __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); + __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); } @@ -216,17 +216,17 @@ EIGEN_DEVICE_FUNC inline void pscatter, PacketXcf>(std::comp vfloat32m1x2_t from_rvv_type = __riscv_vundefined_f32m1x2(); from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 0, from.real); from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 1, from.imag); - __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); + __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); } template <> EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcf& a) { - return std::complex(pfirst(a.real), pfirst(a.imag)); + return std::complex(pfirst(a.real), pfirst(a.imag)); } template <> EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { - return PacketXcf(preverse(a.real), preverse(a.imag)); + return PacketXcf(preverse(a.real), preverse(a.imag)); } template <> @@ -236,33 +236,33 @@ EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { template <> EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { - return std::complex(predux(a.real), predux(a.imag)); + return std::complex(predux(a.real), predux(a.imag)); } template <> 
EIGEN_STRONG_INLINE PacketXcf pdiv(const PacketXcf& a, const PacketXcf& b) { PacketXcf b_conj = pconj(b); PacketXcf dividend = pmul(a, b_conj); - PacketXf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); - return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); + PacketMul1Xf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer_real[unpacket_traits::size * N]; - float buffer_imag[unpacket_traits::size * N]; + float buffer_real[unpacket_traits::size * N]; + float buffer_imag[unpacket_traits::size * N]; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); - __riscv_vsse32(&buffer_imag[i], N * sizeof(float), kernel.packet[i].imag, unpacket_traits::size); + __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse32(&buffer_imag[i], N * sizeof(float), kernel.packet[i].imag, unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i].real = - __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); kernel.packet[i].imag = - __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); } } @@ -434,15 +434,15 @@ struct conj_helper { struct PacketXcd { EIGEN_STRONG_INLINE PacketXcd() {} - EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, const PacketXd& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul1Xd& _real, const PacketMul1Xd& _imag) : real(_real), imag(_imag) {} EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) : real(__riscv_vget_v_f64m2_f64m1(a, 0)), imag(__riscv_vget_v_f64m2_f64m1(a, 1)) {} - PacketXd real; - PacketXd imag; + PacketMul1Xd real; + PacketMul1Xd imag; }; -template -struct packet_traits, LMul> : default_packet_traits { +template <> +struct packet_traits> : default_packet_traits { typedef PacketXcd type; typedef PacketXcd half; enum { @@ -492,113 +492,113 @@ EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& template <> EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { - PacketXd real = pset1(from.real()); - PacketXd imag = pset1(from.imag()); + PacketMul1Xd real = pset1(from.real()); + PacketMul1Xd imag = pset1(from.imag()); return PacketXcd(real, imag); } template <> EIGEN_STRONG_INLINE PacketXcd padd(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); + return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd psub(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); + return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { - return PacketXcd(pnegate(a.real), pnegate(a.imag)); + return PacketXcd(pnegate(a.real), pnegate(a.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { return PacketXcd( a.real, __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vx_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, 
unpacket_traits::size))); + __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); } template <> EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& a, const PacketXcd& b) { - PacketXd v1 = pmul(a.real, b.real); - PacketXd v2 = pmul(a.imag, b.imag); - PacketXd v3 = pmul(a.real, b.imag); - PacketXd v4 = pmul(a.imag, b.real); - return PacketXcd(psub(v1, v2), padd(v3, v4)); + PacketMul1Xd v1 = pmul(a.real, b.real); + PacketMul1Xd v2 = pmul(a.imag, b.imag); + PacketMul1Xd v3 = pmul(a.real, b.imag); + PacketMul1Xd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); } template <> EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketXcd& b, const PacketXcd& c) { - PacketXd v1 = pmadd(a.real, b.real, c.real); - PacketXd v2 = pmul(a.imag, b.imag); - PacketXd v3 = pmadd(a.real, b.imag, c.imag); - PacketXd v4 = pmul(a.imag, b.real); - return PacketXcd(psub(v1, v2), padd(v3, v4)); + PacketMul1Xd v1 = pmadd(a.real, b.real, c.real); + PacketMul1Xd v2 = pmul(a.imag, b.imag); + PacketMul1Xd v3 = pmadd(a.real, b.imag, c.imag); + PacketMul1Xd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); } template <> EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { PacketMask64 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); - PacketXd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + PacketMul1Xd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); return PacketXcd(res, res); } template <> EIGEN_STRONG_INLINE PacketXcd pand(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); + return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd por(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); + return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); + return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); + return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* from) { - vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); EIGEN_DEBUG_ALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) { - vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); } template <> EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { - PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + PacketMul1Xul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); real_idx = - __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, - 
unpacket_traits::size); - PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, + unpacket_traits::size); + PacketMul1Xul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... - return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); } template <> EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { - PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + PacketMul1Xul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); real_idx = - __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, - unpacket_traits::size); - PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, + unpacket_traits::size); + PacketMul1Xul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... - return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); } template <> @@ -614,14 +614,14 @@ EIGEN_STRONG_INLINE void pstoreu >(std::complex* to vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); } template <> EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, Index stride) { vfloat64m1x2_t res = - __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); + __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); } @@ -632,17 +632,17 @@ EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::com from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, - unpacket_traits::size); + unpacket_traits::size); } template <> EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcd& a) { - return std::complex(pfirst(a.real), pfirst(a.imag)); + return std::complex(pfirst(a.real), pfirst(a.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { - return PacketXcd(preverse(a.real), preverse(a.imag)); + return PacketXcd(preverse(a.real), 
preverse(a.imag)); } template <> @@ -652,33 +652,33 @@ EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { template <> EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { - return std::complex(predux(a.real), predux(a.imag)); + return std::complex(predux(a.real), predux(a.imag)); } template <> EIGEN_STRONG_INLINE PacketXcd pdiv(const PacketXcd& a, const PacketXcd& b) { PacketXcd b_conj = pconj(b); PacketXcd dividend = pmul(a, b_conj); - PacketXd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); - return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); + PacketMul1Xd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); } template EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double buffer_real[unpacket_traits::size * N]; - double buffer_imag[unpacket_traits::size * N]; + double buffer_real[unpacket_traits::size * N]; + double buffer_imag[unpacket_traits::size * N]; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits::size); - __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits::size); + __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i].real = - __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); kernel.packet[i].imag = - __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); } } diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h index a77496540..6bbf8fe72 100644 --- a/Eigen/src/Core/arch/RVV10/MathFunctions.h +++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h @@ -16,11 +16,11 @@ namespace Eigen { namespace internal { -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul1Xf) EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul2Xf) EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul4Xf) -EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul1Xd) EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul2Xd) EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul4Xd) diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index fcea6c08f..e12a1a4a1 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -48,8 +48,8 @@ typedef vbool8_t PacketMask8; typedef vbool4_t PacketMask4; /********************************* int32 **************************************/ -typedef eigen_packet_wrapper PacketXi; -typedef eigen_packet_wrapper PacketXu; +typedef eigen_packet_wrapper PacketMul1Xi; +typedef eigen_packet_wrapper PacketMul1Xu; typedef eigen_packet_wrapper PacketMul2Xi; @@ -61,10 +61,14 @@ typedef eigen_packet_wrapper PacketMul4Xu; +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef PacketMul1Xi PacketXi; +typedef PacketMul1Xu PacketXu; + template <> struct packet_traits : default_packet_traits { - typedef PacketXi type; - typedef PacketXi half; // Half not 
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
index fcea6c08f..e12a1a4a1 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -48,8 +48,8 @@ typedef vbool8_t PacketMask8;
 typedef vbool4_t PacketMask4;
 
 /********************************* int32 **************************************/
-typedef eigen_packet_wrapper PacketXi;
-typedef eigen_packet_wrapper PacketXu;
+typedef eigen_packet_wrapper PacketMul1Xi;
+typedef eigen_packet_wrapper PacketMul1Xu;
 
 typedef eigen_packet_wrapper PacketMul2Xi;
 
@@ -61,10 +61,14 @@ typedef eigen_packet_wrapper PacketMul4Xu;
 
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef PacketMul1Xi PacketXi;
+typedef PacketMul1Xu PacketXu;
+
 template <>
 struct packet_traits<numext::int32_t> : default_packet_traits {
-  typedef PacketXi type;
-  typedef PacketXi half;  // Half not implemented yet
+  typedef PacketMul1Xi type;
+  typedef PacketMul1Xi half;  // Half not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -87,10 +91,14 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef PacketMul2Xi PacketXi;
+typedef PacketMul2Xu PacketXu;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul2Xi type;
-  typedef PacketXi half;
+  typedef PacketMul1Xi half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -113,8 +121,12 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef PacketMul4Xi PacketXi;
+typedef PacketMul4Xu PacketXu;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul4Xi type;
   typedef PacketMul2Xi half;
   enum {
@@ -138,11 +150,12 @@ struct packet_traits : default_packet_traits {
     HasReduxp = 0
   };
 };
+#endif
 
 template <>
-struct unpacket_traits<PacketXi> {
+struct unpacket_traits<PacketMul1Xi> {
   typedef numext::int32_t type;
-  typedef PacketXi half;  // Half not yet implemented
+  typedef PacketMul1Xi half;  // Half not yet implemented
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
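For readers following the #if ladder above: EIGEN_RISCV64_DEFAULT_LMUL (presumably a user-configurable macro) decides which register grouping backs the generic PacketXi/PacketXu names, while the traits' `half` always steps down one LMUL level. Under the assumption that it is set to 2, for example:

    // EIGEN_RISCV64_DEFAULT_LMUL == 2 (assumed setting, for illustration):
    //   PacketXi == PacketMul2Xi   (one logical packet spans two vector registers)
    //   packet_traits<numext::int32_t>::half == PacketMul1Xi  (one register)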
@@ -156,7 +169,7 @@ struct unpacket_traits {
 template <>
 struct unpacket_traits<PacketMul2Xi> {
   typedef numext::int32_t type;
-  typedef PacketXi half;
+  typedef PacketMul1Xi half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -188,267 +201,267 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr)
 #endif
 }
 
-/********************************* PacketXi ************************************/
+/********************************* PacketMul1Xi ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
-  return __riscv_vmv_v_x_i32m1(from, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pset1<PacketMul1Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m1(from, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
-  PacketXi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size));
-  return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi plset<PacketMul1Xi>(const numext::int32_t& a) {
+  PacketMul1Xi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size));
+  return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) {
-  return __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pzero(const PacketMul1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vadd_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi padd<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vadd_vv_i32m1(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi psub<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pnegate(const PacketMul1Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
+EIGEN_STRONG_INLINE PacketMul1Xi pconj(const PacketMul1Xi& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pmul<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pdiv<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pmadd(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pmsub(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
-  return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pnmadd(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
-  return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pnmsub(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits<PacketMul1Xi>::size);
 }
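The vmadd/vnmsub pairings above are easy to misread, so here is the scalar arithmetic each integer FMA variant is meant to compute (reference semantics only; the helper name below is made up for this note):

    // pmadd(a, b, c)  = a * b + c      (__riscv_vmadd)
    // pmsub(a, b, c)  = a * b - c      (vmadd with c negated)
    // pnmadd(a, b, c) = c - a * b      (__riscv_vnmsub)
    // pnmsub(a, b, c) = -(a * b) - c   (vnmsub with c negated)
    inline numext::int32_t pmadd_ref(numext::int32_t a, numext::int32_t b, numext::int32_t c) {
      return a * b + c;  // one lane of pmadd
    }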
 template <>
-EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pmin<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pmax<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
-  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pcmp_le<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
-  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pcmp_lt<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits<PacketXi>::size);
-  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pcmp_eq<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
-  return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi ptrue<PacketMul1Xi>(const PacketMul1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vand_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pand<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vor_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi por<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vor_vv_i32m1(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vxor_vv_i32m1(a, b, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pxor<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vxor_vv_i32m1(a, b, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
-  return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits<PacketXi>::size),
-                               unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pandnot<PacketMul1Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits<PacketMul1Xi>::size),
+                               unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
-  return __riscv_vsra_vx_i32m1(a, N, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi parithmetic_shift_right(PacketMul1Xi a) {
+  return __riscv_vsra_vx_i32m1(a, N, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
+EIGEN_STRONG_INLINE PacketMul1Xi plogical_shift_right(PacketMul1Xi a) {
   return __riscv_vreinterpret_i32m1(
-      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits<PacketXi>::size));
+      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
-  return __riscv_vsll_vx_i32m1(a, N, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi plogical_shift_left(PacketMul1Xi a) {
+  return __riscv_vsll_vx_i32m1(a, N, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pload<PacketMul1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketMul1Xi>::size);
 }
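Note the contract the comparison kernels above implement: RVV compares produce mask registers, which are widened back into value lanes holding all-ones bits (0xffffffff) where the predicate holds and zero elsewhere, the encoding Eigen's generic select/blend paths expect. A scalar sketch of one lane (hypothetical helper, for illustration only):

    // One lane of pcmp_le(a, b):
    inline numext::int32_t pcmp_le_lane(numext::int32_t a, numext::int32_t b) {
      return (a <= b) ? numext::int32_t(0xffffffff) : numext::int32_t(0);
    }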
 template <>
-EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi ploadu<PacketMul1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
-  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size);
-  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketXi>::size), 1,
-                              unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi ploaddup<PacketMul1Xi>(const numext::int32_t* from) {
+  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketMul1Xi>::size), 1,
+                              unpacket_traits<PacketMul1Xi>::size);
   // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
-  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketXi>::size);
+  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
-  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size);
-  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketXi>::size);
-  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi ploadquad<PacketMul1Xi>(const numext::int32_t* from) {
+  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketMul1Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketMul1Xi& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
-  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketXi>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xi pgather<numext::int32_t, PacketMul1Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketMul1Xi>(numext::int32_t* to, const PacketMul1Xi& from,
                                                                   Index stride) {
-  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketXi>::size);
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketMul1Xi>(const PacketMul1Xi& a) {
   return __riscv_vmv_x_s_i32m1_i32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
-  PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXi>::size),
-                                        unpacket_traits<PacketXi>::size - 1, unpacket_traits<PacketXi>::size);
-  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi preverse(const PacketMul1Xi& a) {
+  PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size),
+                                            unpacket_traits<PacketMul1Xi>::size - 1, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
-  PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<PacketXi>::size);
-  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<PacketXi>::size), mask,
-                               unpacket_traits<PacketXi>::size);
+EIGEN_STRONG_INLINE PacketMul1Xi pabs(const PacketMul1Xi& a) {
+  PacketMul1Xi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<PacketMul1Xi>::size);
+  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<PacketMul1Xi>::size), mask,
+                               unpacket_traits<PacketMul1Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a) {
-  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketXi>::size),
-                                                      unpacket_traits<PacketXi>::size));
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketMul1Xi>(const PacketMul1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul1Xi>::size),
+                                                      unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul1Xi>(const PacketMul1Xi& a) {
   // Multiply the vector by its reverse
-  PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<PacketXi>::size);
-  PacketXi half_prod;
+  PacketMul1Xi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<PacketMul1Xi>::size);
+  PacketMul1Xi half_prod;
 
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<PacketXi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<PacketMul1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<PacketXi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<PacketMul1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<PacketXi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<PacketMul1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
  }
   // Last reduction
-  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<PacketXi>::size);
-  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketXi>::size);
+  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<PacketMul1Xi>::size);
+  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
 
   // The reduction is done to the first element.
   return pfirst(prod);
 }
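predux_mul has no single-instruction product reduction to lean on, so it multiplies the vector by its own reverse and then folds with log2-many slide-downs; the EIGEN_RISCV64_RVV_VL guards simply skip folds a narrower vector does not need. Walking through a hypothetical 128-bit vector (four int32 lanes):

    // a    = [a0, a1, a2, a3]
    // prod = a * reverse(a) = [a0*a3, a1*a2, a2*a1, a3*a0]
    // (the VL >= 256/512/1024 folds are skipped at this width)
    // slidedown by 1, multiply: lane 0 becomes (a0*a3) * (a1*a2) = a0*a1*a2*a3
    // pfirst then extracts the full product from lane 0.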
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketMul1Xi>(const PacketMul1Xi& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketXi>::size),
-      unpacket_traits<PacketXi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul1Xi>::size),
+      unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketMul1Xi>(const PacketMul1Xi& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketXi>::size),
-      unpacket_traits<PacketXi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul1Xi>::size),
+      unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
-  numext::int32_t buffer[unpacket_traits<PacketXi>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<PacketMul1Xi>::size * N] = {0};
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketXi>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul1Xi>::size);
   }
 
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<PacketXi>::size], unpacket_traits<PacketXi>::size);
+        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<PacketMul1Xi>::size], unpacket_traits<PacketMul1Xi>::size);
   }
 }
 
@@ -666,11 +679,11 @@ EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a)
 
 template <>
 EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul4Xi>(const PacketMul4Xi& a) {
-  PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
-                                         unpacket_traits<PacketXi>::size);
-  PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
-                                         unpacket_traits<PacketXi>::size);
-  return predux_mul<PacketXi>(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<PacketXi>::size));
+  PacketMul1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
+                                             unpacket_traits<PacketMul1Xi>::size);
+  PacketMul1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
+                                             unpacket_traits<PacketMul1Xi>::size);
+  return predux_mul<PacketMul1Xi>(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <>
@@ -915,8 +928,8 @@ EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a)
 
 template <>
 EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul2Xi>(const PacketMul2Xi& a) {
-  return predux_mul<PacketXi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
-                                                    unpacket_traits<PacketXi>::size));
+  return predux_mul<PacketMul1Xi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
+                                                        unpacket_traits<PacketMul1Xi>::size));
 }
 
 template <>
@@ -959,24 +972,27 @@ predux_half_dowto4(const PacketMul4Xi& a) {
 template <typename Packet>
 EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xi>::value && (unpacket_traits<PacketMul2Xi>::size % 8) == 0,
-                                            PacketXi>::type
+                                            PacketMul1Xi>::type
 predux_half_dowto4(const PacketMul2Xi& a) {
   return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
-                               unpacket_traits<PacketXi>::size);
+                               unpacket_traits<PacketMul1Xi>::size);
 }
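The ptranspose kernels round-trip through a stack buffer: packet i is stored with a stride of N elements starting at offset i, so contiguous reloads come back transposed. A 2x2 sketch with made-up lane values:

    // N = 2: packet[0] = [x0, x1], packet[1] = [y0, y1]
    // strided stores (stride 2 elements): buffer = [x0, y0, x1, y1]
    // contiguous reloads: packet[0] = [x0, y0], packet[1] = [x1, y1]  (transposed)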
 
 /********************************* float32 ************************************/
-typedef eigen_packet_wrapper PacketXf;
+typedef eigen_packet_wrapper PacketMul1Xf;
 
 typedef eigen_packet_wrapper PacketMul2Xf;
 
 typedef eigen_packet_wrapper PacketMul4Xf;
 
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef PacketMul1Xf PacketXf;
+
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef PacketXf type;
-  typedef PacketXf half;
+  typedef PacketMul1Xf type;
+  typedef PacketMul1Xf half;
 
   enum {
     Vectorizable = 1,
@@ -1012,10 +1028,13 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef PacketMul2Xf PacketXf;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul2Xf type;
-  typedef PacketXf half;
+  typedef PacketMul1Xf half;
 
   enum {
     Vectorizable = 1,
@@ -1051,8 +1070,11 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef PacketMul4Xf PacketXf;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul4Xf type;
   typedef PacketMul2Xf half;
 
@@ -1089,12 +1111,13 @@ struct packet_traits : default_packet_traits {
     HasErf = EIGEN_FAST_MATH
   };
 };
+#endif
 
 template <>
-struct unpacket_traits<PacketXf> {
+struct unpacket_traits<PacketMul1Xf> {
   typedef float type;
-  typedef PacketXf half;  // Half not yet implemented
-  typedef PacketXi integer_packet;
+  typedef PacketMul1Xf half;  // Half not yet implemented
+  typedef PacketMul1Xi integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask32 packet_mask;
 
@@ -1110,7 +1133,7 @@ struct unpacket_traits {
 template <>
 struct unpacket_traits<PacketMul2Xf> {
   typedef float type;
-  typedef PacketXf half;
+  typedef PacketMul1Xf half;
   typedef PacketMul2Xi integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask16 packet_mask;
@@ -1141,364 +1164,364 @@ struct unpacket_traits {
   };
 };
 
-/********************************* PacketXf ************************************/
+/********************************* PacketMul1Xf ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
-  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<PacketXf>::size));
+EIGEN_STRONG_INLINE PacketMul1Xf ptrue<PacketMul1Xf>(const PacketMul1Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) {
-  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pzero(const PacketMul1Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
-  return __riscv_vfabs_v_f32m1(a, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pabs(const PacketMul1Xf& a) {
+  return __riscv_vfabs_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
-  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pset1<PacketMul1Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
-  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<PacketXf>::size));
+EIGEN_STRONG_INLINE PacketMul1Xf pset1frombits<PacketMul1Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<PacketMul1Xf>::size));
 }
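pset1frombits broadcasts a raw bit pattern instead of a float value, which lets math kernels splat constants bit-exactly. For instance (value chosen for illustration):

    // 0x3f800000 is the IEEE-754 encoding of 1.0f, so every lane holds exactly 1.0f:
    PacketMul1Xf ones = pset1frombits<PacketMul1Xf>(numext::uint32_t(0x3f800000u));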
 template <>
-EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
-  PacketXf idx = __riscv_vfcvt_f_x_v_f32m1(
-      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size)),
-      unpacket_traits<PacketXf>::size);
-  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf plset<PacketMul1Xf>(const float& a) {
+  PacketMul1Xf idx = __riscv_vfcvt_f_x_v_f32m1(
+      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size)),
+      unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf padd<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf psub<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
-  return __riscv_vfneg_v_f32m1(a, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pnegate(const PacketMul1Xf& a) {
+  return __riscv_vfneg_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
+EIGEN_STRONG_INLINE PacketMul1Xf pconj(const PacketMul1Xf& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmul<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pdiv<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
-  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
+  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
-  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
+  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
-  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pnmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
+  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
-  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pnmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
+  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size);
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
-  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketXf>::size);
-  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmin<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketMul1Xf>::size);
  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul1Xf>::size);
 
-  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return pmin<PacketXf>(a, b);
+EIGEN_STRONG_INLINE PacketMul1Xf pmin<PropagateNaN, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return pmin<PacketMul1Xf>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmin<PropagateNumbers, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size);
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
-  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketXf>::size);
-  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmax<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketMul1Xf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul1Xf>::size);
 
-  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return pmax<PacketXf>(a, b);
+EIGEN_STRONG_INLINE PacketMul1Xf pmax<PropagateNaN, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return pmax<PacketMul1Xf>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pmax<PropagateNumbers, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
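The three min/max flavors differ only in NaN policy: the default and PropagateNaN versions use the masked (tumu) vfmin/vfmax so any lane with a NaN input stays NaN, while PropagateNumbers relies on vfmin/vfmax's native behavior of returning the non-NaN operand. A scalar reference for the propagating flavor (sketch only):

    inline float pmin_propagate_nan_ref(float a, float b) {
      if ((numext::isnan)(a) || (numext::isnan)(b)) return std::numeric_limits<float>::quiet_NaN();
      return (std::min)(a, b);
    }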
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pcmp_le<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pcmp_eq<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
-  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
-  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt_or_nan<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 // Logical Operations are not supported for float, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
+EIGEN_STRONG_INLINE PacketMul1Xf pand<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
+EIGEN_STRONG_INLINE PacketMul1Xf por<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
+EIGEN_STRONG_INLINE PacketMul1Xf pxor<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
+EIGEN_STRONG_INLINE PacketMul1Xf pandnot<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
       __riscv_vreinterpret_v_f32m1_u32m1(a),
-      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketXf>::size),
-      unpacket_traits<PacketXf>::size));
+      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size),
+      unpacket_traits<PacketMul1Xf>::size));
 }
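RVV has no bitwise instructions on floating-point registers, hence the reinterpret round trip above. Lane-wise it is equivalent to this scalar sketch:

    inline float pand_ref(float a, float b) {
      return numext::bit_cast<float>(numext::bit_cast<numext::uint32_t>(a) &
                                     numext::bit_cast<numext::uint32_t>(b));
    }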
 template <>
-EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pload<PacketMul1Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf ploadu<PacketMul1Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
-  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
-  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketXf>::size), 1,
-                              unpacket_traits<PacketXf>::size);
-  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf ploaddup<PacketMul1Xf>(const float* from) {
+  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketMul1Xf>::size), 1,
+                              unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
-  PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size);
-  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketXf>::size);
-  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf ploadquad<PacketMul1Xf>(const float* from) {
+  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketMul1Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketMul1Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
-  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<PacketXf>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xf pgather<float, PacketMul1Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
-  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketXf>::size);
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketMul1Xf>(float* to, const PacketMul1Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
+EIGEN_STRONG_INLINE float pfirst<PacketMul1Xf>(const PacketMul1Xf& a) {
   return __riscv_vfmv_f_s_f32m1_f32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) {
-  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf psqrt(const PacketMul1Xf& a) {
+  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf print<PacketXf>(const PacketXf& a) {
-  const PacketXf limit = pset1<PacketXf>(static_cast<float>(1 << 23));
-  const PacketXf abs_a = pabs(a);
+EIGEN_STRONG_INLINE PacketMul1Xf print<PacketMul1Xf>(const PacketMul1Xf& a) {
+  const PacketMul1Xf limit = pset1<PacketMul1Xf>(static_cast<float>(1 << 23));
+  const PacketMul1Xf abs_a = pabs(a);
 
-  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<PacketXf>::size);
-  const PacketXf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<PacketXf>::size);
-  const PacketXf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<PacketXf>::size),
-                                                   unpacket_traits<PacketXf>::size);
+  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
+  const PacketMul1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<PacketMul1Xf>::size);
+  const PacketMul1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<PacketMul1Xf>::size),
+                                                       unpacket_traits<PacketMul1Xf>::size);
 
-  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<PacketXf>::size);
-  PacketXf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<PacketXf>::size);
-  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<PacketXf>::size);
+  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<PacketMul1Xf>::size);
+  PacketMul1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
-  PacketXf tmp = print(a);
+EIGEN_STRONG_INLINE PacketMul1Xf pfloor<PacketMul1Xf>(const PacketMul1Xf& a) {
+  PacketMul1Xf tmp = print(a);
   // If greater, subtract one.
-  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<PacketXf>::size);
-  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketXf>::size);
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
-  PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketXf>::size),
-                                        unpacket_traits<PacketXf>::size - 1, unpacket_traits<PacketXf>::size);
-  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf preverse(const PacketMul1Xf& a) {
+  PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size),
+                                            unpacket_traits<PacketMul1Xf>::size - 1, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
+EIGEN_STRONG_INLINE PacketMul1Xf pfrexp<PacketMul1Xf>(const PacketMul1Xf& a, PacketMul1Xf& exponent) {
   return pfrexp_generic(a, exponent);
 }
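print rounds to integral via an int32 round trip (vfcvt honors the dynamic rounding mode, which is round-to-nearest-even under the default frm setting, an assumption worth keeping in mind), falls back to the input for |a| >= 2^23 (already integral) and for NaNs; pfloor then just corrects the cases print rounded up. Worked values, for illustration:

    // print(1.5f)  -> 2.0f and print(2.5f) -> 2.0f   (ties to even)
    // pfloor(1.5f):  tmp = print(1.5f) = 2.0f;  1.5f < 2.0f,  so result = 2.0f - 1.0f = 1.0f
    // pfloor(-1.5f): tmp = print(-1.5f) = -2.0f; -1.5f < -2.0f is false, result stays -2.0f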
 template <>
-EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
+EIGEN_STRONG_INLINE float predux<PacketMul1Xf>(const PacketMul1Xf& a) {
   return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(
-      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketXf>::size), unpacket_traits<PacketXf>::size));
+      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketMul1Xf>::size), unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+EIGEN_STRONG_INLINE float predux_mul<PacketMul1Xf>(const PacketMul1Xf& a) {
   // Multiply the vector by its reverse
-  PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<PacketXf>::size);
-  PacketXf half_prod;
+  PacketMul1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<PacketMul1Xf>::size);
+  PacketMul1Xf half_prod;
 
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<PacketXf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<PacketMul1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<PacketXf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<PacketMul1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<PacketXf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<PacketMul1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
   }
   // Last reduction
-  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<PacketXf>::size);
-  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketXf>::size);
+  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<PacketMul1Xf>::size);
+  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
 
   // The reduction is done to the first element.
   return pfirst(prod);
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
+EIGEN_STRONG_INLINE float predux_min<PacketMul1Xf>(const PacketMul1Xf& a) {
   return (
       std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(
                     a,
-                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size),
-                    unpacket_traits<PacketXf>::size)),
+                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size),
+                    unpacket_traits<PacketMul1Xf>::size)),
                 (std::numeric_limits<float>::max)());
 }
 
 template <>
-EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
+EIGEN_STRONG_INLINE float predux_max<PacketMul1Xf>(const PacketMul1Xf& a) {
   return (
       std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(
                     a,
-                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketXf>::size),
-                    unpacket_traits<PacketXf>::size)),
+                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size),
+                    unpacket_traits<PacketMul1Xf>::size)),
                 -(std::numeric_limits<float>::max)());
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
-  float buffer[unpacket_traits<PacketXf>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xf, N>& kernel) {
+  float buffer[unpacket_traits<PacketMul1Xf>::size * N];
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketXf>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketMul1Xf>::size);
   }
 
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<PacketXf>::size], unpacket_traits<PacketXf>::size);
+        __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<PacketMul1Xf>::size], unpacket_traits<PacketMul1Xf>::size);
   }
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
+EIGEN_STRONG_INLINE PacketMul1Xf pldexp<PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& exponent) {
   return pldexp_generic(a, exponent);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) {
-  return __riscv_vmor_mm_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vmor_mm_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) {
-  return __riscv_vmand_mm_b32(a, b, unpacket_traits<PacketXf>::size);
+  return __riscv_vmand_mm_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
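The *_mask/pselect entry points that follow keep comparisons in native vbool32 registers, avoiding the merge-to-all-ones round trip of the pcmp_* kernels. Intended usage, as a sketch (the helper name is hypothetical):

    // c[i] = (a[i] == b[i]) ? a[i] : b[i], without materializing an integer mask
    inline PacketMul1Xf select_equal(const PacketMul1Xf& a, const PacketMul1Xf& b) {
      PacketMask32 m = pcmp_eq_mask(a, b);
      return pselect(m, a, b);  // lanes where m is set come from a
    }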
-EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
-EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketXf& a, const PacketXf& b) {
-  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
 }
 
-EIGEN_STRONG_INLINE PacketXf pselect(const PacketMask32& mask, const PacketXf& a, const PacketXf& b) {
-  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<PacketXf>::size);
+EIGEN_STRONG_INLINE PacketMul1Xf pselect(const PacketMask32& mask, const PacketMul1Xf& a, const PacketMul1Xf& b) {
+  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<PacketMul1Xf>::size);
 }
 
 /********************************* PacketMul4Xf ************************************/
@@ -1784,11 +1807,11 @@ EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) {
 
 template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketMul4Xf>(const PacketMul4Xf& a) {
-  PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1),
-                                          unpacket_traits<PacketXf>::size);
-  PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3),
-                                          unpacket_traits<PacketXf>::size);
-  return predux_mul<PacketXf>(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits<PacketXf>::size));
+  PacketMul1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1),
+                                              unpacket_traits<PacketMul1Xf>::size);
+  PacketMul1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3),
+                                              unpacket_traits<PacketMul1Xf>::size);
+  return predux_mul<PacketMul1Xf>(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
@@ -2114,8 +2137,8 @@ EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) {
 
 template <>
 EIGEN_STRONG_INLINE float predux_mul<PacketMul2Xf>(const PacketMul2Xf& a) {
-  return predux_mul<PacketXf>(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
-                                                     unpacket_traits<PacketXf>::size));
+  return predux_mul<PacketMul1Xf>(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
+                                                         unpacket_traits<PacketMul1Xf>::size));
 }
 
 template <>
@@ -2170,16 +2193,16 @@ predux_half_dowto4(const PacketMul4Xf& a) {
 template <typename Packet>
 EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xf>::value && (unpacket_traits<PacketMul2Xf>::size % 8) == 0,
-                                            PacketXf>::type
+                                            PacketMul1Xf>::type
 predux_half_dowto4(const PacketMul2Xf& a) {
   return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
-                                unpacket_traits<PacketXf>::size);
+                                unpacket_traits<PacketMul1Xf>::size);
 }
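predux_half_dowto4 leans on the LMUL register grouping directly: an m2 packet is physically two m1 registers, so halving the accumulator is one vget per half plus a single add. Schematically, for eight float lanes:

    // a (m2) = [a0 a1 a2 a3 | a4 a5 a6 a7]
    // vget half 0 = [a0 a1 a2 a3], vget half 1 = [a4 a5 a6 a7]
    // result (m1) = [a0+a4, a1+a5, a2+a6, a3+a7]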
 
 /********************************* int64 **************************************/
-typedef eigen_packet_wrapper PacketXl;
-typedef eigen_packet_wrapper PacketXul;
+typedef eigen_packet_wrapper PacketMul1Xl;
+typedef eigen_packet_wrapper PacketMul1Xul;
 
 typedef eigen_packet_wrapper PacketMul2Xl;
 
@@ -2191,10 +2214,14 @@ typedef eigen_packet_wrapper PacketMul4Xul;
 
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef PacketMul1Xl PacketXl;
+typedef PacketMul1Xul PacketXul;
+
 template <>
 struct packet_traits<numext::int64_t> : default_packet_traits {
-  typedef PacketXl type;
-  typedef PacketXl half;  // Half not implemented yet
+  typedef PacketMul1Xl type;
+  typedef PacketMul1Xl half;  // Half not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -2217,10 +2244,14 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef PacketMul2Xl PacketXl;
+typedef PacketMul2Xul PacketXul;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul2Xl type;
-  typedef PacketXl half;
+  typedef PacketMul1Xl half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -2243,8 +2274,12 @@ struct packet_traits : default_packet_traits {
   };
 };
 
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef PacketMul4Xl PacketXl;
+typedef PacketMul4Xul PacketXul;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits : default_packet_traits {
   typedef PacketMul4Xl type;
   typedef PacketMul2Xl half;
   enum {
@@ -2268,11 +2303,12 @@ struct packet_traits : default_packet_traits {
     HasReduxp = 0
   };
 };
+#endif
 
 template <>
-struct unpacket_traits<PacketXl> {
+struct unpacket_traits<PacketMul1Xl> {
   typedef numext::int64_t type;
-  typedef PacketXl half;  // Half not yet implemented
+  typedef PacketMul1Xl half;  // Half not yet implemented
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -2286,7 +2322,7 @@ struct unpacket_traits {
 template <>
 struct unpacket_traits<PacketMul2Xl> {
   typedef numext::int64_t type;
-  typedef PacketXl half;
+  typedef PacketMul1Xl half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -2318,235 +2354,235 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr)
 #endif
 }
 
-/********************************* PacketXl ************************************/
+/********************************* PacketMul1Xl ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pset1<PacketXl>(const numext::int64_t& from) {
-  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pset1<PacketMul1Xl>(const numext::int64_t& from) {
+  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl plset<PacketXl>(const numext::int64_t& a) {
-  PacketXl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size));
-  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl plset<PacketMul1Xl>(const numext::int64_t& a) {
+  PacketMul1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size));
+  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pzero(const PacketXl& /*a*/) {
-  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pzero(const PacketMul1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl padd<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl padd<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl psub<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl psub<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vsub(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pnegate(const PacketXl& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pnegate(const PacketMul1Xl& a) {
+  return __riscv_vneg(a, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pconj(const PacketXl& a) {
+EIGEN_STRONG_INLINE PacketMul1Xl pconj(const PacketMul1Xl& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pmul<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pmul<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vmul(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pdiv<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pdiv<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pnmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) {
-  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pnmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pnmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) {
-  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pnmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pmin<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pmin<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pmax<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pmax<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pcmp_le<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<PacketXl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pcmp_le<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pcmp_lt<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<PacketXl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pcmp_lt<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pcmp_eq<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<PacketXl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pcmp_eq<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl ptrue<PacketXl>(const PacketXl& /*a*/) {
-  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl ptrue<PacketMul1Xl>(const PacketMul1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pand<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pand<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl por<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl por<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pxor<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pxor<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pandnot<PacketXl>(const PacketXl& a, const PacketXl& b) {
-  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<PacketXl>::size),
-                               unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pandnot<PacketMul1Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<PacketMul1Xl>::size),
+                               unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) {
-  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl parithmetic_shift_right(PacketMul1Xl a) {
+  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) {
+EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_right(PacketMul1Xl a) {
   return __riscv_vreinterpret_i64m1(
-      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<PacketXl>::size));
+      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<PacketMul1Xl>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketXl plogical_shift_left(PacketXl a) {
-  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_left(PacketMul1Xl a) {
+  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl pload<PacketXl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl pload<PacketMul1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl ploadu<PacketXl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl ploadu<PacketMul1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl ploaddup<PacketXl>(const numext::int64_t* from) {
-  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketXl>::size), 2,
-                              unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl ploaddup<PacketMul1Xl>(const numext::int64_t* from) {
+  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul1Xl>::size), 2,
+                              unpacket_traits<PacketMul1Xl>::size);
   // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
-  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketXl>::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl ploadquad<PacketXl>(const numext::int64_t* from) {
-  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketXl>::size), 1,
-                              unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl ploadquad<PacketMul1Xl>(const numext::int64_t* from) {
+  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul1Xl>::size), 1,
+                              unpacket_traits<PacketMul1Xl>::size);
   ;
-  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketXl>::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const PacketXl& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const PacketMul1Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const PacketXl& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const PacketMul1Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketXl pgather<numext::int64_t, PacketXl>(const numext::int64_t* from, Index stride) {
-  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<PacketXl>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xl pgather<numext::int64_t, PacketMul1Xl>(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, PacketXl>(numext::int64_t* to, const PacketXl& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, PacketMul1Xl>(numext::int64_t* to, const PacketMul1Xl& from,
                                                                   Index stride) {
-  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<PacketXl>::size);
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<PacketMul1Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t pfirst<PacketXl>(const PacketXl& a) {
+EIGEN_STRONG_INLINE numext::int64_t pfirst<PacketMul1Xl>(const PacketMul1Xl& a) {
   return __riscv_vmv_x_s_i64m1_i64(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) {
-  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXl>::size),
-                                         unpacket_traits<PacketXl>::size - 1, unpacket_traits<PacketXl>::size);
-  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<PacketXl>::size);
+EIGEN_STRONG_INLINE PacketMul1Xl preverse(const PacketMul1Xl& a) {
+  PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size),
+                                             unpacket_traits<PacketMul1Xl>::size - 1, unpacket_traits<PacketMul1Xl>::size);
+  return
__riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) { - PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pabs(const PacketMul1Xl& a) { + PacketMul1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux(const PacketXl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), - unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul1Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) { +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul1Xl& a) { // Multiply the vector by its reverse - PacketXl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size); - PacketXl half_prod; + PacketMul1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size); + PacketMul1Xl half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); } // The reduction is done to the first element. 
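The predux_mul kernels above fold the register in log2 steps: the vector is first multiplied by its own reverse, then repeatedly halved with vslidedown until the product sits in lane 0. A scalar model of that folding (plain C++, not the patch's intrinsics; assumes a power-of-two length):

#include <cstddef>
#include <cstdint>
#include <vector>

// Model of the slide-down folding used by predux_mul: each step multiplies
// lane i by lane i + half, halving the active length until lane 0 holds the
// full product (pfirst then extracts it).
inline std::int64_t fold_mul(std::vector<std::int64_t> v) {
  for (std::size_t half = v.size() / 2; half > 0; half /= 2) {
    for (std::size_t i = 0; i < half; ++i) v[i] *= v[i + half];  // vslidedown + vmul
  }
  return v[0];
}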
@@ -2554,30 +2590,30 @@ EIGEN_STRONG_INLINE numext::int64_t predux_mul<PacketXl>(const PacketXl& a) {
 }

 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_min<PacketXl>(const PacketXl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_min<PacketMul1Xl>(const PacketMul1Xl& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<PacketXl>::size),
-      unpacket_traits<PacketXl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<PacketMul1Xl>::size),
+      unpacket_traits<PacketMul1Xl>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_max<PacketXl>(const PacketXl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_max<PacketMul1Xl>(const PacketMul1Xl& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<PacketXl>::size),
-      unpacket_traits<PacketXl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<PacketMul1Xl>::size),
+      unpacket_traits<PacketMul1Xl>::size));
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXl, N>& kernel) {
-  numext::int64_t buffer[unpacket_traits<PacketXl>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xl, N>& kernel) {
+  numext::int64_t buffer[unpacket_traits<PacketMul1Xl>::size * N] = {0};
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<PacketXl>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<PacketMul1Xl>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits<PacketXl>::size], unpacket_traits<PacketXl>::size);
+        __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits<PacketMul1Xl>::size], unpacket_traits<PacketMul1Xl>::size);
   }
 }

@@ -2796,11 +2832,11 @@ EIGEN_STRONG_INLINE numext::int64_t predux<PacketMul4Xl>(const PacketMul4Xl& a)

 template <>
 EIGEN_STRONG_INLINE numext::int64_t predux_mul<PacketMul4Xl>(const PacketMul4Xl& a) {
-  PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1),
-                                         unpacket_traits<PacketXl>::size);
-  PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3),
-                                         unpacket_traits<PacketXl>::size);
-  return predux_mul<PacketXl>(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits<PacketXl>::size));
+  PacketMul1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1),
+                                             unpacket_traits<PacketMul1Xl>::size);
+  PacketMul1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3),
+                                             unpacket_traits<PacketMul1Xl>::size);
+  return predux_mul<PacketMul1Xl>(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits<PacketMul1Xl>::size));
 }

 template <>
@@ -3046,8 +3082,8 @@ EIGEN_STRONG_INLINE numext::int64_t predux<PacketMul2Xl>(const PacketMul2Xl& a)

 template <>
 EIGEN_STRONG_INLINE numext::int64_t predux_mul<PacketMul2Xl>(const PacketMul2Xl& a) {
-  return predux_mul<PacketXl>(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1),
-                                                    unpacket_traits<PacketXl>::size));
+  return predux_mul<PacketMul1Xl>(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1),
+                                                        unpacket_traits<PacketMul1Xl>::size));
 }

 template <>
@@ -3090,24 +3126,27 @@ predux_half_dowto4(const PacketMul4Xl& a) {

 template <typename Packet>
 EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xl>::value && (unpacket_traits<Packet>::size % 8) == 0,
-                                            PacketXl>::type
+                                            PacketMul1Xl>::type
 predux_half_dowto4(const PacketMul2Xl& a) {
   return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1),
-                               unpacket_traits<PacketXl>::size);
+                               unpacket_traits<PacketMul1Xl>::size);
 }

 /********************************* double ************************************/

-typedef eigen_packet_wrapper<vfloat64m1_t> PacketXd;
+typedef eigen_packet_wrapper<vfloat64m1_t> PacketMul1Xd;
 typedef eigen_packet_wrapper<vfloat64m2_t> PacketMul2Xd;
 typedef eigen_packet_wrapper<vfloat64m4_t> PacketMul4Xd;

+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef PacketMul1Xd PacketXd;
+
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef PacketXd type;
-  typedef PacketXd half;
+  typedef PacketMul1Xd type;
+  typedef PacketMul1Xd half;

   enum {
     Vectorizable = 1,
@@ -3139,10 +3178,13 @@ struct packet_traits<double> : default_packet_traits {
   };
 };

+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef PacketMul2Xd PacketXd;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits<double> : default_packet_traits {
   typedef PacketMul2Xd type;
-  typedef PacketXd half;
+  typedef PacketMul1Xd half;

   enum {
     Vectorizable = 1,
@@ -3174,8 +3216,11 @@ struct packet_traits<double> : default_packet_traits {
   };
 };

+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef PacketMul4Xd PacketXd;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits<double> : default_packet_traits {
   typedef PacketMul4Xd type;
   typedef PacketMul2Xd half;

@@ -3208,12 +3253,13 @@ struct packet_traits<double> : default_packet_traits {
     HasSqrt = 1
   };
 };
+#endif

 template <>
-struct unpacket_traits<PacketXd> {
+struct unpacket_traits<PacketMul1Xd> {
   typedef double type;
-  typedef PacketXd half;  // Half not yet implemented
-  typedef PacketXl integer_packet;
+  typedef PacketMul1Xd half;  // Half not yet implemented
+  typedef PacketMul1Xl integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask64 packet_mask;

@@ -3229,7 +3275,7 @@ struct unpacket_traits<PacketXd> {
 template <>
 struct unpacket_traits<PacketMul2Xd> {
   typedef double type;
-  typedef PacketXd half;
+  typedef PacketMul1Xd half;
   typedef PacketMul2Xl integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask32 packet_mask;
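EIGEN_RISCV64_DEFAULT_LMUL above decides which register-group width (m1/m2/m4) backs the default PacketXd, with the `half` typedef always pointing one LMUL level down. A compile-time sketch of that selection idiom (illustrative stand-in types, not the patch's actual trait fields):

// Sketch: pick the default packet type by LMUL at compile time.
struct Mul1 {};  // stands in for an m1 register group
struct Mul2 {};  // m2
struct Mul4 {};  // m4

template <int LMUL> struct default_packet;  // primary: intentionally undefined
template <> struct default_packet<1> { using type = Mul1; using half = Mul1; };
template <> struct default_packet<2> { using type = Mul2; using half = Mul1; };
template <> struct default_packet<4> { using type = Mul4; using half = Mul2; };

// e.g. building with the equivalent of -DEIGEN_RISCV64_DEFAULT_LMUL=2:
using PacketXd_like = default_packet<2>::type;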
@@ -3260,297 +3306,297 @@ struct unpacket_traits<PacketMul2Xd> {
   };
 };

-/********************************* PacketXd ************************************/
+/********************************* PacketMul1Xd ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketXd ptrue<PacketXd>(const PacketXd& /*a*/) {
-  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<PacketXd>::size));
+EIGEN_STRONG_INLINE PacketMul1Xd ptrue<PacketMul1Xd>(const PacketMul1Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pzero(const PacketXd& /*a*/) {
-  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pzero(const PacketMul1Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pabs(const PacketXd& a) {
-  return __riscv_vfabs_v_f64m1(a, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pabs(const PacketMul1Xd& a) {
+  return __riscv_vfabs_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pset1<PacketXd>(const double& from) {
-  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pset1<PacketMul1Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pset1frombits<PacketXd>(numext::uint64_t from) {
-  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<PacketXd>::size));
+EIGEN_STRONG_INLINE PacketMul1Xd pset1frombits<PacketMul1Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd plset<PacketXd>(const double& a) {
-  PacketXd idx = __riscv_vfcvt_f_x_v_f64m1(
-      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size)),
-      unpacket_traits<PacketXd>::size);
-  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd plset<PacketMul1Xd>(const double& a) {
+  PacketMul1Xd idx = __riscv_vfcvt_f_x_v_f64m1(
+      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size)),
+      unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd padd<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd padd<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd psub<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd psub<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pnegate(const PacketXd& a) {
-  return __riscv_vfneg_v_f64m1(a, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pnegate(const PacketMul1Xd& a) {
+  return __riscv_vfneg_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pconj(const PacketXd& a) {
+EIGEN_STRONG_INLINE PacketMul1Xd pconj(const PacketMul1Xd& a) {
   return a;
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmul<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmul<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pdiv<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pdiv<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) {
-  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
+  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) {
-  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
+  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pnmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) {
-  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pnmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
+  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pnmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) {
-  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pnmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
+  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmin<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketXd>::size);
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketXd>::size);
-  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketXd>::size);
-  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmin<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketMul1Xd>::size);
  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketMul1Xd>::size);

-  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketXd>::size);
+  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmin<PropagateNaN, PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return pmin<PacketXd>(a, b);
+EIGEN_STRONG_INLINE PacketMul1Xd pmin<PropagateNaN, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return pmin<PacketMul1Xd>(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmin<PropagateNumbers, PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmin<PropagateNumbers, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmax<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketXd>::size);
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketXd>::size);
-  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketXd>::size);
-  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmax<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketMul1Xd>::size);
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketMul1Xd>::size);

-  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketXd>::size);
+  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmax<PropagateNaN, PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return pmax<PacketXd>(a, b);
+EIGEN_STRONG_INLINE PacketMul1Xd pmax<PropagateNaN, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return pmax<PacketMul1Xd>(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pmax<PropagateNumbers, PacketXd>(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pmax<PropagateNumbers, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pcmp_le<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pcmp_le<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pcmp_lt<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pcmp_eq<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pcmp_eq<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan<PacketXd>(const PacketXd& a, const PacketXd& b) {
-  PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
-  return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt_or_nan<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits<PacketMul1Xd>::size);
 }

 // Logical Operations are not supported for double, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketXd pand<PacketXd>(const PacketXd& a, const PacketXd& b) {
+EIGEN_STRONG_INLINE PacketMul1Xd pand<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
   return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
-      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketXd>::size));
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd por<PacketXd>(const PacketXd& a, const PacketXd& b) {
+EIGEN_STRONG_INLINE PacketMul1Xd por<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
   return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1(
-      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketXd>::size));
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pxor<PacketXd>(const PacketXd& a, const PacketXd& b) {
+EIGEN_STRONG_INLINE PacketMul1Xd pxor<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
   return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1(
-      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketXd>::size));
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pandnot<PacketXd>(const PacketXd& a, const PacketXd& b) {
+EIGEN_STRONG_INLINE PacketMul1Xd pandnot<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
   return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
       __riscv_vreinterpret_v_f64m1_u64m1(a),
-      __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketXd>::size),
-      unpacket_traits<PacketXd>::size));
+      __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<PacketMul1Xd>::size),
+      unpacket_traits<PacketMul1Xd>::size));
 }
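Doubles have no native bitwise instructions, so pand/por/pxor/pandnot above round-trip through the u64 view with vreinterpret. The same trick on scalars (portable C++20 sketch, not the patch's code):

#include <bit>
#include <cstdint>

// Bitwise AND of two doubles: reinterpret to integer bits, operate, cast back.
inline double double_and(double a, double b) {
  return std::bit_cast<double>(std::bit_cast<std::uint64_t>(a) &
                               std::bit_cast<std::uint64_t>(b));
}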

 template <>
-EIGEN_STRONG_INLINE PacketXd pload<PacketXd>(const double* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pload<PacketMul1Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd ploadu<PacketXd>(const double* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd ploadu<PacketMul1Xd>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd ploaddup<PacketXd>(const double* from) {
-  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketXd>::size), 2,
-                              unpacket_traits<PacketXd>::size);
-  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd ploaddup<PacketMul1Xd>(const double* from) {
+  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul1Xd>::size), 2,
+                              unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd ploadquad<PacketXd>(const double* from) {
-  PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketXd>::size), 1,
-                              unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd ploadquad<PacketMul1Xd>(const double* from) {
+  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul1Xd>::size), 1,
+                              unpacket_traits<PacketMul1Xd>::size);
   ;
-  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketXd>::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstore(double* to, const PacketXd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul1Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketXd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul1Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketXd pgather<double, PacketXd>(const double* from, Index stride) {
-  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<PacketXd>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xd pgather<double, PacketMul1Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, PacketXd>(double* to, const PacketXd& from, Index stride) {
-  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketXd>::size);
+EIGEN_DEVICE_FUNC inline void pscatter<double, PacketMul1Xd>(double* to, const PacketMul1Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE double pfirst<PacketXd>(const PacketXd& a) {
+EIGEN_STRONG_INLINE double pfirst<PacketMul1Xd>(const PacketMul1Xd& a) {
   return __riscv_vfmv_f_s_f64m1_f64(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) {
-  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd psqrt(const PacketMul1Xd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
 }
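ploaddup and ploadquad above synthesize a byte-offset vector with vid/vand/vsll and then do an indexed load (vloxei64), so each source element lands in two or four adjacent lanes. The offset arithmetic, modeled on scalars (hypothetical helper names):

#include <cstdint>

// ploaddup: lane i reads element i/2, i.e. byte offset
// (i & ~1) << 2 == (i/2) * sizeof(double) for 8-byte lanes.
inline std::uint64_t dup_offset(std::uint64_t lane) { return (lane & ~std::uint64_t(1)) << 2; }

// ploadquad: lane i reads element i/4, i.e. (i & ~3) << 1 == (i/4) * 8.
inline std::uint64_t quad_offset(std::uint64_t lane) { return (lane & ~std::uint64_t(3)) << 1; }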

 template <>
-EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) {
-  const PacketXd limit = pset1<PacketXd>(static_cast<double>(1ull << 52));
-  const PacketXd abs_a = pabs(a);
+EIGEN_STRONG_INLINE PacketMul1Xd print(const PacketMul1Xd& a) {
+  const PacketMul1Xd limit = pset1<PacketMul1Xd>(static_cast<double>(1ull << 52));
+  const PacketMul1Xd abs_a = pabs(a);

-  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<PacketXd>::size);
-  const PacketXd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<PacketXd>::size);
-  const PacketXd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<PacketXd>::size),
-                                                   unpacket_traits<PacketXd>::size);
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
+  const PacketMul1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<PacketMul1Xd>::size);
+  const PacketMul1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<PacketMul1Xd>::size),
+                                                       unpacket_traits<PacketMul1Xd>::size);

-  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<PacketXd>::size);
-  PacketXd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<PacketXd>::size);
-  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<PacketXd>::size);
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<PacketMul1Xd>::size);
+  PacketMul1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) {
-  PacketXd tmp = print(a);
+EIGEN_STRONG_INLINE PacketMul1Xd pfloor(const PacketMul1Xd& a) {
+  PacketMul1Xd tmp = print(a);
   // If greater, subtract one.
-  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<PacketXd>::size);
-  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketXd>::size);
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
-  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketXd>::size),
-                                         unpacket_traits<PacketXd>::size - 1, unpacket_traits<PacketXd>::size);
-  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd preverse(const PacketMul1Xd& a) {
+  PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size),
+                                             unpacket_traits<PacketMul1Xd>::size - 1, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) {
+EIGEN_STRONG_INLINE PacketMul1Xd pfrexp(const PacketMul1Xd& a, PacketMul1Xd& exponent) {
   return pfrexp_generic(a, exponent);
 }

 template <>
-EIGEN_STRONG_INLINE double predux<PacketXd>(const PacketXd& a) {
+EIGEN_STRONG_INLINE double predux<PacketMul1Xd>(const PacketMul1Xd& a) {
   return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
-      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketXd>::size), unpacket_traits<PacketXd>::size));
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul1Xd>::size), unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
+EIGEN_STRONG_INLINE double predux_mul<PacketMul1Xd>(const PacketMul1Xd& a) {
   // Multiply the vector by its reverse
-  PacketXd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<PacketXd>::size);
-  PacketXd half_prod;
+  PacketMul1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<PacketMul1Xd>::size);
+  PacketMul1Xd half_prod;

   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<PacketXd>::size);
-    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<PacketXd>::size);
-    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<PacketXd>::size);
-    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketXd>::size);
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
   }

   // The reduction is done to the first element.
@@ -3558,70 +3604,70 @@ EIGEN_STRONG_INLINE double predux_mul<PacketXd>(const PacketXd& a) {
 }

 template <>
-EIGEN_STRONG_INLINE double predux_min<PacketXd>(const PacketXd& a) {
+EIGEN_STRONG_INLINE double predux_min<PacketMul1Xd>(const PacketMul1Xd& a) {
   return (
       std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
                     a,
-                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketXd>::size),
-                    unpacket_traits<PacketXd>::size)),
+                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size),
+                    unpacket_traits<PacketMul1Xd>::size)),
                 (std::numeric_limits<double>::max)());
 }

 template <>
-EIGEN_STRONG_INLINE double predux_max<PacketXd>(const PacketXd& a) {
+EIGEN_STRONG_INLINE double predux_max<PacketMul1Xd>(const PacketMul1Xd& a) {
   return (
       std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
                     a,
-                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketXd>::size),
-                    unpacket_traits<PacketXd>::size)),
+                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size),
+                    unpacket_traits<PacketMul1Xd>::size)),
                 -(std::numeric_limits<double>::max)());
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXd, N>& kernel) {
-  double buffer[unpacket_traits<PacketXd>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xd, N>& kernel) {
+  double buffer[unpacket_traits<PacketMul1Xd>::size * N];
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketXd>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul1Xd>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<PacketXd>::size], unpacket_traits<PacketXd>::size);
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<PacketMul1Xd>::size], unpacket_traits<PacketMul1Xd>::size);
   }
 }

 template <>
-EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) {
+EIGEN_STRONG_INLINE PacketMul1Xd pldexp(const PacketMul1Xd& a, const PacketMul1Xd& exponent) {
   return pldexp_generic(a, exponent);
 }

 template <>
 EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketXd>::size);
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
 EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketXd>::size);
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

 template <>
 EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmand_mm_b64(a, b, unpacket_traits<PacketXd>::size);
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
 }
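The PacketMask64 helpers above lower directly to RVV mask-register instructions. Note that pandnot on masks is written with __riscv_vmor_mm_b64 in both the removed and the added line; that reads like an inherited oversight, since an and-not (e.g. vmandn) would match the name. Expected semantics, modeled on one bit per lane (an assumption about intent, not what the patch executes):

#include <cstdint>

// One bit per lane: what the three mask helpers are expected to compute.
inline std::uint64_t mask_or(std::uint64_t a, std::uint64_t b) { return a | b; }
inline std::uint64_t mask_and(std::uint64_t a, std::uint64_t b) { return a & b; }
inline std::uint64_t mask_andnot(std::uint64_t a, std::uint64_t b) { return a & ~b; }  // vmandn-style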

-EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

-EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketXd& a, const PacketXd& b) {
-  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
 }

-EIGEN_STRONG_INLINE PacketXd pselect(const PacketMask64& mask, const PacketXd& a, const PacketXd& b) {
-  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<PacketXd>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pselect(const PacketMask64& mask, const PacketMul1Xd& a, const PacketMul1Xd& b) {
+  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<PacketMul1Xd>::size);
 }

 /********************************* PacketMul4Xd ************************************/

@@ -3908,11 +3954,11 @@ EIGEN_STRONG_INLINE double predux<PacketMul4Xd>(const PacketMul4Xd& a) {

 template <>
 EIGEN_STRONG_INLINE double predux_mul<PacketMul4Xd>(const PacketMul4Xd& a) {
-  PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
-                                          unpacket_traits<PacketXd>::size);
-  PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
-                                          unpacket_traits<PacketXd>::size);
-  return predux_mul<PacketXd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<PacketXd>::size));
+  PacketMul1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
+                                              unpacket_traits<PacketMul1Xd>::size);
+  PacketMul1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
+                                              unpacket_traits<PacketMul1Xd>::size);
+  return predux_mul<PacketMul1Xd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
@@ -4239,8 +4285,8 @@ EIGEN_STRONG_INLINE double predux<PacketMul2Xd>(const PacketMul2Xd& a) {

 template <>
 EIGEN_STRONG_INLINE double predux_mul<PacketMul2Xd>(const PacketMul2Xd& a) {
-  return predux_mul<PacketXd>(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
-                                                     unpacket_traits<PacketXd>::size));
+  return predux_mul<PacketMul1Xd>(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
+                                                         unpacket_traits<PacketMul1Xd>::size));
 }

 template <>
@@ -4295,16 +4341,16 @@ predux_half_dowto4(const PacketMul4Xd& a) {

 template <typename Packet>
 EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xd>::value && (unpacket_traits<Packet>::size % 8) == 0,
-                                            PacketXd>::type
+                                            PacketMul1Xd>::type
 predux_half_dowto4(const PacketMul2Xd& a) {
   return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
-                                unpacket_traits<PacketXd>::size);
+                                unpacket_traits<PacketMul1Xd>::size);
 }

 /********************************* short **************************************/

-typedef eigen_packet_wrapper<vint16m1_t> PacketXs;
-typedef eigen_packet_wrapper<vuint16m1_t> PacketXsu;
+typedef eigen_packet_wrapper<vint16m1_t> PacketMul1Xs;
+typedef eigen_packet_wrapper<vuint16m1_t> PacketMul1Xsu;

 typedef eigen_packet_wrapper<vint16m2_t> PacketMul2Xs;
@@ -4316,10 +4362,14 @@ typedef eigen_packet_wrapper<vuint16m4_t> PacketMul4Xsu;

+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef PacketMul1Xs PacketXs;
+typedef PacketMul1Xsu PacketXsu;
+
 template <>
 struct packet_traits<numext::int16_t> : default_packet_traits {
-  typedef PacketXs type;
-  typedef PacketXs half;  // Half not implemented yet
+  typedef PacketMul1Xs type;
+  typedef PacketMul1Xs half;  // Half not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -4342,10 +4392,14 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
   };
 };

+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef PacketMul2Xs PacketXs;
+typedef PacketMul2Xsu PacketXsu;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits<numext::int16_t> : default_packet_traits {
   typedef PacketMul2Xs type;
-  typedef PacketXs half;
+  typedef PacketMul1Xs half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -4368,8 +4422,12 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
   };
 };

+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef PacketMul4Xs PacketXs;
+typedef PacketMul4Xsu PacketXsu;
+
 template <>
-struct packet_traits : default_packet_traits {
+struct packet_traits<numext::int16_t> : default_packet_traits {
   typedef PacketMul4Xs type;
   typedef PacketMul2Xs half;
   enum {
@@ -4393,11 +4451,12 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
     HasReduxp = 0
   };
 };
+#endif

 template <>
-struct unpacket_traits<PacketXs> {
+struct unpacket_traits<PacketMul1Xs> {
   typedef numext::int16_t type;
-  typedef PacketXs half;  // Half not yet implemented
+  typedef PacketMul1Xs half;  // Half not yet implemented
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -4411,7 +4470,7 @@ struct unpacket_traits<PacketXs> {
 template <>
 struct unpacket_traits<PacketMul2Xs> {
   typedef numext::int16_t type;
-  typedef PacketXs half;
+  typedef PacketMul1Xs half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -4443,270 +4502,270 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr)
 #endif
 }

-/********************************* PacketXs ************************************/
+/********************************* PacketMul1Xs ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketXs pset1<PacketXs>(const numext::int16_t& from) {
-  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pset1<PacketMul1Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs plset<PacketXs>(const numext::int16_t& a) {
-  PacketXs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size));
-  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs plset<PacketMul1Xs>(const numext::int16_t& a) {
+  PacketMul1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size));
+  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pzero(const PacketXs& /*a*/) {
-  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pzero(const PacketMul1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs padd<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs padd<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs psub<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs psub<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pnegate(const PacketXs& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pnegate(const PacketMul1Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pconj(const PacketXs& a) {
+EIGEN_STRONG_INLINE PacketMul1Xs pconj(const PacketMul1Xs& a) {
   return a;
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pmul<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pmul<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pdiv<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pdiv<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pnmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
-  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pnmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pnmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) {
-  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pnmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pmin<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pmin<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pmax<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pmax<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pcmp_le<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pcmp_le<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pcmp_lt<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pcmp_lt<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pcmp_eq<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<PacketXs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pcmp_eq<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs ptrue<PacketXs>(const PacketXs& /*a*/) {
-  return __riscv_vmv_v_x_i16m1(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs ptrue<PacketMul1Xs>(const PacketMul1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pand<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pand<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs por<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs por<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pxor<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pxor<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pandnot<PacketXs>(const PacketXs& a, const PacketXs& b) {
-  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<PacketXs>::size),
-                               unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pandnot<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<PacketMul1Xs>::size),
+                               unpacket_traits<PacketMul1Xs>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) {
-  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs parithmetic_shift_right(PacketMul1Xs a) {
+  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<PacketMul1Xs>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) {
+EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_right(PacketMul1Xs a) {
   return __riscv_vreinterpret_i16m1(
-      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<PacketXs>::size));
+      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<PacketMul1Xs>::size));
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketXs plogical_shift_left(PacketXs a) {
-  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_left(PacketMul1Xs a) {
+  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<PacketMul1Xs>::size);
 }
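Eigen expects pcmp_* to return all-ones/all-zero lanes rather than a mask register, so the integer comparisons above compute a vbool with vms* and then splat 0xffff into the true lanes via vmerge_vxm. Per-lane model (sketch, not the intrinsic code):

#include <cstdint>

// pcmp_le-style lane result: all ones where the predicate holds, zero elsewhere.
inline std::int16_t lane_cmp_le(std::int16_t a, std::int16_t b) {
  return (a <= b) ? static_cast<std::int16_t>(0xffff) : std::int16_t(0);
}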

 template <>
-EIGEN_STRONG_INLINE PacketXs pload<PacketXs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pload<PacketMul1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs ploadu<PacketXs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs ploadu<PacketMul1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs ploaddup<PacketXs>(const numext::int16_t* from) {
-  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size);
-  idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs ploaddup<PacketMul1Xs>(const numext::int16_t* from) {
+  PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size);
+  idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits<PacketMul1Xs>::size);
   // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
-  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketXs>::size);
+  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs ploadquad<PacketXs>(const numext::int16_t* from) {
-  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size);
-  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketXs>::size), 1,
-                              unpacket_traits<PacketXs>::size);
-  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs ploadquad<PacketMul1Xs>(const numext::int16_t* from) {
+  PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size);
+  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketMul1Xs>::size), 1,
+                              unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketXs& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul1Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketXs& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul1Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketXs pgather<numext::int16_t, PacketXs>(const numext::int16_t* from, Index stride) {
-  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketXs>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xs pgather<numext::int16_t, PacketMul1Xs>(const numext::int16_t* from, Index stride) {
+  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketXs>(numext::int16_t* to, const PacketXs& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketMul1Xs>(numext::int16_t* to, const PacketMul1Xs& from,
                                                                   Index stride) {
-  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketXs>::size);
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketXs>(const PacketXs& a) {
+EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketMul1Xs>(const PacketMul1Xs& a) {
   return __riscv_vmv_x_s_i16m1_i16(a);
 }
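pgather/pscatter above map to strided loads and stores (vlse16/vsse16), with the element stride scaled to bytes at the call site. Equivalent scalar loops (sketch; n stands for the packet size):

#include <cstddef>
#include <cstdint>

// pgather: read n elements spaced `stride` elements apart.
inline void gather_i16(const std::int16_t* from, std::ptrdiff_t stride, std::int16_t* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) out[i] = from[i * stride];
}

// pscatter: the store-side mirror.
inline void scatter_i16(std::int16_t* to, std::ptrdiff_t stride, const std::int16_t* in, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) to[i * stride] = in[i];
}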

 template <>
-EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) {
-  PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXs>::size),
-                                         unpacket_traits<PacketXs>::size - 1, unpacket_traits<PacketXs>::size);
-  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs preverse(const PacketMul1Xs& a) {
+  PacketMul1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size),
+                                             unpacket_traits<PacketMul1Xs>::size - 1, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) {
-  PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<PacketXs>::size);
-  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<PacketXs>::size), mask,
-                               unpacket_traits<PacketXs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xs pabs(const PacketMul1Xs& a) {
+  PacketMul1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<PacketMul1Xs>::size), mask,
+                               unpacket_traits<PacketMul1Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux<PacketXs>(const PacketXs& a) {
-  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketXs>::size),
-                                                      unpacket_traits<PacketXs>::size));
+EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul1Xs>(const PacketMul1Xs& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul1Xs>::size),
+                                                      unpacket_traits<PacketMul1Xs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketXs>(const PacketXs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul1Xs>(const PacketMul1Xs& a) {
   // Multiply the vector by its reverse
-  PacketXs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<PacketXs>::size);
-  PacketXs half_prod;
+  PacketMul1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<PacketMul1Xs>::size);
+  PacketMul1Xs half_prod;

   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<PacketXs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<PacketMul1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<PacketXs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<PacketMul1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<PacketXs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<PacketMul1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
   }

   // Last reduction
-  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<PacketXs>::size);
-  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<PacketMul1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);

-  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<PacketXs>::size);
-  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketXs>::size);
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<PacketMul1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);

   // The reduction is done to the first element.
   return pfirst(prod);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketXs>(const PacketXs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketMul1Xs>(const PacketMul1Xs& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketXs>::size),
-      unpacket_traits<PacketXs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul1Xs>::size),
+      unpacket_traits<PacketMul1Xs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketXs>(const PacketXs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul1Xs>(const PacketMul1Xs& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketXs>::size),
-      unpacket_traits<PacketXs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul1Xs>::size),
+      unpacket_traits<PacketMul1Xs>::size));
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXs, N>& kernel) {
-  numext::int16_t buffer[unpacket_traits<PacketXs>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<PacketMul1Xs>::size * N] = {0};
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketXs>::size);
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul1Xs>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<PacketXs>::size], unpacket_traits<PacketXs>::size);
+        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<PacketMul1Xs>::size], unpacket_traits<PacketMul1Xs>::size);
   }
 }

@@ -4924,11 +4983,11 @@ EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul4Xs>(const PacketMul4Xs& a)

 template <>
 EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul4Xs>(const PacketMul4Xs& a) {
-  PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
-                                         unpacket_traits<PacketXs>::size);
-  PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
-                                         unpacket_traits<PacketXs>::size);
-  return predux_mul<PacketXs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<PacketXs>::size));
+  PacketMul1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
+                                             unpacket_traits<PacketMul1Xs>::size);
+  PacketMul1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
+                                             unpacket_traits<PacketMul1Xs>::size);
+  return predux_mul<PacketMul1Xs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<PacketMul1Xs>::size));
 }

 template <>
@@ -5173,8 +5232,8 @@ EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul2Xs>(const PacketMul2Xs& a)

 template <>
 EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul2Xs>(const PacketMul2Xs& a) {
-  return predux_mul<PacketXs>(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
-                                                    unpacket_traits<PacketXs>::size));
+  return predux_mul<PacketMul1Xs>(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
+                                                        unpacket_traits<PacketMul1Xs>::size));
 }

 template <>
@@ -5217,10 +5276,10 @@ predux_half_dowto4(const PacketMul4Xs& a) {

 template <typename Packet>
 EIGEN_STRONG_INLINE typename std::enable_if<std::is_same<Packet, PacketMul2Xs>::value && (unpacket_traits<Packet>::size % 8) == 0,
-                                            PacketXs>::type
+                                            PacketMul1Xs>::type
 predux_half_dowto4(const PacketMul2Xs& a) {
   return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
-                               unpacket_traits<PacketXs>::size);
+                               unpacket_traits<PacketMul1Xs>::size);
 }

 }  // namespace internal
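predux_half_dowto4 above splits an m2 register group into its two m1 halves with vget and adds them, handing back a packet of half the width; the m4 predux_mul variants do the same split one level higher. Scalar model of the halving step (assumed semantics, not the patch code):

#include <cstddef>
#include <cstdint>

// Lane i of the m1 result is lo[i] + hi[i], the two halves of an m2 group.
inline void half_downto(const std::int16_t* m2, std::int16_t* m1, std::size_t half) {
  for (std::size_t i = 0; i < half; ++i) m1[i] = static_cast<std::int16_t>(m2[i] + m2[half + i]);
}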
+++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -19,6 +19,7 @@ namespace internal { typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 template <> struct packet_traits : default_packet_traits { typedef PacketXh type; @@ -58,8 +59,9 @@ struct packet_traits : default_packet_traits { }; }; +#else template <> -struct packet_traits : default_packet_traits { +struct packet_traits : default_packet_traits { typedef PacketMul2Xh type; typedef PacketXh half; @@ -96,6 +98,7 @@ struct packet_traits : default_packet_traits { HasErf = 0 }; }; +#endif template <> struct unpacket_traits { diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h index 67bc99d0b..eeb9141b4 100644 --- a/Eigen/src/Core/arch/RVV10/TypeCasting.h +++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h @@ -29,22 +29,22 @@ struct type_casting_traits { }; template <> -EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { - return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pcast(const PacketMul1Xi& a) { + return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { - return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xi pcast(const PacketMul1Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { +EIGEN_STRONG_INLINE PacketMul1Xf preinterpret(const PacketMul1Xi& a) { return __riscv_vreinterpret_v_i32m1_f32m1(a); } template <> -EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { +EIGEN_STRONG_INLINE PacketMul1Xi preinterpret(const PacketMul1Xf& a) { return __riscv_vreinterpret_v_f32m1_i32m1(a); } @@ -89,55 +89,55 @@ EIGEN_STRONG_INLINE PacketMul2Xi preinterpret(const } template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, - const PacketXi& d) { +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c, + const PacketMul1Xi& d) { return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d); } template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, - const PacketXi& d) { - return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c, + const PacketMul1Xi& d) { + return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, - const PacketXf& d) { +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c, + const PacketMul1Xf& d) { return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d); } template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, const 
PacketXf& c, - const PacketXf& d) { - return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c, + const PacketMul1Xf& d) { + return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXi& a, const PacketXi& b) { +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketMul1Xi& a, const PacketMul1Xi& b) { return __riscv_vcreate_v_i32m1_i32m2(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXi& a, const PacketXi& b) { - return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketMul1Xi& a, const PacketMul1Xi& b) { + return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXf& a, const PacketXf& b) { +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketMul1Xf& a, const PacketMul1Xf& b) { return __riscv_vcreate_v_f32m1_f32m2(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXf& a, const PacketXf& b) { - return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); } /********************************* 64 bits ************************************/ @@ -153,22 +153,22 @@ struct type_casting_traits { }; template <> -EIGEN_STRONG_INLINE PacketXd pcast(const PacketXl& a) { - return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pcast(const PacketMul1Xl& a) { + return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXl pcast(const PacketXd& a) { - return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pcast(const PacketMul1Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketXd preinterpret(const PacketXl& a) { +EIGEN_STRONG_INLINE PacketMul1Xd preinterpret(const PacketMul1Xl& a) { return __riscv_vreinterpret_v_i64m1_f64m1(a); } template <> -EIGEN_STRONG_INLINE PacketXl preinterpret(const PacketXd& a) { +EIGEN_STRONG_INLINE PacketMul1Xl preinterpret(const PacketMul1Xd& a) { return __riscv_vreinterpret_v_f64m1_i64m1(a); } @@ -213,68 +213,68 @@ EIGEN_STRONG_INLINE PacketMul2Xl preinterpret(const } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, - const PacketXl& d) { +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c, + const PacketMul1Xl& d) { return 
__riscv_vcreate_v_i64m1_i64m4(a, b, c, d); ; } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, - const PacketXl& d) { - return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c, + const PacketMul1Xl& d) { + return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, - const PacketXd& d) { +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c, + const PacketMul1Xd& d) { return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, - const PacketXd& d) { - return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c, + const PacketMul1Xd& d) { + return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXl& a, const PacketXl& b) { +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketMul1Xl& a, const PacketMul1Xl& b) { return __riscv_vcreate_v_i64m1_i64m2(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXl& a, const PacketXl& b) { - return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), - __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXd& a, const PacketXd& b) { +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketMul1Xd& a, const PacketMul1Xd& b) { return __riscv_vcreate_v_f64m1_f64m2(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXd& a, const PacketXd& b) { - return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), - __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); } /********************************* 16 bits ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXs& a, const 
PacketXs& b) { +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketMul1Xs& a, const PacketMul1Xs& b) { return __riscv_vcreate_v_i16m1_i16m2(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, const PacketXs& c, - const PacketXs& d) { +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c, + const PacketMul1Xs& d) { return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d); } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index a691d092e..ba72a8a4f 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -38,21 +38,10 @@ template ::ReturnType ResScalar; -#ifdef EIGEN_RISCV64_USE_RVV10 -#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ - typedef typename std::conditional_t< \ - NumTraits::IsComplex || NumTraits::IsComplex, \ - typename packet_traits::type, \ - typename gemv_packet_cond::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type> \ - name##Packet##postfix -#else #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename gemv_packet_cond< \ packet_size, typename packet_traits::type, typename packet_traits::half, \ typename unpacket_traits::half>::half>::type name##Packet##postfix -#endif PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index e91a14e9d..a0e160eba 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -264,7 +264,7 @@ struct functor_cost { static constexpr Index Cost = plain_enum_max(nested_functor_cost::Cost, 1); }; -template +template struct packet_traits; template @@ -285,12 +285,9 @@ struct find_best_packet_helper { typedef typename find_best_packet_helper::half>::type type; }; -template -struct find_best_packet; - -template +template struct find_best_packet { - typedef typename find_best_packet_helper::type>::type type; + typedef typename find_best_packet_helper::type>::type type; }; template { static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) { -#ifdef EIGEN_RISCV64_USE_RVV10 - typedef - typename std::conditional_t::IsComplex || NumTraits::IsComplex, - typename packet_traits::type, typename packet_traits::type> - Packet; - typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, - typename packet_traits::type, - typename packet_traits::type> - OtherPacket; - - constexpr Index PacketSize = unpacket_traits::size; -#else typedef typename packet_traits::type Packet; typedef typename packet_traits::type OtherPacket; - constexpr Index PacketSize = packet_traits::size; -#endif constexpr int RequiredAlignment = (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); + constexpr Index PacketSize = packet_traits::size; /*** dynamic-size vectorized paths ***/ if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 86d88ce6c..f21c72621 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -1815,19 +1815,6 @@ EIGEN_DECLARE_TEST(packetmath) { CALL_SUBTEST_14((packetmath::type>())); CALL_SUBTEST_14((packetmath_scatter_gather::type>())); CALL_SUBTEST_15(test::runner::run()); -#ifdef EIGEN_RISCV64_USE_RVV10 - 
CALL_SUBTEST_16((test::runner::type>::run())); - CALL_SUBTEST_17((test::runner::type>::run())); - CALL_SUBTEST_18((test::runner::type>::run())); - CALL_SUBTEST_19((test::runner::type>::run())); - CALL_SUBTEST_20((test::runner::type>::run())); - CALL_SUBTEST_21((test::runner::type>::run())); - CALL_SUBTEST_22((test::runner::type>::run())); - CALL_SUBTEST_23((test::runner::type>::run())); - CALL_SUBTEST_24((test::runner::type>::run())); - CALL_SUBTEST_25((test::runner::type>::run())); - CALL_SUBTEST_26((test::runner::type>::run())); -#endif g_first_pass = false; } } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 7f346814b..724fa40ba 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -107,11 +107,7 @@ template ::Vector struct vectorization_logic { typedef internal::packet_traits PacketTraits; -#ifdef EIGEN_RISCV64_USE_RVV10 - typedef typename internal::packet_traits::type PacketType; -#else typedef typename internal::packet_traits::type PacketType; -#endif typedef typename internal::unpacket_traits::half HalfPacketType; enum { PacketSize = internal::unpacket_traits::size, -- GitLab From ce896ac1702dbce0f411dda2b31857fec015c922 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 7 Nov 2025 13:31:17 +0000 Subject: [PATCH 13/21] Increase stack size to prevent assertion. --- Eigen/src/Core/util/ConfigureVectorization.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index cf2bb063e..9944887a3 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -442,6 +442,9 @@ extern "C" { #error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set." #endif +#undef EIGEN_STACK_ALLOCATION_LIMIT +#define EIGEN_STACK_ALLOCATION_LIMIT 196608 + #if defined(__riscv_zvfh) && defined(__riscv_zfh) #define EIGEN_VECTORIZE_RVV10FP16 #elif defined(__riscv_zvfh) -- GitLab From 9ee3d620600ff7166af42c04042a751baf2d8d88 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 7 Nov 2025 14:48:42 +0000 Subject: [PATCH 14/21] set EIGEN_RISCV64_DEFAULT_LMUL to 1. --- Eigen/src/Core/arch/RVV10/Complex.h | 725 ------------------ .../Core/arch/RVV10/GeneralBlockPanelKernel.h | 491 ------------ Eigen/src/Core/arch/RVV10/PacketMath.h | 4 + Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 4 files changed, 5 insertions(+), 1217 deletions(-) delete mode 100644 Eigen/src/Core/arch/RVV10/Complex.h delete mode 100644 Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h deleted file mode 100644 index b330ca4f8..000000000 --- a/Eigen/src/Core/arch/RVV10/Complex.h +++ /dev/null @@ -1,725 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2025 Kseniya Zaytseva -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
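// [Editor's note: annotation, not part of the deleted file.] Patch 14 lowers
// the default LMUL from 4 to 1, so each packet maps onto a single vector
// register. With the rvv_packet_size_selector arithmetic used by this port,
//   size = VLEN * LMUL / (CHAR_BIT * sizeof(Scalar)),
// a 128-bit VLEN (assumed example value) gives 4 float lanes per m1 packet
// versus 16 per m4 packet. The Complex.h implementation deleted below kept
// each complex packet de-interleaved as two such m1 vectors (real and imag),
// populated by the vlseg2e32/vsseg2e32 segment loads and stores.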
- -#ifndef EIGEN_COMPLEX_RVV10_H -#define EIGEN_COMPLEX_RVV10_H - -// IWYU pragma: private -#include "../../InternalHeaderCheck.h" - -namespace Eigen { - -namespace internal { - -/********************************* float32 ************************************/ - -struct PacketXcf { - EIGEN_STRONG_INLINE PacketXcf() {} - EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul1Xf& _real, const PacketMul1Xf& _imag) : real(_real), imag(_imag) {} - EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) - : real(__riscv_vget_v_f32m2_f32m1(a, 0)), imag(__riscv_vget_v_f32m2_f32m1(a, 1)) {} - PacketMul1Xf real; - PacketMul1Xf imag; -}; - -template <> -struct packet_traits> : default_packet_traits { - typedef PacketXcf type; - typedef PacketXcf half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasSqrt = 1, - HasSign = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasLog = 0, - HasSetLinear = 0 - }; -}; - -template <> -struct unpacket_traits { - typedef std::complex type; - typedef PacketXcf half; - typedef PacketMul2Xf as_real; - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -EIGEN_STRONG_INLINE PacketXcf pcast(const PacketMul2Xf& a) { - return PacketXcf(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& a) { - return __riscv_vcreate_v_f32m1_f32m2(a.real, a.imag); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { - PacketMul1Xf real = pset1(from.real()); - PacketMul1Xf imag = pset1(from.imag()); - return PacketXcf(real, imag); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf padd(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf psub(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pnegate(const PacketXcf& a) { - return PacketXcf(pnegate(a.real), pnegate(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { - return PacketXcf( - a.real, __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), - 0x80000000, unpacket_traits::size))); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& a, const PacketXcf& b) { - PacketMul1Xf v1 = pmul(a.real, b.real); - PacketMul1Xf v2 = pmul(a.imag, b.imag); - PacketMul1Xf v3 = pmul(a.real, b.imag); - PacketMul1Xf v4 = pmul(a.imag, b.real); - return PacketXcf(psub(v1, v2), padd(v3, v4)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const PacketXcf& b, const PacketXcf& c) { - PacketMul1Xf v1 = pmadd(a.real, b.real, c.real); - PacketMul1Xf v2 = pmul(a.imag, b.imag); - PacketMul1Xf v3 = pmadd(a.real, b.imag, c.imag); - PacketMul1Xf v4 = pmul(a.imag, b.real); - return PacketXcf(psub(v1, v2), padd(v3, v4)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { - PacketMask32 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); - PacketMul1Xf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); - return PacketXcf(res, res); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pand(const PacketXcf& a, const 
PacketXcf& b) { - return PacketXcf(pand(a.real, b.real), pand(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf por(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(por(a.real, b.real), por(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) { - return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) { - vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); - EIGEN_DEBUG_ALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) { - vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); - EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), - __riscv_vget_v_f32m1x2_f32m1(res, 1)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { - PacketMul1Xu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - PacketMul1Xu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); - // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... - return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { - PacketMul1Xu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); - real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - PacketMul1Xu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); - // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... 
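// [Editor's note:] the index comment above was copied from ploaddup; with the
// 0xfffffffc mask and the shift by 1, ploadquad's byte offsets actually repeat
// four times per element:
//   real_idx = 0 0 0 0 2*sizeof(float) (x4) 4*sizeof(float) (x4) ...
// i.e. each complex<float> element is broadcast to four consecutive lanes.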
- return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), - __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcf& from) { - vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); - vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); - vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcf& from) { - vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); - vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); - vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, - Index stride) { - vfloat32m1x2_t res = - __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); - return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter, PacketXcf>(std::complex* to, const PacketXcf& from, - Index stride) { - vfloat32m1x2_t from_rvv_type = __riscv_vundefined_f32m1x2(); - from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 0, from.real); - from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 1, from.imag); - __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcf& a) { - return std::complex(pfirst(a.real), pfirst(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { - return PacketXcf(preverse(a.real), preverse(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { - return PacketXcf(a.imag, a.real); -} - -template <> -EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { - return std::complex(predux(a.real), predux(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf pdiv(const PacketXcf& a, const PacketXcf& b) { - PacketXcf b_conj = pconj(b); - PacketXcf dividend = pmul(a, b_conj); - PacketMul1Xf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); - return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer_real[unpacket_traits::size * N]; - float buffer_imag[unpacket_traits::size * N]; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); - __riscv_vsse32(&buffer_imag[i], N * sizeof(float), kernel.packet[i].imag, unpacket_traits::size); - } - - for (i = 0; i < N; i++) { - kernel.packet[i].real = - __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); - kernel.packet[i].imag = - __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template -EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { - typedef typename unpacket_traits::type Scalar; - typedef typename Scalar::value_type RealScalar; - typedef typename packet_traits::type RealPacket; - typedef typename unpacket_traits::packet_mask PacketMask; - - // Computes the 
principal sqrt of the complex numbers in the input. - // - // For example, for packets containing 2 complex numbers stored in - // [real0, real1, imag0, imag1] format - // a = [a0, a1] = [x0, x1, y0, y1], - // where x0 = real(a0), y0 = imag(a0) etc., this function returns - // b = [b0, b1] = [u0, u1, v0, v1], - // such that b0^2 = a0, b1^2 = a1. - // - // To derive the formula for the complex square roots, let's consider the equation for - // a single complex square root of the number x + i*y. We want to find real numbers - // u and v such that - // (u + i*v)^2 = x + i*y <=> - // u^2 - v^2 + i*2*u*v = x + i*v. - // By equating the real and imaginary parts we get: - // u^2 - v^2 = x - // 2*u*v = y. - // - // For x >= 0, this has the numerically stable solution - // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) - // v = 0.5 * (y / u) - // and for x < 0, - // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) - // u = 0.5 * (y / v) - // - // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as - // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , - - // In the following, without lack of generality, we have annotated the code, assuming - // that the input is a packet of 2 complex numbers. - // - // Step 1. Compute l = [l0, l1], where - // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) - // To avoid over- and underflow, we use the stable formula for each hypotenuse - // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), - // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. - - Packet a_abs = Packet(pabs(a.real), pabs(a.imag)); - RealPacket a_max = pmax(a_abs.real, a_abs.imag); - RealPacket a_min = pmin(a_abs.real, a_abs.imag); - - PacketMask a_min_zero_mask = pcmp_eq_mask(a_min, pzero(a_min)); - PacketMask a_max_zero_mask = pcmp_eq_mask(a_max, pzero(a_max)); - RealPacket r = pdiv(a_min, a_max); - - const RealPacket cst_one = pset1(RealScalar(1)); - const RealPacket cst_true = ptrue(cst_one); - RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); - // Set l to a_max if a_min is zero. - l = pselect(a_min_zero_mask, a_max, l); - - // Step 2. Compute [rho0, rho1], where - // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) - // We don't care about the imaginary parts computed here. They will be overwritten later. - const RealPacket cst_half = pset1(RealScalar(0.5)); - RealPacket rho = psqrt(pmul(cst_half, padd(a_abs.real, l))); - - // Step 3. Compute [rho0, rho1, eta0, eta1], where - // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2. - // set eta = 0 of input is 0 + i0. - RealPacket eta = pselect(a_max_zero_mask, pzero(cst_one), pmul(cst_half, pdiv(a.imag, rho))); - // Compute result for inputs with positive real part. - Packet positive_real_result = Packet(rho, eta); - - // Step 4. Compute solution for inputs with negative real part: - // [|eta0| |eta1|, sign(y0)*rho0, sign(y1)*rho1] - const RealPacket cst_imag_sign_mask = pset1(RealScalar(-0.0)); - RealPacket imag_signs = pand(a.imag, cst_imag_sign_mask); - Packet negative_real_result = Packet(pabs(eta), por(rho, imag_signs)); - - // Step 5. Select solution branch based on the sign of the real parts. - PacketMask negative_real_mask_half = pcmp_lt_mask(a.real, pzero(a.real)); - Packet result = Packet(pselect(negative_real_mask_half, negative_real_result.real, positive_real_result.real), - pselect(negative_real_mask_half, negative_real_result.imag, positive_real_result.imag)); - - // Step 6. 
Handle special cases for infinities: - // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN - // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN - // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y - // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y - const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); - PacketMask is_real_inf = pcmp_eq_mask(a_abs.real, cst_pos_inf); - // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. - const Packet cst_one_zero = pset1(Scalar(RealScalar(1.0), RealScalar(0.0))); - Packet real_inf_result = Packet(pmul(a_abs.real, cst_one_zero.real), pmul(a_abs.imag, cst_one_zero.imag)); - real_inf_result = Packet(pselect(negative_real_mask_half, real_inf_result.imag, real_inf_result.real), - pselect(negative_real_mask_half, real_inf_result.real, real_inf_result.imag)); - // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. - PacketMask is_imag_inf = pcmp_eq_mask(a_abs.imag, cst_pos_inf); - // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan - result = Packet(pselect(pcmp_eq_mask(result.real, result.real), result.real, cst_true), - pselect(pcmp_eq_mask(result.imag, result.imag), result.imag, cst_true)); - - result = Packet(pselect(is_real_inf, real_inf_result.real, result.real), - pselect(is_real_inf, real_inf_result.imag, result.imag)); - - return Packet(pselect(is_imag_inf, cst_pos_inf, result.real), pselect(is_imag_inf, a.imag, result.imag)); -} - -template -EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { - typedef typename unpacket_traits::type Scalar; - typedef typename Scalar::value_type RealScalar; - typedef typename packet_traits::type RealPacket; - typedef typename unpacket_traits::packet_mask PacketMask; - - // log(sqrt(a^2 + b^2)), atan2(b, a) - RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); - RealPacket ximg = patan2(x.imag, x.real); - - const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); - RealPacket r_abs = pabs(x.real); - RealPacket i_abs = pabs(x.imag); - PacketMask is_r_pos_inf = pcmp_eq_mask(r_abs, cst_pos_inf); - PacketMask is_i_pos_inf = pcmp_eq_mask(i_abs, cst_pos_inf); - PacketMask is_any_inf = por(is_r_pos_inf, is_i_pos_inf); - RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); - - return Packet(xreal, ximg); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) { - return psqrt_complex_rvv(a); -} - -template <> -EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) { - return plog_complex_rvv(a); -} - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { - return psub(this->pmul(x, y), c); - } - EIGEN_STRONG_INLINE PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { - return PacketXcf(Eigen::internal::pmul(x, pcast(y))); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcf pmsub(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { - return psub(this->pmul(x, y), c); - } - EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const 
PacketMul2Xf& y) const { - return PacketXcf(Eigen::internal::pmul(pcast(x), y)); - } -}; - -/********************************* double ************************************/ - -struct PacketXcd { - EIGEN_STRONG_INLINE PacketXcd() {} - EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul1Xd& _real, const PacketMul1Xd& _imag) : real(_real), imag(_imag) {} - EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) - : real(__riscv_vget_v_f64m2_f64m1(a, 0)), imag(__riscv_vget_v_f64m2_f64m1(a, 1)) {} - PacketMul1Xd real; - PacketMul1Xd imag; -}; - -template <> -struct packet_traits> : default_packet_traits { - typedef PacketXcd type; - typedef PacketXcd half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasSqrt = 1, - HasSign = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasLog = 0, - HasSetLinear = 0 - }; -}; - -template <> -struct unpacket_traits { - typedef std::complex type; - typedef PacketXcd half; - typedef PacketMul2Xd as_real; - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -EIGEN_STRONG_INLINE PacketXcd pcast(const PacketMul2Xd& a) { - return PacketXcd(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& a) { - return __riscv_vcreate_v_f64m1_f64m2(a.real, a.imag); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { - PacketMul1Xd real = pset1(from.real()); - PacketMul1Xd imag = pset1(from.imag()); - return PacketXcd(real, imag); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd padd(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd psub(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { - return PacketXcd(pnegate(a.real), pnegate(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { - return PacketXcd( - a.real, __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vx_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& a, const PacketXcd& b) { - PacketMul1Xd v1 = pmul(a.real, b.real); - PacketMul1Xd v2 = pmul(a.imag, b.imag); - PacketMul1Xd v3 = pmul(a.real, b.imag); - PacketMul1Xd v4 = pmul(a.imag, b.real); - return PacketXcd(psub(v1, v2), padd(v3, v4)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketXcd& b, const PacketXcd& c) { - PacketMul1Xd v1 = pmadd(a.real, b.real, c.real); - PacketMul1Xd v2 = pmul(a.imag, b.imag); - PacketMul1Xd v3 = pmadd(a.real, b.imag, c.imag); - PacketMul1Xd v4 = pmul(a.imag, b.real); - return PacketXcd(psub(v1, v2), padd(v3, v4)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { - PacketMask64 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); - PacketMul1Xd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); - return PacketXcd(res, res); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pand(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); 
-} - -template <> -EIGEN_STRONG_INLINE PacketXcd por(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) { - return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* from) { - vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); - EIGEN_DEBUG_ALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) { - vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); - EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), - __riscv_vget_v_f64m1x2_f64m1(res, 1)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { - PacketMul1Xul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); - real_idx = - __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, - unpacket_traits::size); - PacketMul1Xul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); - // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... - return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { - PacketMul1Xul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); - real_idx = - __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, - unpacket_traits::size); - PacketMul1Xul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); - // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... 
- return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), - __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcd& from) { - vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); - vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); - vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcd& from) { - vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); - vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); - vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, - Index stride) { - vfloat64m1x2_t res = - __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); - return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::complex* to, const PacketXcd& from, - Index stride) { - vfloat64m1x2_t from_rvv_type = __riscv_vundefined_f64m1x2(); - from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); - from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); - __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcd& a) { - return std::complex(pfirst(a.real), pfirst(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { - return PacketXcd(preverse(a.real), preverse(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { - return PacketXcd(a.imag, a.real); -} - -template <> -EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { - return std::complex(predux(a.real), predux(a.imag)); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd pdiv(const PacketXcd& a, const PacketXcd& b) { - PacketXcd b_conj = pconj(b); - PacketXcd dividend = pmul(a, b_conj); - PacketMul1Xd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); - return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double buffer_real[unpacket_traits::size * N]; - double buffer_imag[unpacket_traits::size * N]; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits::size); - __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits::size); - } - - for (i = 0; i < N; i++) { - kernel.packet[i].real = - __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); - kernel.packet[i].imag = - __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template <> -EIGEN_STRONG_INLINE PacketXcd psqrt(const PacketXcd& a) { - return psqrt_complex_rvv(a); -} - -template <> -EIGEN_STRONG_INLINE PacketXcd plog(const PacketXcd& a) { - return plog_complex_rvv(a); -} - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcd pmadd(const 
PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { - return psub(this->pmul(x, y), c); - } - EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { - return PacketXcd(Eigen::internal::pmul(x, pcast(y))); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { - return padd(c, this->pmul(x, y)); - } - EIGEN_STRONG_INLINE PacketXcd pmsub(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { - return psub(this->pmul(x, y), c); - } - EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { - return PacketXcd(Eigen::internal::pmul(pcast(x), y)); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h deleted file mode 100644 index 212df434f..000000000 --- a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h +++ /dev/null @@ -1,491 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2024 Kseniya Zaytseva -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H -#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H -#include "../../InternalHeaderCheck.h" - -namespace Eigen { -namespace internal { - -/********************************* real ************************************/ - -template <> -struct gebp_traits - : gebp_traits { - typedef float RhsPacket; - typedef QuadPacket RhsPacketx4; - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, - const FixedInt<0>&) const { - c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, - const LaneIdType& lane) const { - c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits::size); - } -}; - -template <> -struct gebp_traits - : gebp_traits { - typedef double RhsPacket; - typedef QuadPacket RhsPacketx4; - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) 
const { dest = ploadquad(b); } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, - const FixedInt<0>&) const { - c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, - const LaneIdType& lane) const { - c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits::size); - } -}; - -#if defined(EIGEN_VECTORIZE_RVV10FP16) - -template <> -struct gebp_traits - : gebp_traits { - typedef half RhsPacket; - typedef PacketXh LhsPacket; - typedef PacketXh AccPacket; - typedef QuadPacket RhsPacketx4; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, - const FixedInt<0>&) const { - c = __riscv_vfmadd_vf_f16m1(a, b, c, unpacket_traits::size); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, - const LaneIdType& lane) const { - c = __riscv_vfmadd_vf_f16m1(a, b.get(lane), c, unpacket_traits::size); - } -}; - -#endif - -/********************************* complex ************************************/ - -#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ - typedef typename packet_conditional< \ - packet_size, typename packet_traits::type, typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type name##Packet##postfix - -#define RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(packet_size) \ - typedef typename packet_conditional< \ - packet_size, typename packet_traits::type, typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type ScalarPacket - -template -struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, - PacketSize_> : gebp_traits, std::complex, ConjLhs_, ConjRhs_, - Architecture::Generic, PacketSize_> { - typedef std::complex Scalar; - typedef std::complex LhsScalar; - typedef std::complex RhsScalar; - typedef std::complex ResScalar; - typedef typename packet_traits>::type RealPacket; - - PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); - RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(PacketSize_); -#undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR - - enum { - ConjLhs = ConjLhs_, - ConjRhs = ConjRhs_, - Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, - ResPacketSize = Vectorizable ? unpacket_traits::size : 1, - LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - RealPacketSize = Vectorizable ? 
unpacket_traits::size : 1, - - nr = 4, - mr = ResPacketSize, - - LhsProgress = ResPacketSize, - RhsProgress = 1 - }; - - typedef DoublePacket DoublePacketType; - - typedef std::conditional_t LhsPacket4Packing; - typedef std::conditional_t LhsPacket; - typedef std::conditional_t, Scalar> RhsPacket; - typedef std::conditional_t ResPacket; - typedef std::conditional_t AccPacket; - - typedef QuadPacket RhsPacketx4; - - EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } - - EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { - p.first = pset1(RealScalar(0)); - p.second = pset1(RealScalar(0)); - } - - // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1(*b); } - - // Vectorized path - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { - dest.first = pset1(numext::real(*b)); - dest.second = pset1(numext::imag(*b)); - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - loadRhs(b, dest.B_0); - loadRhs(b + 1, dest.B1); - loadRhs(b + 2, dest.B2); - loadRhs(b + 3, dest.B3); - } - - // Scalar path - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); } - - // Vectorized path - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); } - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacket& dest) const { - loadQuadToDoublePacket(b, dest); - } - - // nothing special here - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { - dest = pload((const typename unpacket_traits::type*)(a)); - } - - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); - } - - EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { - PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); - PacketXf v4 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); - return PacketXcf(v1, v4); - } - - EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { - PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); - PacketXd v4 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); - return PacketXcd(v1, v4); - } - - template - EIGEN_STRONG_INLINE std::enable_if_t::value> madd(const LhsPacketType& a, - const RhsPacketType& b, - DoublePacket& c, - TmpType& /*tmp*/, - const LaneIdType&) const { - c.first = pmadd_scalar(a, b.first, c.first); - c.second = pmadd_scalar(a, b.second, c.second); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, - const LaneIdType& lane) const { - madd(a, b.get(lane), c, tmp, lane); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, - const LaneIdType&) const { - c = cj.pmadd(a, b, c); - } - - protected: - conj_helper cj; -}; - -#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \ - typedef typename packet_conditional< \ - packet_size, typename packet_traits::type, typename packet_traits::half, \ - typename 
unpacket_traits::half>::half>::type ScalarPacket##postfix - -template -class gebp_traits, false, ConjRhs_, Architecture::RVV10, PacketSize_> - : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> { - public: - typedef std::complex Scalar; - typedef RealScalar LhsScalar; - typedef Scalar RhsScalar; - typedef Scalar ResScalar; - PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_); - PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_); -#undef PACKET_DECL_COND_SCALAR_POSTFIX - - enum { - ConjLhs = false, - ConjRhs = ConjRhs_, - Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits::size : 1, - - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - // FIXME: should depend on NumberOfRegisters - nr = 4, - mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize, - - LhsProgress = ResPacketSize, - RhsProgress = 1 - }; - - typedef std::conditional_t LhsPacket; - typedef RhsScalar RhsPacket; - typedef std::conditional_t ResPacket; - typedef LhsPacket LhsPacket4Packing; - typedef QuadPacket RhsPacketx4; - typedef ResPacket AccPacket; - - EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } - - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu((const typename unpacket_traits::type*)a); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, - const LaneIdType&) const { - madd_impl(a, b, c, tmp, std::conditional_t()); - } - - EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXf& a, std::complex b, const PacketXcf& c) const { - PacketXf v1 = __riscv_vfmadd_vf_f32m1(a, b.real(), c.real, unpacket_traits::size); - PacketXf v3 = __riscv_vfmadd_vf_f32m1(a, b.imag(), c.imag, unpacket_traits::size); - return PacketXcf(v1, v3); - } - - EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXd& a, std::complex b, const PacketXcd& c) const { - PacketXd v1 = __riscv_vfmadd_vf_f64m1(a, b.real(), c.real, unpacket_traits::size); - PacketXd v3 = __riscv_vfmadd_vf_f64m1(a, b.imag(), c.imag, unpacket_traits::size); - return PacketXcd(v1, v3); - } - - template - EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, - RhsPacketType& tmp, const true_type&) const { - EIGEN_UNUSED_VARIABLE(tmp); - c = pmadd_scalar(a, b, c); - } - - EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, - const false_type&) const { - c 
+= a * b; - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, - const LaneIdType& lane) const { - madd(a, b.get(lane), c, tmp, lane); - } - - template - EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - conj_helper cj; - r = cj.pmadd(alpha, c, r); - } -}; - -template -class gebp_traits, RealScalar, ConjLhs_, false, Architecture::RVV10, PacketSize_> - : public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { - public: - typedef std::complex LhsScalar; - typedef RealScalar RhsScalar; - typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - - PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); - PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); -#undef PACKET_DECL_COND_POSTFIX - - enum { - ConjLhs = ConjLhs_, - ConjRhs = false, - Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits::size : 1, - - nr = 4, - mr = 3 * LhsPacketSize, - - LhsProgress = LhsPacketSize, - RhsProgress = 1 - }; - - typedef std::conditional_t LhsPacket; - typedef RhsScalar RhsPacket; - typedef std::conditional_t ResPacket; - typedef LhsPacket LhsPacket4Packing; - - typedef QuadPacket RhsPacketx4; - - typedef ResPacket AccPacket; - - EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { - dest = pset1(*b); - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - loadRhsQuad_impl(b, dest, std::conditional_t()); - } - - EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { - // FIXME we can do better! 
- // what we want here is a ploadheight - RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]}; - dest = ploadquad(tmp); - } - - EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { - eigen_internal_assert(RhsPacketSize <= 8); - dest = pset1(*b); - } - - EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } - - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { - dest = ploadu(a); - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, - const LaneIdType&) const { - madd_impl(a, b, c, tmp, std::conditional_t()); - } - - EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { - PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); - PacketXf v3 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); - return PacketXcf(v1, v3); - } - - EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { - PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); - PacketXd v3 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); - return PacketXcd(v1, v3); - } - - template - EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, - RhsPacketType& tmp, const true_type&) const { - EIGEN_UNUSED_VARIABLE(tmp); - c = pmadd_scalar(a, b, c); - } - - EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, - const false_type&) const { - c += a * b; - } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, - const LaneIdType& lane) const { - madd(a, b.get(lane), c, tmp, lane); - } - - template - EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { - conj_helper cj; - r = cj.pmadd(c, alpha, r); - } -}; - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index e12a1a4a1..ef7311406 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -25,6 +25,10 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +// Temporarily make LMUL = 1 +#undef EIGEN_RISCV64_DEFAULT_LMUL +#define EIGEN_RISCV64_DEFAULT_LMUL 1 + template struct rvv_packet_size_selector { enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) }; diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 9944887a3..38402e4db 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -540,7 +540,7 @@ extern "C" { #if defined(__riscv) // Defines the default LMUL for RISC-V #ifndef EIGEN_RISCV64_DEFAULT_LMUL -#define EIGEN_RISCV64_DEFAULT_LMUL 4 +#define EIGEN_RISCV64_DEFAULT_LMUL 1 #endif #endif -- GitLab From b9104f18a7628b3ea7a239883324bff1f8847c7a Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 7 Nov 2025 20:49:30 +0000 Subject: [PATCH 15/21] Fix PacketMathFP16 to match PacketMath style. 
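For context: the FP16 typedefs move to the LMUL-suffixed naming already used by
the float and integer packets (PacketMul1Xh, PacketMul2Xh), and PacketXh becomes
an alias for whichever of them matches EIGEN_RISCV64_DEFAULT_LMUL. Lane counts
follow rvv_packet_size_selector (size = VectorLength * VectorLMul /
(sizeof(Scalar) * CHAR_BIT)); a minimal sketch of that arithmetic, assuming a
hypothetical 128-bit fixed vector length:

    // Assumes EIGEN_RISCV64_RVV_VL == 128 and sizeof(Eigen::half) == 2.
    static_assert(128 * 1 / (2 * 8) == 8,  "PacketMul1Xh: 8 lanes at LMUL=1");
    static_assert(128 * 2 / (2 * 8) == 16, "PacketMul2Xh: 16 lanes at LMUL=2");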
--- Eigen/src/Core/arch/RVV10/PacketMathFP16.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h index 1857e48c9..3801b858b 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -16,14 +16,16 @@ namespace Eigen { namespace internal { -typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat16m1_t PacketMul1Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); #if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef PacketMul1Xh PacketXh; + template <> struct packet_traits : default_packet_traits { - typedef PacketXh type; - typedef PacketXh half; + typedef PacketMul1Xh type; + typedef PacketMul1Xh half; enum { Vectorizable = 1, @@ -60,10 +62,12 @@ struct packet_traits : default_packet_traits { }; #else +typedef PacketMul2Xh PacketXh; + template <> struct packet_traits : default_packet_traits { typedef PacketMul2Xh type; - typedef PacketXh half; + typedef PacketMul1Xh half; enum { Vectorizable = 1, @@ -101,9 +105,9 @@ struct packet_traits : default_packet_traits { #endif template <> -struct unpacket_traits { +struct unpacket_traits { typedef Eigen::half type; - typedef PacketXh half; // Half not yet implemented + typedef PacketMul1Xh half; // Half not yet implemented typedef PacketXs integer_packet; typedef numext::uint8_t mask_t; @@ -119,7 +123,7 @@ struct unpacket_traits { template <> struct unpacket_traits { typedef Eigen::half type; - typedef PacketXh half; + typedef PacketMul1Xh half; typedef PacketMul2Xs integer_packet; typedef numext::uint8_t mask_t; -- GitLab From edacb9721ddd30278126977f201350036584354b Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 7 Nov 2025 20:51:16 +0000 Subject: [PATCH 16/21] Remove unused files. --- Eigen/Core | 5 ----- 1 file changed, 5 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 1b9cf29d9..6968c2b09 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -280,7 +280,6 @@ using std::ptrdiff_t; #include "src/Core/arch/RVV10/PacketMath.h" #include "src/Core/arch/RVV10/TypeCasting.h" #include "src/Core/arch/RVV10/MathFunctions.h" -//#include "src/Core/arch/RVV10/Complex.h" #if defined EIGEN_VECTORIZE_RVV10FP16 #include "src/Core/arch/RVV10/PacketMathFP16.h" #endif @@ -437,10 +436,6 @@ using std::ptrdiff_t; #endif #endif -#if defined(EIGEN_VECTORIZE_RVV10) -//#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h" -#endif - #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" #include "src/Core/PartialReduxEvaluator.h" -- GitLab From 4232b261add19c4b08a41b6bcfedebea39fdd100 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Mon, 10 Nov 2025 19:36:09 +0000 Subject: [PATCH 17/21] Remove old code. 
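For context: with RVV10/Complex.h commented out in the previous patch, no packet
type stores split real/imag vectors anymore, so the RVV10-only branch in
squared_norm_functor::packetOp is dead code; every remaining complex packet
wraps a single interleaved vector v. An illustrative sketch of the two layouts
(hypothetical struct names, not from this patch):

    struct SplitPacket { PacketXf real, imag; };  // old RVV10 layout; needed the special case
    struct WrapPacket  { Packet4f  v; };          // generic layout; pmul(v, v) squares the
                                                  // real and imaginary lanes in one go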
--- Eigen/src/Core/functors/UnaryFunctors.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index da3e10e90..202995ff0 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -114,11 +114,7 @@ struct squared_norm_functor { } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { -#if defined EIGEN_VECTORIZE_RVV10 - return Packet(pmul(a.real, a.real), pmul(a.imag, a.imag)); -#else return Packet(pmul(a.v, a.v)); -#endif } }; template -- GitLab From 9047bef724f25d8a46ebe89cfdc904a8d78e3bdb Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 12 Nov 2025 15:03:45 +0000 Subject: [PATCH 18/21] Split PacketMath.h into 3 parts - one for each LMUL = 1, 2 and 4. --- Eigen/src/Core/arch/RVV10/PacketMath.h | 5028 +++++------------------ Eigen/src/Core/arch/RVV10/PacketMath2.h | 1506 +++++++ Eigen/src/Core/arch/RVV10/PacketMath4.h | 1431 +++++++ 3 files changed, 4006 insertions(+), 3959 deletions(-) create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath2.h create mode 100644 Eigen/src/Core/arch/RVV10/PacketMath4.h diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h index ef7311406..b7a6db817 100644 --- a/Eigen/src/Core/arch/RVV10/PacketMath.h +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -469,2693 +469,26 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { } } -/********************************* PacketMul4Xi ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pset1(const numext::int32_t& from) { - return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi plset(const numext::int32_t& a) { - PacketMul4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)); - return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) { - return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { - return __riscv_vnmsub_vv_i32m4(a, b, c, 
unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { - return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) { - PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) { - PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) { - PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) { - return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { - return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { - return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { - return __riscv_vreinterpret_i32m4( - __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) { - return __riscv_vsll_vx_i32m4(a, N, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pload(const numext::int32_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { - PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
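- // (Lane index with bit 0 cleared, shifted left by one, gives byte offsets
- // 0, 0, 4, 4, 8, 8, ...; the ordered indexed load below therefore reads each
- // source element twice, duplicating it into adjacent lanes.)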
- return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi ploadquad(const numext::int32_t* from) { - PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, - Index stride) { - return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, - Index stride) { - __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) { - return __riscv_vmv_x_s_i32m4_i32(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { - PacketMul4Xu idx = - __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { - PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); - return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( - a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { - PacketMul1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), - unpacket_traits::size); - PacketMul1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), - unpacket_traits::size); - return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int32_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -/********************************* PacketMul2Xi 
************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pset1(const numext::int32_t& from) { - return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi plset(const numext::int32_t& a) { - PacketMul2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)); - return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pzero(const PacketMul2Xi& /*a*/) { - return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi padd(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi psub(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pmul(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pdiv(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { - return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { - return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pmin(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pmax(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le(const PacketMul2Xi& a, const PacketMul2Xi& b) { - PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt(const PacketMul2Xi& a, const PacketMul2Xi& b) { - PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq(const PacketMul2Xi& a, const PacketMul2Xi& b) { - PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi ptrue(const PacketMul2Xi& /*a*/) { - return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE 
PacketMul2Xi pand(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi por(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { - return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { - return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { - return __riscv_vreinterpret_i32m2( - __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) { - return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pload(const numext::int32_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi ploaddup(const numext::int32_t* from) { - PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
- return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi ploadquad(const numext::int32_t* from) { - PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul2Xi& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul2Xi& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, - Index stride) { - return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from, - Index stride) { - __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul2Xi& a) { - return __riscv_vmv_x_s_i32m2_i32(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) { - PacketMul2Xu idx = - __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { - PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); - return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 2), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& a) { - return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( - a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 2), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int32_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xi>::type -predux_half_dowto4(const PacketMul4Xi& a) { - return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), - unpacket_traits::size); -} - 
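- // Same folding for LMUL=2: vget splits the register into two LMUL=1 halves
- // that are summed lane-wise into a single PacketMul1Xi.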
-template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xi>::type -predux_half_dowto4(const PacketMul2Xi& a) { - return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), - unpacket_traits::size); -} - -/********************************* float32 ************************************/ - -typedef eigen_packet_wrapper PacketMul1Xf; -typedef eigen_packet_wrapper - PacketMul2Xf; -typedef eigen_packet_wrapper - PacketMul4Xf; - -#if EIGEN_RISCV64_DEFAULT_LMUL == 1 -typedef PacketMul1Xf PacketXf; - -template <> -struct packet_traits : default_packet_traits { - typedef PacketMul1Xf type; - typedef PacketMul1Xf half; - - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0, - - HasCmp = 1, - HasDiv = 1, - HasRound = 1, - - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH - }; -}; - -#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 -typedef PacketMul2Xf PacketXf; - -template <> -struct packet_traits : default_packet_traits { - typedef PacketMul2Xf type; - typedef PacketMul1Xf half; - - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0, - - HasCmp = 1, - HasDiv = 1, - HasRound = 1, - - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH - }; -}; - -#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 -typedef PacketMul4Xf PacketXf; - -template <> -struct packet_traits : default_packet_traits { - typedef PacketMul4Xf type; - typedef PacketMul2Xf half; - - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0, - - HasCmp = 1, - HasDiv = 1, - HasRound = 1, - - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH - }; -}; -#endif - -template <> -struct unpacket_traits { - typedef float type; - typedef PacketMul1Xf half; // Half not yet implemented - typedef PacketMul1Xi integer_packet; - typedef numext::uint8_t mask_t; - typedef PacketMask32 packet_mask; - - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits { - typedef float type; - typedef PacketMul1Xf half; - typedef PacketMul2Xi integer_packet; - typedef numext::uint8_t mask_t; - typedef PacketMask16 packet_mask; - - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits 
{ - typedef float type; - typedef PacketMul2Xf half; - typedef PacketMul4Xi integer_packet; - typedef numext::uint8_t mask_t; - typedef PacketMask8 packet_mask; - - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -/********************************* PacketMul1Xf ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf ptrue(const PacketMul1Xf& /*a*/) { - return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pzero(const PacketMul1Xf& /*a*/) { - return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pabs(const PacketMul1Xf& a) { - return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pset1(const float& from) { - return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pset1frombits(numext::uint32_t from) { - return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf plset(const float& a) { - PacketMul1Xf idx = __riscv_vfcvt_f_x_v_f32m1( - __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)), - unpacket_traits::size); - return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf padd(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf psub(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pnegate(const PacketMul1Xf& a) { - return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pconj(const PacketMul1Xf& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmul(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pdiv(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { - return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { - return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pnmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { - return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pnmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { - return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = 
__riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - - return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return pmin(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - - return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return pmax(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pcmp_le(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pcmp_eq(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt_or_nan(const PacketMul1Xf& a, const PacketMul1Xf& b) { - PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits::size); -} - -// Logical Operations are not supported for float, so reinterpret casts -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pand(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( - __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf por(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1( - __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pxor(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1( - __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pandnot(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( - __riscv_vreinterpret_v_f32m1_u32m1(a), - 
__riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pload(const float* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf ploaddup(const float* from) { - PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf ploadquad(const float* from) { - PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul1Xf& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul1Xf& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul1Xf pgather(const float* from, Index stride) { - return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul1Xf& from, Index stride) { - __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE float pfirst(const PacketMul1Xf& a) { - return __riscv_vfmv_f_s_f32m1_f32(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf psqrt(const PacketMul1Xf& a) { - return __riscv_vfsqrt_v_f32m1(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf print(const PacketMul1Xf& a) { - const PacketMul1Xf limit = pset1(static_cast(1 << 23)); - const PacketMul1Xf abs_a = pabs(a); - - PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); - const PacketMul1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), - unpacket_traits::size); - - mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); - PacketMul1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pfloor(const PacketMul1Xf& a) { - PacketMul1Xf tmp = print(a); - // If greater, subtract one. 
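- // (print() rounds to nearest, so the rounded value can exceed a; stepping
- // down by one in exactly those lanes yields the floor.)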
- PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf preverse(const PacketMul1Xf& a) { - PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pfrexp(const PacketMul1Xf& a, PacketMul1Xf& exponent) { - return pfrexp_generic(a, exponent); -} - -template <> -EIGEN_STRONG_INLINE float predux(const PacketMul1Xf& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1( - a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE float predux_mul(const PacketMul1Xf& a) { - // Multiply the vector by its reverse - PacketMul1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits::size); - PacketMul1Xf half_prod; - - if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); - } - // Last reduction - half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); - prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); - - // The reduction is done to the first element. 
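- // (Multiplying by the reverse and then by successively narrower slidedown
- // halves forms a log2-depth product tree whose final result lands in lane 0.)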
- return pfirst(prod); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const PacketMul1Xf& a) { - return ( - std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - (std::numeric_limits::max)()); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const PacketMul1Xf& a) { - return ( - std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - -(std::numeric_limits::max)()); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer[unpacket_traits::size * N]; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); - } - - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xf pldexp(const PacketMul1Xf& a, const PacketMul1Xf& exponent) { - return pldexp_generic(a, exponent); -} - -template <> -EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) { - return __riscv_vmor_mm_b32(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) { - return __riscv_vmand_mm_b32(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMul1Xf pselect(const PacketMask32& mask, const PacketMul1Xf& a, const PacketMul1Xf& b) { - return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); -} - -/********************************* PacketMul4Xf ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf ptrue(const PacketMul4Xf& /*a*/) { - return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pzero(const PacketMul4Xf& /*a*/) { - return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) { - return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pset1(const float& from) { - return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t from) { - return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf plset(const float& a) { - PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4( - __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)), - unpacket_traits::size); - return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf padd(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf psub(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits::size); -} - 
-template <> -EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) { - return __riscv_vfneg_v_f32m4(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmul(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pdiv(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { - return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { - return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { - return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { - return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMul4Xf nans = - __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); - PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); - - return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return pmin(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMul4Xf nans = - __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); - PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); - - return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return pmax(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE 
PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) { - PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits::size); -} - -// Logical Operations are not supported for float, so reinterpret casts -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vreinterpret_v_f32m4_u32m4(b), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) { - return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( - __riscv_vreinterpret_v_f32m4_u32m4(a), - __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pload(const float* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf ploaddup(const float* from) { - PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf ploadquad(const float* from) { - PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); - idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul4Xf& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather(const float* from, Index stride) { - return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) { - __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) { - return __riscv_vfmv_f_s_f32m4_f32(a); -} - -template 
<> -EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { - return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) { - const PacketMul4Xf limit = pset1(static_cast(1 << 23)); - const PacketMul4Xf abs_a = pabs(a); - - PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); - const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( - __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); - - mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); - PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { - PacketMul4Xf tmp = print(a); - // If greater, subtract one. - PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { - PacketMul4Xu idx = - __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, PacketMul4Xf& exponent) { - return pfrexp_generic(a, exponent); -} - -template <> -EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( - a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { - PacketMul1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), - unpacket_traits::size); - PacketMul1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), - unpacket_traits::size); - return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - (std::numeric_limits::max)()); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( - a, - __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - -(std::numeric_limits::max)()); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer[unpacket_traits::size * N]; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); - } - - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) { - return pldexp_generic(a, exponent); -} - -/********************************* PacketMul2Xf ************************************/ - -template <> -EIGEN_STRONG_INLINE 
-
-/********************************* PacketMul2Xf ************************************/
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf ptrue(const PacketMul2Xf& /*a*/) {
-  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pzero(const PacketMul2Xf& /*a*/) {
-  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) {
-  return __riscv_vfabs_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pset1<PacketMul2Xf>(const float& from) {
-  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits<PacketMul2Xf>(numext::uint32_t from) {
-  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf plset<PacketMul2Xf>(const float& a) {
-  PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(
-      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size)),
-      unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf padd(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf psub(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) {
-  return __riscv_vfneg_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) {
-  return a;
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmul(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pdiv(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMul2Xf nans =
-      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
-  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
-
-  return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return pmin(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
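The three pmin specializations above differ only in NaN policy: the default and PropagateNaN variants force a NaN result whenever either lane is NaN (the a==a / b==b masks feed the undisturbed-form vfmin, so lanes with a NaN operand keep the preloaded NaN), while PropagateNumbers maps to plain vfmin, which returns the numeric operand. A scalar sketch of the two behaviours (illustrative only):

  #include <cmath>
  #include <limits>

  // Default / PropagateNaN path: any NaN input poisons the result.
  float pmin_propagate_nan(float a, float b) {
    if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<float>::quiet_NaN();
    return a < b ? a : b;
  }

  // PropagateNumbers path: mirrors vfmin, where a NaN loses to a number.
  float pmin_propagate_numbers(float a, float b) {
    if (std::isnan(a)) return b;
    if (std::isnan(b)) return a;
    return a < b ? a : b;
  }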
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMul2Xf nans =
-      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
-  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
-
-  return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return pmax(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul2Xf>::size);
-}
-
-// Logical Operations are not supported for float, so reinterpret casts
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
-                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
-                                                                  unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
-                                                                 __riscv_vreinterpret_v_f32m2_u32m2(b),
-                                                                 unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
-                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
-                                                                  unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(
-      __riscv_vreinterpret_v_f32m2_u32m2(a),
-      __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size),
-      unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pload<PacketMul2Xf>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf ploadu<PacketMul2Xf>(const float* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
-}
-
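pcmp_le, pcmp_lt and pcmp_eq above all follow one pattern: compare into a mask register, then vmerge between pzero (all bits clear) and ptrue (all bits set), so each result lane is 0x00000000 or 0xffffffff viewed as a float. A scalar model of one lane (standalone, illustrative):

  #include <cstdint>
  #include <cstring>

  // One lane of pcmp_lt: true lanes become all-ones bit patterns, which
  // the bitwise select idioms built on pand/pandnot depend on.
  float pcmp_lt_scalar(float a, float b) {
    const std::uint32_t bits = (a < b) ? 0xffffffffu : 0x0u;
    float r;
    std::memcpy(&r, &bits, sizeof(r));
    return r;
  }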
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf ploaddup<PacketMul2Xf>(const float* from) {
-  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size);
-  idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits<PacketMul2Xf>::size), 1,
-                              unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf ploadquad<PacketMul2Xf>(const float* from) {
-  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size);
-  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketMul2Xf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketMul2Xf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather<float, PacketMul2Xf>(const float* from, Index stride) {
-  return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, PacketMul2Xf>(float* to, const PacketMul2Xf& from, Index stride) {
-  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE float pfirst<PacketMul2Xf>(const PacketMul2Xf& a) {
-  return __riscv_vfmv_f_s_f32m2_f32(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) {
-  return __riscv_vfsqrt_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) {
-  const PacketMul2Xf limit = pset1<PacketMul2Xf>(static_cast<float>(1 << 23));
-  const PacketMul2Xf abs_a = pabs(a);
-
-  PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
-  const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits<PacketMul2Xf>::size);
-  const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2(
-      __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits<PacketMul2Xf>::size), unpacket_traits<PacketMul2Xf>::size);
-
-  mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits<PacketMul2Xf>::size);
-  PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits<PacketMul2Xf>::size);
-}
-
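The indexed loads above encode their duplication patterns in byte offsets, since vloxei32 gathers with byte-granular indices: ploaddup uses (i & ~1) << 1, i.e. element offsets 0,0,1,1,2,2,..., and ploadquad masks with ~3 to get 0,0,0,0,1,1,1,1,... A quick standalone check of that arithmetic (illustrative):

  #include <cstdint>
  #include <cstdio>

  int main() {
    for (std::uint32_t i = 0; i < 8; i++) {
      const std::uint32_t dup_byte = (i & 0xfffffffeu) << 1;  // ploaddup byte offset
      const std::uint32_t quad_byte = i & 0xfffffffcu;        // ploadquad byte offset
      std::printf("lane %u: dup element %u, quad element %u\n", i,
                  dup_byte / 4u, quad_byte / 4u);  // /4: bytes -> f32 index
    }
    return 0;  // prints dup 0,0,1,1,2,2,3,3 and quad 0,0,0,0,1,1,1,1
  }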
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) {
-  PacketMul2Xf tmp = print(a);
-  // If greater, subtract one.
-  PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) {
-  PacketMul2Xu idx =
-      __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size),
-                             unpacket_traits<PacketMul2Xf>::size - 1, unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) {
-  return pfrexp_generic(a, exponent);
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) {
-  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1(
-      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketMul2Xf>::size / 2), unpacket_traits<PacketMul2Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) {
-  return predux_mul<PacketMul1Xf>(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
-                                                         unpacket_traits<PacketMul1Xf>::size));
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) {
-  return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1(
-                        a,
-                        __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(),
-                                               unpacket_traits<PacketMul2Xf>::size / 2),
-                        unpacket_traits<PacketMul2Xf>::size)),
-                    (std::numeric_limits<float>::max)());
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) {
-  return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1(
-                        a,
-                        __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(),
-                                               unpacket_traits<PacketMul2Xf>::size / 2),
-                        unpacket_traits<PacketMul2Xf>::size)),
-                    -(std::numeric_limits<float>::max)());
-}
-
-template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xf, N>& kernel) {
-  float buffer[unpacket_traits<PacketMul2Xf>::size * N];
-  int i = 0;
-
-  for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketMul2Xf>::size);
-  }
-
-  for (i = 0; i < N; i++) {
-    kernel.packet[i] =
-        __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits<PacketMul2Xf>::size], unpacket_traits<PacketMul2Xf>::size);
-  }
-}
-
-template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) {
-  return pldexp_generic(a, exponent);
-}
-
-template <typename Packet = PacketMul4Xf>
-EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul4Xf>::value && (unpacket_traits<PacketMul4Xf>::size % 8) == 0,
-                        PacketMul2Xf>::type
-predux_half_dowto4(const PacketMul4Xf& a) {
-  return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1),
-                                unpacket_traits<PacketMul2Xf>::size);
-}
-
-template <typename Packet = PacketMul2Xf>
-EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul2Xf>::value && (unpacket_traits<PacketMul2Xf>::size % 8) == 0,
-                        PacketMul1Xf>::type
-predux_half_dowto4(const PacketMul2Xf& a) {
-  return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1),
-                                unpacket_traits<PacketMul1Xf>::size);
-}
-
-/********************************* int64 **************************************/
-
-typedef eigen_packet_wrapper PacketMul1Xl;
-typedef eigen_packet_wrapper PacketMul1Xul;
-
-typedef eigen_packet_wrapper
-    PacketMul2Xl;
-typedef eigen_packet_wrapper
-    PacketMul2Xul;
-
-typedef eigen_packet_wrapper
-    PacketMul4Xl;
-typedef eigen_packet_wrapper
-    PacketMul4Xul;
-
-#if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xl PacketXl;
-typedef PacketMul1Xul PacketXul;
-
-template <>
-struct packet_traits<numext::int64_t> : default_packet_traits {
-  typedef PacketMul1Xl type;
-  typedef PacketMul1Xl half;  // Half not implemented yet
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 1>::size,
-
-    HasAdd = 1,
-    HasSub = 1,
HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0 - }; -}; - -#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 -typedef PacketMul2Xl PacketXl; -typedef PacketMul2Xul PacketXul; - -template <> -struct packet_traits : default_packet_traits { - typedef PacketMul2Xl type; - typedef PacketMul1Xl half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0 - }; -}; - -#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 -typedef PacketMul4Xl PacketXl; -typedef PacketMul4Xul PacketXul; - -template <> -struct packet_traits : default_packet_traits { - typedef PacketMul4Xl type; - typedef PacketMul2Xl half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, - - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasReduxp = 0 - }; -}; -#endif - -template <> -struct unpacket_traits { - typedef numext::int64_t type; - typedef PacketMul1Xl half; // Half not yet implemented - typedef numext::uint8_t mask_t; - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits { - typedef numext::int64_t type; - typedef PacketMul1Xl half; - typedef numext::uint8_t mask_t; - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits { - typedef numext::int64_t type; - typedef PacketMul2Xl half; - typedef numext::uint8_t mask_t; - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC - __builtin_prefetch(addr); -#endif -} - -/********************************* PacketMul1Xl ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pset1(const numext::int64_t& from) { - return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl plset(const numext::int64_t& a) { - PacketMul1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)); - return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pzero(const PacketMul1Xl& /*a*/) { - return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl padd(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl psub(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pnegate(const PacketMul1Xl& 
a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pconj(const PacketMul1Xl& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pmul(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pdiv(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pnmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { - return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pnmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { - return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pmin(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pmax(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pcmp_le(const PacketMul1Xl& a, const PacketMul1Xl& b) { - PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pcmp_lt(const PacketMul1Xl& a, const PacketMul1Xl& b) { - PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pcmp_eq(const PacketMul1Xl& a, const PacketMul1Xl& b) { - PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl ptrue(const PacketMul1Xl& /*a*/) { - return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pand(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl por(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pxor(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pandnot(const PacketMul1Xl& a, const PacketMul1Xl& b) { - return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xl parithmetic_shift_right(PacketMul1Xl a) { - return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_right(PacketMul1Xl a) { - return __riscv_vreinterpret_i64m1( - 
__riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_left(PacketMul1Xl a) { - return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pload(const numext::int64_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl ploadu(const numext::int64_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl ploaddup(const numext::int64_t* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... - return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl ploadquad(const numext::int64_t* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - ; - return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul1Xl& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul1Xl& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul1Xl pgather(const numext::int64_t* from, Index stride) { - return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul1Xl& from, - Index stride) { - __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul1Xl& a) { - return __riscv_vmv_x_s_i64m1_i64(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl preverse(const PacketMul1Xl& a) { - PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xl pabs(const PacketMul1Xl& a) { - PacketMul1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul1Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul1Xl& a) { - // Multiply the vector by its reverse - PacketMul1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size); - PacketMul1Xl half_prod; - - if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 
512) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); - prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); - } - - // The reduction is done to the first element. - return pfirst(prod); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul1Xl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul1Xl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int64_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -/********************************* PacketMul4Xl ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { - return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { - PacketMul4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); - return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { - return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { - return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) 
{ - return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { - PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { - PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { - PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { - return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { - return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { - return __riscv_vreinterpret_i64m4( - __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { - return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { - PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
- return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { - PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, - Index stride) { - return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, - Index stride) { - __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { - return __riscv_vmv_x_s_i64m4_i64(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { - PacketMul4Xul idx = - __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { - PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { - PacketMul1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), - unpacket_traits::size); - PacketMul1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), - unpacket_traits::size); - return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int64_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - 
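predux_mul above (like its float counterpart earlier) reduces an LMUL=4 register group by first splitting it into four LMUL=1 parts with vget and multiplying them pairwise, leaving only an m1-sized reduction. A scalar model of that shape, where n plays the role of unpacket_traits<PacketMul1Xl>::size (standalone sketch; names and the buffer bound are illustrative):

  #include <cstdint>

  std::int64_t predux_mul_m1_model(const std::int64_t* v, int n) {
    std::int64_t p = 1;
    for (int i = 0; i < n; i++) p *= v[i];  // models the m1 predux_mul
    return p;
  }

  std::int64_t predux_mul_m4_model(const std::int64_t* v, int n) {
    // v holds 4 * n lanes; fold the four quarters lane-wise (the two
    // __riscv_vmul_vv_i64m1 calls), then reduce the surviving m1 vector.
    std::int64_t folded[256];  // assumes n <= 256 for the sketch
    for (int i = 0; i < n; i++)
      folded[i] = (v[i] * v[n + i]) * (v[2 * n + i] * v[3 * n + i]);
    return predux_mul_m1_model(folded, n);
  }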
-/********************************* PacketMul2Xl ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { - return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { - PacketMul2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); - return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { - return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { - return 
__riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { - return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { - return __riscv_vreinterpret_i64m2( - __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { - return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
- return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, - Index stride) { - return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, - Index stride) { - __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { - return __riscv_vmv_x_s_i64m2_i64(a); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { - PacketMul2Xul idx = - __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { - PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { - return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int64_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xl>::type -predux_half_dowto4(const PacketMul4Xl& a) { - return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), 
__riscv_vget_v_i64m4_i64m2(a, 1), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xl>::type -predux_half_dowto4(const PacketMul2Xl& a) { - return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), - unpacket_traits::size); -} - -/********************************* double ************************************/ +/********************************* float32 ************************************/ -typedef eigen_packet_wrapper PacketMul1Xd; -typedef eigen_packet_wrapper - PacketMul2Xd; -typedef eigen_packet_wrapper - PacketMul4Xd; +typedef eigen_packet_wrapper PacketMul1Xf; +typedef eigen_packet_wrapper + PacketMul2Xf; +typedef eigen_packet_wrapper + PacketMul4Xf; #if EIGEN_RISCV64_DEFAULT_LMUL == 1 -typedef PacketMul1Xd PacketXd; +typedef PacketMul1Xf PacketXf; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul1Xd type; - typedef PacketMul1Xd half; +struct packet_traits : default_packet_traits { + typedef PacketMul1Xf type; + typedef PacketMul1Xf half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -3176,24 +509,28 @@ struct packet_traits : default_packet_traits { HasDiv = 1, HasRound = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH }; }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 2 -typedef PacketMul2Xd PacketXd; +typedef PacketMul2Xf PacketXf; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul2Xd type; - typedef PacketMul1Xd half; +struct packet_traits : default_packet_traits { + typedef PacketMul2Xf type; + typedef PacketMul1Xf half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -3214,24 +551,28 @@ struct packet_traits : default_packet_traits { HasDiv = 1, HasRound = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH }; }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 4 -typedef PacketMul4Xd PacketXd; +typedef PacketMul4Xf PacketXf; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul4Xd type; - typedef PacketMul2Xd half; +struct packet_traits : default_packet_traits { + typedef PacketMul4Xf type; + typedef PacketMul2Xf half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -3252,1132 +593,866 @@ struct packet_traits : default_packet_traits { HasDiv = 1, HasRound = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, HasLog = 1, HasExp = 1, - HasSqrt = 1 + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH }; }; #endif template <> -struct unpacket_traits { - typedef double type; - typedef PacketMul1Xd half; // Half not yet implemented - typedef PacketMul1Xl integer_packet; - typedef numext::uint8_t mask_t; - typedef PacketMask64 packet_mask; - - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits { - typedef double type; - typedef 
PacketMul1Xd half; - typedef PacketMul2Xl integer_packet; +struct unpacket_traits { + typedef float type; + typedef PacketMul1Xf half; // Half not yet implemented + typedef PacketMul1Xi integer_packet; typedef numext::uint8_t mask_t; typedef PacketMask32 packet_mask; enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; - -template <> -struct unpacket_traits { - typedef double type; - typedef PacketMul2Xd half; - typedef PacketMul4Xl integer_packet; - typedef numext::uint8_t mask_t; - typedef PacketMask16 packet_mask; - - enum { - size = rvv_packet_size_selector::size, - alignment = rvv_packet_alignment_selector::alignment, + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, masked_store_available = false }; -}; - -/********************************* PacketMul1Xd ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd ptrue(const PacketMul1Xd& /*a*/) { - return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pzero(const PacketMul1Xd& /*a*/) { - return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pabs(const PacketMul1Xd& a) { - return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pset1(const double& from) { - return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pset1frombits(numext::uint64_t from) { - return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd plset(const double& a) { - PacketMul1Xd idx = __riscv_vfcvt_f_x_v_f64m1( - __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), - unpacket_traits::size); - return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd padd(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd psub(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pnegate(const PacketMul1Xd& a) { - return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pconj(const PacketMul1Xd& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmul(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pdiv(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { - return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { - return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pnmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, 
const PacketMul1Xd& c) { - return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pnmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { - return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); - PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); - - return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return pmin(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); - PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); - - return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return pmax(a, b); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pcmp_le(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pcmp_eq(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); -} +}; template <> -EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt_or_nan(const PacketMul1Xd& a, const PacketMul1Xd& b) { - PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); -} +struct unpacket_traits { + typedef float type; + typedef PacketMul1Xf half; + typedef PacketMul2Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; -// Logical Operations are not supported for double, so reinterpret casts -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pand(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); -} + enum { + size = 
rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE PacketMul1Xd por(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); -} +struct unpacket_traits { + typedef float type; + typedef PacketMul2Xf half; + typedef PacketMul4Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask8 packet_mask; -template <> -EIGEN_STRONG_INLINE PacketMul1Xd pxor(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); -} + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketMul1Xf ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul1Xd pandnot(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), - __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xf ptrue(const PacketMul1Xf& /*a*/) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pzero(const PacketMul1Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pabs(const PacketMul1Xf& a) { + return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploaddup(const double* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploadquad(const double* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - ; - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul1Xd& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf plset(const float& a) { + PacketMul1Xf idx = __riscv_vfcvt_f_x_v_f32m1( + 
__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul1Xd& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf padd(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul1Xd pgather(const double* from, Index stride) { - return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf psub(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul1Xd& from, Index stride) { - __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pnegate(const PacketMul1Xf& a) { + return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double pfirst(const PacketMul1Xd& a) { - return __riscv_vfmv_f_s_f64m1_f64(a); +EIGEN_STRONG_INLINE PacketMul1Xf pconj(const PacketMul1Xf& a) { + return a; } template <> -EIGEN_STRONG_INLINE PacketMul1Xd psqrt(const PacketMul1Xd& a) { - return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pmul(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd print(const PacketMul1Xd& a) { - const PacketMul1Xd limit = pset1(static_cast(1ull << 52)); - const PacketMul1Xd abs_a = pabs(a); - - PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); - const PacketMul1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), - unpacket_traits::size); - - mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); - PacketMul1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pdiv(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pfloor(const PacketMul1Xd& a) { - PacketMul1Xd tmp = print(a); - // If greater, subtract one. 
- PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { + return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd preverse(const PacketMul1Xd& a) { - PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { + return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pfrexp(const PacketMul1Xd& a, PacketMul1Xd& exponent) { - return pfrexp_generic(a, exponent); +EIGEN_STRONG_INLINE PacketMul1Xf pnmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { + return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double predux(const PacketMul1Xd& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xf pnmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) { + return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double predux_mul(const PacketMul1Xd& a) { - // Multiply the vector by its reverse - PacketMul1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size); - PacketMul1Xd half_prod; - - if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); - } +EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - // The reduction is done to the first element. 
- return pfirst(prod); + return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double predux_min(const PacketMul1Xd& a) { - return ( - std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - (std::numeric_limits::max)()); +EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return pmin(a, b); } template <> -EIGEN_STRONG_INLINE double predux_max(const PacketMul1Xd& a) { - return ( - std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), - unpacket_traits::size)), - -(std::numeric_limits::max)()); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double buffer[unpacket_traits::size * N]; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); - } - - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } +EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pldexp(const PacketMul1Xd& a, const PacketMul1Xd& exponent) { - return pldexp_generic(a, exponent); -} +EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); -template <> -EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) { - return __riscv_vmor_mm_b64(a, b, unpacket_traits::size); + return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) { - return __riscv_vmor_mm_b64(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return pmax(a, b); } template <> -EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) { - return __riscv_vmand_mm_b64(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); -} - -EIGEN_STRONG_INLINE PacketMul1Xd pselect(const PacketMask64& mask, const PacketMul1Xd& a, const PacketMul1Xd& b) { - return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits::size); } -/********************************* PacketMul4Xd ************************************/ - template <> -EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) { - return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); 
+EIGEN_STRONG_INLINE PacketMul1Xf pcmp_le(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) { - return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) { - return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pcmp_eq(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) { - return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt_or_nan(const PacketMul1Xf& a, const PacketMul1Xf& b) { + PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits::size); } +// Logical Operations are not supported for float, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) { - return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xf pand(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { - PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4( - __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), - unpacket_traits::size); - return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf por(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pxor(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pandnot(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) { - return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); +EIGEN_STRONG_INLINE 
PacketMul1Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) { - return a; +EIGEN_STRONG_INLINE PacketMul1Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf ploaddup(const float* from) { + PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf ploadquad(const float* from) { + PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { - return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul1Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { - return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul1Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { - return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline PacketMul1Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { - return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul1Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMul4Xd nans = - __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); - PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); - - return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE float pfirst(const PacketMul1Xf& a) { + return __riscv_vfmv_f_s_f32m1_f32(a); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return pmin(a, b); +EIGEN_STRONG_INLINE PacketMul1Xf psqrt(const PacketMul1Xf& a) { + return __riscv_vfsqrt_v_f32m1(a, 
unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); -} +EIGEN_STRONG_INLINE PacketMul1Xf print(const PacketMul1Xf& a) { + const PacketMul1Xf limit = pset1(static_cast(1 << 23)); + const PacketMul1Xf abs_a = pabs(a); -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMul4Xd nans = - __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); - PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); + const PacketMul1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), + unpacket_traits::size); - return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); + mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); + PacketMul1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return pmax(a, b); +EIGEN_STRONG_INLINE PacketMul1Xf pfloor(const PacketMul1Xf& a) { + PacketMul1Xf tmp = print(a); + // If greater, subtract one. + PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf preverse(const PacketMul1Xf& a) { + PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pfrexp(const PacketMul1Xf& a, PacketMul1Xf& exponent) { + return pfrexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE float predux(const PacketMul1Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, - unpacket_traits::size); -} +EIGEN_STRONG_INLINE float predux_mul(const PacketMul1Xf& a) { + // Multiply the vector by its reverse + PacketMul1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), 
a, unpacket_traits::size); + PacketMul1Xf half_prod; -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) { - PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); -} + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); -// Logical Operations are not supported for double, so reinterpret casts -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pand(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); + // The reduction is done to the first element. + return pfirst(prod); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd por(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); +EIGEN_STRONG_INLINE float predux_min(const PacketMul1Xf& a) { + return ( + std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vreinterpret_v_f64m4_u64m4(b), - unpacket_traits::size)); +EIGEN_STRONG_INLINE float predux_max(const PacketMul1Xf& a) { + return ( + std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { - return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( - __riscv_vreinterpret_v_f64m4_u64m4(a), - __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), - unpacket_traits::size)); -} +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); -} + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } -template <> -EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); + for (i = 0; i < N; i++) { + kernel.packet[i] = 
+ __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } } template <> -EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { - PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pldexp(const PacketMul1Xf& a, const PacketMul1Xf& exponent) { + return pldexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { - PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmor_mm_b32(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul4Xd& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmand_mm_b32(a, b, unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul4Xd& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); } -template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather(const double* from, Index stride) { - return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); } -template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul4Xd& from, Index stride) { - __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xf pselect(const PacketMask32& mask, const PacketMul1Xf& a, const PacketMul1Xf& b) { + return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE double pfirst(const PacketMul4Xd& a) { - return __riscv_vfmv_f_s_f64m4_f64(a); -} +/********************************* int64 **************************************/ -template <> -EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { - return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); -} +typedef eigen_packet_wrapper PacketMul1Xl; +typedef eigen_packet_wrapper PacketMul1Xul; -template <> -EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { - const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); - const PacketMul4Xd abs_a = pabs(a); +typedef eigen_packet_wrapper + PacketMul2Xl; +typedef eigen_packet_wrapper + PacketMul2Xul; - PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); - const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( - __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); +typedef eigen_packet_wrapper + PacketMul4Xl; +typedef eigen_packet_wrapper + PacketMul4Xul; - mask = 
__riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); - PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits::size); -} +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef PacketMul1Xl PacketXl; +typedef PacketMul1Xul PacketXul; template <> -EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { - PacketMul4Xd tmp = print(a); - // If greater, subtract one. - PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); -} +struct packet_traits : default_packet_traits { + typedef PacketMul1Xl type; + typedef PacketMul1Xl half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, -template <> -EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { - PacketMul4Xul idx = - __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); -} + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; -template <> -EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, PacketMul4Xd& exponent) { - return pfrexp_generic(a, exponent); -} +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 +typedef PacketMul2Xl PacketXl; +typedef PacketMul2Xul PacketXul; template <> -EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( - a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); -} +struct packet_traits : default_packet_traits { + typedef PacketMul2Xl type; + typedef PacketMul1Xl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, -template <> -EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { - PacketMul1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), - unpacket_traits::size); - PacketMul1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), - unpacket_traits::size); - return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); -} + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; -template <> -EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - (std::numeric_limits::max)()); -} +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 +typedef PacketMul4Xl PacketXl; +typedef PacketMul4Xul PacketXul; template <> -EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 4), - unpacket_traits::size)), - -(std::numeric_limits::max)()); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double buffer[unpacket_traits::size * N]; - int i 
= 0; +struct packet_traits : default_packet_traits { + typedef PacketMul4Xl type; + typedef PacketMul2Xl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, - for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); - } + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; +#endif - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } -} +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul1Xl half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE PacketMul4Xd pldexp(const PacketMul4Xd& a, const PacketMul4Xd& exponent) { - return pldexp_generic(a, exponent); -} - -/********************************* PacketMul2Xd ************************************/ +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul1Xl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { - return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); -} +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul2Xl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { - return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif } -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { - return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); -} +/********************************* PacketMul1Xl ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { - return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { - return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xl plset(const numext::int64_t& a) { + PacketMul1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)); + return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { - PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2( - __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), - 
unpacket_traits::size); - return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pzero(const PacketMul1Xl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd padd(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl padd(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl psub(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { - return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pnegate(const PacketMul1Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { +EIGEN_STRONG_INLINE PacketMul1Xl pconj(const PacketMul1Xl& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pdiv(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pmul(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pdiv(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - - return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pnmadd(const PacketMul1Xl& a, const 
PacketMul1Xl& b, const PacketMul1Xl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return pmin(a, b); +EIGEN_STRONG_INLINE PacketMul1Xl pnmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pmin(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - - return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pmax(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return pmax(a, b); +EIGEN_STRONG_INLINE PacketMul1Xl pcmp_le(const PacketMul1Xl& a, const PacketMul1Xl& b) { + PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pcmp_lt(const PacketMul1Xl& a, const PacketMul1Xl& b) { + PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pcmp_eq(const PacketMul1Xl& a, const PacketMul1Xl& b) { + PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl ptrue(const PacketMul1Xl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pand(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); } 
template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl por(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); } -// Logical Operations are not supported for double, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xl pxor(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xl pandnot(const PacketMul1Xl& a, const PacketMul1Xl& b) { + return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), + unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); +template +EIGEN_STRONG_INLINE PacketMul1Xl parithmetic_shift_right(PacketMul1Xl a) { + return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( - __riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), - unpacket_traits::size)); +template +EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_right(PacketMul1Xl a) { + return __riscv_vreinterpret_i64m1( + __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); } -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +template +EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_left(PacketMul1Xl a) { + return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { - 
PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl ploaddup(const numext::int64_t* from) { + PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl ploadquad(const numext::int64_t* from) { + PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + ; + return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul1Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather(const double* from, Index stride) { - return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul1Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul2Xd& from, Index stride) { - __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline PacketMul1Xl pgather(const numext::int64_t* from, Index stride) { + return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) { - return __riscv_vfmv_f_s_f64m2_f64(a); +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul1Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) { - return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul1Xl& a) { + return __riscv_vmv_x_s_i64m1_i64(a); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) { - const PacketMul2Xd limit = pset1(static_cast(1ull << 52)); - const PacketMul2Xd abs_a = pabs(a); - - PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); - const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( - __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); - - mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); - PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); 
+EIGEN_STRONG_INLINE PacketMul1Xl preverse(const PacketMul1Xl& a) { + PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { - PacketMul2Xd tmp = print(a); - // If greater, subtract one. - PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xl pabs(const PacketMul1Xl& a) { + PacketMul1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { - PacketMul2Xul idx = - __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul1Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) { - return pfrexp_generic(a, exponent); -} +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul1Xl& a) { + // Multiply the vector by its reverse + PacketMul1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size); + PacketMul1Xl half_prod; -template <> -EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { - return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( - a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); -} + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } -template <> -EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { - return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), - unpacket_traits::size)); + // The reduction is done to the first element. 
+ return pfirst(prod); } template <> -EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { - return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - (std::numeric_limits::max)()); +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul1Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { - return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( - a, - __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), - -(std::numeric_limits::max)()); +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul1Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double buffer[unpacket_traits::size * N]; +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } - for (i = 0; i < N; i++) { kernel.packet[i] = - __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } -template <> -EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) { - return pldexp_generic(a, exponent); -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xd>::type -predux_half_dowto4(const PacketMul4Xd& a) { - return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xd>::type -predux_half_dowto4(const PacketMul2Xd& a) { - return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), - unpacket_traits::size); -} - -/********************************* short **************************************/ - -typedef eigen_packet_wrapper PacketMul1Xs; -typedef eigen_packet_wrapper PacketMul1Xsu; - -typedef eigen_packet_wrapper - PacketMul2Xs; -typedef eigen_packet_wrapper - PacketMul2Xsu; +/********************************* double ************************************/ -typedef eigen_packet_wrapper - PacketMul4Xs; -typedef eigen_packet_wrapper - PacketMul4Xsu; +typedef eigen_packet_wrapper PacketMul1Xd; +typedef eigen_packet_wrapper + PacketMul2Xd; +typedef eigen_packet_wrapper + PacketMul4Xd; #if EIGEN_RISCV64_DEFAULT_LMUL == 1 -typedef PacketMul1Xs PacketXs; -typedef PacketMul1Xsu PacketXsu; +typedef PacketMul1Xd PacketXd; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul1Xs type; - typedef PacketMul1Xs half; // Half not implemented yet +struct packet_traits : default_packet_traits { + typedef PacketMul1Xd type; + typedef PacketMul1Xd half; + enum { Vectorizable = 1, AlignedOnScalar = 1, - 
size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -4392,22 +1467,30 @@ struct packet_traits : default_packet_traits { HasConj = 1, HasSetLinear = 0, HasBlend = 0, - HasReduxp = 0 + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 }; }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 2 -typedef PacketMul2Xs PacketXs; -typedef PacketMul2Xsu PacketXsu; +typedef PacketMul2Xd PacketXd; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul2Xs type; - typedef PacketMul1Xs half; +struct packet_traits : default_packet_traits { + typedef PacketMul2Xd type; + typedef PacketMul1Xd half; + enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -4422,22 +1505,30 @@ struct packet_traits : default_packet_traits { HasConj = 1, HasSetLinear = 0, HasBlend = 0, - HasReduxp = 0 + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 }; }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 4 -typedef PacketMul4Xs PacketXs; -typedef PacketMul4Xsu PacketXsu; +typedef PacketMul4Xd PacketXd; template <> -struct packet_traits : default_packet_traits { - typedef PacketMul4Xs type; - typedef PacketMul2Xs half; +struct packet_traits : default_packet_traits { + typedef PacketMul4Xd type; + typedef PacketMul2Xd half; + enum { Vectorizable = 1, AlignedOnScalar = 1, - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, HasAdd = 1, HasSub = 1, @@ -4452,18 +1543,29 @@ struct packet_traits : default_packet_traits { HasConj = 1, HasSetLinear = 0, HasBlend = 0, - HasReduxp = 0 + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 }; }; #endif template <> -struct unpacket_traits { - typedef numext::int16_t type; - typedef PacketMul1Xs half; // Half not yet implemented +struct unpacket_traits { + typedef double type; + typedef PacketMul1Xd half; // Half not yet implemented + typedef PacketMul1Xl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask64 packet_mask; + enum { - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, @@ -4472,12 +1574,15 @@ struct unpacket_traits { }; template <> -struct unpacket_traits { - typedef numext::int16_t type; - typedef PacketMul1Xs half; +struct unpacket_traits { + typedef double type; + typedef PacketMul1Xd half; + typedef PacketMul2Xl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; + enum { - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, @@ -4486,12 +1591,15 @@ struct unpacket_traits { }; template <> -struct unpacket_traits { - typedef numext::int16_t type; - typedef PacketMul2Xs half; +struct unpacket_traits { + typedef double type; + typedef PacketMul2Xd half; + typedef PacketMul4Xl integer_packet; typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; + enum { - size = rvv_packet_size_selector::size, + size = rvv_packet_size_selector::size, alignment = rvv_packet_alignment_selector::alignment, vectorizable = true, masked_load_available = false, @@ -4499,794 +1607,796 @@ struct unpacket_traits { }; }; -template <> 
-EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC - __builtin_prefetch(addr); -#endif -} - -/********************************* PacketMul1Xs ************************************/ - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pset1(const numext::int16_t& from) { - return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs plset(const numext::int16_t& a) { - PacketMul1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)); - return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pzero(const PacketMul1Xs& /*a*/) { - return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs padd(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs psub(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vsub(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pnegate(const PacketMul1Xs& a) { - return __riscv_vneg(a, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pconj(const PacketMul1Xs& a) { - return a; -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pmul(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vmul(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pdiv(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pnmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { - return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pnmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { - return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pmin(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vmin(a, b, unpacket_traits::size); -} - -template <> -EIGEN_STRONG_INLINE PacketMul1Xs pmax(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vmax(a, b, unpacket_traits::size); -} +/********************************* PacketMul1Xd ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul1Xs pcmp_le(const PacketMul1Xs& a, const PacketMul1Xs& b) { - PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd ptrue(const PacketMul1Xd& /*a*/) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pcmp_lt(const PacketMul1Xs& a, const PacketMul1Xs& b) { - PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, 
unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pzero(const PacketMul1Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pcmp_eq(const PacketMul1Xs& a, const PacketMul1Xs& b) { - PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pabs(const PacketMul1Xd& a) { + return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs ptrue(const PacketMul1Xs& /*a*/) { - return __riscv_vmv_v_x_i16m1(static_cast(0xffffu), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pand(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs por(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vor_vv_i16m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd plset(const double& a) { + PacketMul1Xd idx = __riscv_vfcvt_f_x_v_f64m1( + __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pxor(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd padd(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pandnot(const PacketMul1Xs& a, const PacketMul1Xs& b) { - return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xs parithmetic_shift_right(PacketMul1Xs a) { - return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_right(PacketMul1Xs a) { - return __riscv_vreinterpret_i16m1( - __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); -} - -template -EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_left(PacketMul1Xs a) { - return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd psub(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pload(const numext::int16_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pnegate(const PacketMul1Xd& a) { + return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs ploadu(const numext::int16_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pconj(const PacketMul1Xd& a) { + return a; } template <> -EIGEN_STRONG_INLINE PacketMul1Xs ploaddup(const numext::int16_t* from) { - PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); - // 
idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... - return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pmul(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs ploadquad(const numext::int16_t* from) { - PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pdiv(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul1Xs& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { + return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul1Xs& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { + return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul1Xs pgather(const numext::int16_t* from, Index stride) { - return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pnmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { + return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul1Xs& from, - Index stride) { - __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pnmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) { + return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul1Xs& a) { - return __riscv_vmv_x_s_i16m1_i16(a); -} +EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); -template <> -EIGEN_STRONG_INLINE PacketMul1Xs preverse(const PacketMul1Xs& a) { - PacketMul1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); + return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xs pabs(const PacketMul1Xs& a) { - PacketMul1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return 
pmin(a, b); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul1Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul1Xs& a) { - // Multiply the vector by its reverse - PacketMul1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); - PacketMul1Xs half_prod; - - if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); - prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); - prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); - } - if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); - prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); - } - // Last reduction - half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); - prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); - - half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); - prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); - // The reduction is done to the first element. 
- return pfirst(prod); + return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul1Xs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return pmax(a, b); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul1Xs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), - unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); } -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int16_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } +template <> +EIGEN_STRONG_INLINE PacketMul1Xd pcmp_le(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } -/********************************* PacketMul4Xs ************************************/ - template <> -EIGEN_STRONG_INLINE PacketMul4Xs pset1(const numext::int16_t& from) { - return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs plset(const numext::int16_t& a) { - PacketMul4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits::size)); - return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pcmp_eq(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) { - return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt_or_nan(const PacketMul1Xd& a, const PacketMul1Xd& b) { + PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); } +// Logical Operations are not supported for double, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul4Xs padd(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pand(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs psub(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vsub(a, b, 
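+// The helpers in this block only re-type the bits, they never modify them. A
+// scalar model of pandnot(a, b), with std::memcpy standing in for the
+// vreinterpret intrinsics (illustrative only, not part of the patch):
+//   uint64_t ua, ub;
+//   std::memcpy(&ua, &a, sizeof(ua));
+//   std::memcpy(&ub, &b, sizeof(ub));
+//   uint64_t ur = ua & ~ub;  // pand uses &, por uses |, pxor uses ^
+//   double r;
+//   std::memcpy(&r, &ur, sizeof(r));
+// The vector vreinterpret calls perform the same re-typing on whole f64m1
+// registers, so each helper is a single integer and/or/xor on the u64m1 view.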
unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd por(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) { - return __riscv_vneg(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pxor(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) { - return a; +EIGEN_STRONG_INLINE PacketMul1Xd pandnot(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pmul(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vmul(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pdiv(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd ploaddup(const double* from) { + PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd ploadquad(const double* from) { + PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + ; + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { - return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul1Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { - return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul1Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pmin(const PacketMul4Xs& a, const PacketMul4Xs& b) { - return __riscv_vmin(a, b, 
unpacket_traits<PacketMul4Xs>::size);
+EIGEN_DEVICE_FUNC inline PacketMul1Xd pgather<double, PacketMul1Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmax(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_DEVICE_FUNC inline void pscatter<double, PacketMul1Xd>(double* to, const PacketMul1Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE double pfirst(const PacketMul1Xd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd psqrt(const PacketMul1Xd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd print(const PacketMul1Xd& a) {
+  const PacketMul1Xd limit = pset1<PacketMul1Xd>(static_cast<double>(1ull << 52));
+  const PacketMul1Xd abs_a = pabs(a);
+
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
+  const PacketMul1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<PacketMul1Xd>::size);
+  const PacketMul1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(
+      __riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<PacketMul1Xd>::size), unpacket_traits<PacketMul1Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<PacketMul1Xd>::size);
+  PacketMul1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs ptrue(const PacketMul4Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m4(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pfloor(const PacketMul1Xd& a) {
+  PacketMul1Xd tmp = print(a);
+  // If greater, subtract one.
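+  // print() rounds to nearest-even, so tmp can land one unit above a when
+  // the fractional part is >= 0.5; the masked vfsub below pulls exactly
+  // those lanes back down. A scalar model of the whole function:
+  //   t = rint(a); if (a < t) t -= 1.0;  // == floor(a)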
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pand(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vand_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd preverse(const PacketMul1Xd& a) {
+  PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size),
+                                             unpacket_traits<PacketMul1Xd>::size - 1, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<PacketMul1Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs por(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE PacketMul1Xd pfrexp(const PacketMul1Xd& a, PacketMul1Xd& exponent) {
+  return pfrexp_generic(a, exponent);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vxor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE double predux(const PacketMul1Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul1Xd>::size), unpacket_traits<PacketMul1Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits<PacketMul4Xs>::size),
-                               unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE double predux_mul(const PacketMul1Xd& a) {
+  // Multiply the vector by its reverse
+  PacketMul1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<PacketMul1Xd>::size);
+  PacketMul1Xd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<PacketMul1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<PacketMul1Xd>::size);
+  }
+
+  // The reduction is done to the first element.
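+  // After the multiply by the reverse, prod[i] = a[i] * a[n-1-i], so all the
+  // distinct partial products sit in the low half of the register; each
+  // VL-gated slidedown-and-multiply halves that live prefix again (offsets
+  // 4/2/1 are enough for the 16 doubles an m1 register holds at VLEN=1024)
+  // until lane 0 carries the full product.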
+ return pfirst(prod); } -template -EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { - return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul1Xd& a) { + return ( + std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + (std::numeric_limits::max)()); } -template -EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { - return __riscv_vreinterpret_i16m4( - __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul1Xd& a) { + return ( + std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size), + unpacket_traits::size)), + -(std::numeric_limits::max)()); } template -EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) { - return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } } template <> -EIGEN_STRONG_INLINE PacketMul4Xs pload(const numext::int16_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xd pldexp(const PacketMul1Xd& a, const PacketMul1Xd& exponent) { + return pldexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs ploadu(const numext::int16_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmor_mm_b64(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* from) { - PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); - idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); - // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... 
- return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmor_mm_b64(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { - PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) { + return __riscv_vmand_mm_b64(a, b, unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul4Xs& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMul1Xd pselect(const PacketMask64& mask, const PacketMul1Xd& a, const PacketMul1Xd& b) { + return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); } -template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul4Xs& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); -} +/********************************* short **************************************/ + +typedef eigen_packet_wrapper PacketMul1Xs; +typedef eigen_packet_wrapper PacketMul1Xsu; + +typedef eigen_packet_wrapper + PacketMul2Xs; +typedef eigen_packet_wrapper + PacketMul2Xsu; + +typedef eigen_packet_wrapper + PacketMul4Xs; +typedef eigen_packet_wrapper + PacketMul4Xsu; + +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef PacketMul1Xs PacketXs; +typedef PacketMul1Xsu PacketXsu; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul1Xs type; + typedef PacketMul1Xs half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 +typedef PacketMul2Xs PacketXs; +typedef PacketMul2Xsu PacketXsu; template <> -EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, - Index stride) { - return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); -} +struct packet_traits : default_packet_traits { + typedef PacketMul2Xs type; + typedef PacketMul1Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, -template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, - Index stride) { - __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); -} + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; -template <> -EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) { - 
return __riscv_vmv_x_s_i16m4_i16(a); -} +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 +typedef PacketMul4Xs PacketXs; +typedef PacketMul4Xsu PacketXsu; template <> -EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) { - PacketMul4Xsu idx = - __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); -} +struct packet_traits : default_packet_traits { + typedef PacketMul4Xs type; + typedef PacketMul2Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, -template <> -EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { - PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); -} + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; +#endif template <> -EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( - a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); -} +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul1Xs half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { - PacketMul1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), - unpacket_traits::size); - PacketMul1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), - unpacket_traits::size); - return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); -} +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul1Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul2Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; template <> -EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul4Xs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), - unpacket_traits::size)); -} - -template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int16_t buffer[unpacket_traits::size * N] = {0}; - int i = 0; - - for (i = 0; i < N; i++) { - __riscv_vsse16(&buffer[i], N * 
sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); - } - for (i = 0; i < N; i++) { - kernel.packet[i] = - __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); - } +EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif } -/********************************* PacketMul2Xs ************************************/ +/********************************* PacketMul1Xs ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul2Xs pset1(const numext::int16_t& from) { - return __riscv_vmv_v_x_i16m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs plset(const numext::int16_t& a) { - PacketMul2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits::size)); - return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs plset(const numext::int16_t& a) { + PacketMul1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)); + return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pzero(const PacketMul2Xs& /*a*/) { - return __riscv_vmv_v_x_i16m2(0, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pzero(const PacketMul1Xs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs padd(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vadd_vv_i16m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs padd(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs psub(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vsub(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs psub(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) { - return __riscv_vneg(a, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pnegate(const PacketMul1Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) { +EIGEN_STRONG_INLINE PacketMul1Xs pconj(const PacketMul1Xs& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pmul(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vmul(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pmul(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pdiv(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pdiv(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs 
pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { - return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pnmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { - return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pnmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pmin(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vmin(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pmin(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pmax(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vmax(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pmax(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le(const PacketMul2Xs& a, const PacketMul2Xs& b) { - PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pcmp_le(const PacketMul1Xs& a, const PacketMul1Xs& b) { + PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt(const PacketMul2Xs& a, const PacketMul2Xs& b) { - PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pcmp_lt(const PacketMul1Xs& a, const PacketMul1Xs& b) { + PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq(const PacketMul2Xs& a, const PacketMul2Xs& b) { - PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pcmp_eq(const PacketMul1Xs& a, const PacketMul1Xs& b) { + PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs ptrue(const PacketMul2Xs& /*a*/) { - return __riscv_vmv_v_x_i16m2(static_cast(0xffffu), unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs ptrue(const PacketMul1Xs& /*a*/) { + return 
__riscv_vmv_v_x_i16m1(static_cast(0xffffu), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pand(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vand_vv_i16m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pand(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs por(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vor_vv_i16m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs por(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vor_vv_i16m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vxor_vv_i16m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pxor(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) { - return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pandnot(const PacketMul1Xs& a, const PacketMul1Xs& b) { + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) { - return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs parithmetic_shift_right(PacketMul1Xs a) { + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) { - return __riscv_vreinterpret_i16m2( - __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); +EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_right(PacketMul1Xs a) { + return __riscv_vreinterpret_i16m1( + __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); } template -EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) { - return __riscv_vsll_vx_i16m2(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_left(PacketMul1Xs a) { + return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pload(const numext::int16_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs ploadu(const numext::int16_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs ploaddup(const numext::int16_t* from) { - PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs ploaddup(const numext::int16_t* from) { + PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... 
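+  // vloxei16 treats idx as byte offsets: vid yields lane ids 0,1,2,3,..., and
+  // clearing bit 0 maps them to 0,0,2,2,4,4,..., so every 2-byte element is
+  // fetched into two adjacent lanes (scalar model: out[i] = from[i / 2]);
+  // ploadquad below masks two bits and shifts so that out[i] = from[i / 4].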
- return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { - PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); - idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs ploadquad(const numext::int16_t* from) { + PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul1Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul1Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, - Index stride) { - return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +EIGEN_DEVICE_FUNC inline PacketMul1Xs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, - Index stride) { - __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul1Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) { - return __riscv_vmv_x_s_i16m2_i16(a); +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul1Xs& a) { + return __riscv_vmv_x_s_i16m1_i16(a); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { - PacketMul2Xsu idx = - __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs preverse(const PacketMul1Xs& a) { + PacketMul1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { - PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); - return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE PacketMul1Xs pabs(const PacketMul1Xs& a) { + PacketMul1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); + return 
__riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul1Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { - return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), - unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul1Xs& a) { + // Multiply the vector by its reverse + PacketMul1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); + PacketMul1Xs half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. 
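+  // Same reverse-multiply scheme as the f64 predux_mul, but a 16-bit m1
+  // register holds four times as many lanes (64 at VLEN=1024), hence the
+  // extra VL-gated slidedown offsets 16/8/4 before the two unconditional
+  // final steps with offsets 2 and 1.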
+ return pfirst(prod); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul1Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { - return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( - a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 2), - unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul1Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); } template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int16_t buffer[unpacket_traits::size * N] = {0}; +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = - __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xs>::type -predux_half_dowto4(const PacketMul4Xs& a) { - return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1), - unpacket_traits::size); -} - -template -EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xs>::type -predux_half_dowto4(const PacketMul2Xs& a) { - return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), - unpacket_traits::size); -} - } // namespace internal } // namespace Eigen +#include "PacketMath4.h" +#include "PacketMath2.h" + #endif // EIGEN_PACKET_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h new file mode 100644 index 000000000..4e262cfe8 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h @@ -0,0 +1,1506 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
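+// Judging from the definitions below and the includes appended to
+// PacketMath.h above, this new header collects the LMUL=2 register-group
+// variants (PacketMul2Xi, PacketMul2Xf, ...) that were split out of the main
+// PacketMath.h, mirroring the m1 code with *_m2 intrinsics and b16 masks.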
+ +#ifndef EIGEN_PACKET2_MATH_RVV10_H +#define EIGEN_PACKET2_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* PacketMul2Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi plset(const numext::int32_t& a) { + PacketMul2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)); + return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pzero(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi padd(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi psub(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmul(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pdiv(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmin(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmax(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 
0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ptrue(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pand(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi por(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { + return __riscv_vreinterpret_i32m2( + __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) { + return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploaddup(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
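+  // With 4-byte lanes the duplicated byte offsets are (i & ~1) << 1, i.e.
+  // 0,0,4,4,8,8,...; ploadquad below wants from[i / 4], and (i & ~3) is
+  // already that element's byte offset, which is why it only masks and
+  // needs no shift.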
+  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi ploadquad<PacketMul2Xi>(const numext::int32_t* from) {
+  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size);
+  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<PacketMul2Xi>::size);
+  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul2Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul2Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather<numext::int32_t, PacketMul2Xi>(const numext::int32_t* from,
+                                                                             Index stride) {
+  return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from,
+                                       Index stride) {
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketMul2Xi>(const PacketMul2Xi& a) {
+  return __riscv_vmv_x_s_i32m2_i32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) {
+  PacketMul2Xu idx =
+      __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size),
+                             unpacket_traits<PacketMul2Xi>::size - 1, unpacket_traits<PacketMul2Xi>::size);
+  return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) {
+  PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits<PacketMul2Xi>::size);
+  return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits<PacketMul2Xi>::size), mask,
+                               unpacket_traits<PacketMul2Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketMul2Xi>(const PacketMul2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(
+      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul2Xi>::size / 2), unpacket_traits<PacketMul2Xi>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul2Xi>(const PacketMul2Xi& a) {
+  return predux_mul<PacketMul1Xi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0),
+                                                        __riscv_vget_v_i32m2_i32m1(a, 1),
+                                                        unpacket_traits<PacketMul1Xi>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketMul2Xi>(const PacketMul2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul2Xi>::size / 2),
+      unpacket_traits<PacketMul2Xi>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketMul2Xi>(const PacketMul2Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul2Xi>::size / 2),
+      unpacket_traits<PacketMul2Xi>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<PacketMul2Xi>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul2Xi>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits<PacketMul2Xi>::size], unpacket_traits<PacketMul2Xi>::size);
+  }
+}
+
+template <typename Packet = PacketMul4Xi>
+EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<Packet, PacketMul4Xi>::value && (unpacket_traits<PacketMul4Xi>::size % 8) == 0,
+                            PacketMul2Xi>::type
+    predux_half_dowto4(const PacketMul4Xi& a) {
+  return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1),
+                               unpacket_traits<PacketMul2Xi>::size);
+}
+
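+// Illustrative sketch (not part of the kernels): predux_half_dowto4 halves
+// the register group by adding its low and high sub-registers lanewise.
+// Assuming VLEN = 256 bits, an i32 m2 packet holds 16 lanes:
+//   PacketMul2Xi v = pset1<PacketMul2Xi>(1);      // 16 lanes, each 1
+//   PacketMul1Xi h = predux_half_dowto4(v);       // 8 lanes, each 1 + 1 = 2
+//   numext::int32_t s = predux<PacketMul1Xi>(h);  // 16, same as predux(v)
+// predux<PacketMul1Xi> is assumed to come from the m1 kernels in PacketMath.h.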
+template <typename Packet = PacketMul2Xi>
+EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<Packet, PacketMul2Xi>::value && (unpacket_traits<PacketMul2Xi>::size % 8) == 0,
+                            PacketMul1Xi>::type
+    predux_half_dowto4(const PacketMul2Xi& a) {
+  return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
+                               unpacket_traits<PacketMul1Xi>::size);
+}
+
+/********************************* PacketMul2Xf ************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf ptrue<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pzero<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) {
+  return __riscv_vfabs_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pset1<PacketMul2Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits<PacketMul2Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf plset<PacketMul2Xf>(const float& a) {
+  PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(
+      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size)),
+      unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf padd<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf psub<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) {
+  return __riscv_vfneg_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmul<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pdiv<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
+  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+}
+
+// The default pmin/pmax propagate NaN: the merged mask marks lanes where both
+// inputs are numbers, and the _tumu form leaves the NaN fill value in place
+// everywhere else. The PropagateNumbers variants map directly onto vfmin and
+// vfmax, which return the non-NaN operand.
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMul2Xf nans =
+      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
+
+  return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return pmin<PacketMul2Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmin<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMul2Xf nans =
+      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
+
+  return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNaN, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return pmax<PacketMul2Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pmax<PropagateNumbers, PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits<PacketMul2Xf>::size);
+  return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul2Xf>::size);
+}
+
+// Logical operations are not supported for float, so reinterpret to u32,
+// operate there, and cast the result back.
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pand<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
+                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
+                                                                  unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf por<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
+                                                                 __riscv_vreinterpret_v_f32m2_u32m2(b),
+                                                                 unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pxor<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a),
+                                                                  __riscv_vreinterpret_v_f32m2_u32m2(b),
+                                                                  unpacket_traits<PacketMul2Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xf pandnot<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
+  return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(
+      __riscv_vreinterpret_v_f32m2_u32m2(a),
+      __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits<PacketMul2Xf>::size),
unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul2Xf& a) { + return __riscv_vfmv_f_s_f32m2_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { + const PacketMul2Xf limit = pset1(static_cast(1 << 23)); + const PacketMul2Xf abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { + PacketMul2Xf tmp = print(a); + // If greater, subtract one. 
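+  // print() rounds to nearest, so it may round *up* past a (e.g. a = 2.7f
+  // gives print(a) = 3.0f); the masked subtract below corrects those lanes
+  // back down to 2.0f, while lanes already at or below a pass through
+  // unchanged (the _tumu form keeps inactive lanes from tmp).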
+ PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xf>::type +predux_half_dowto4(const PacketMul4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul1Xf>::type +predux_half_dowto4(const PacketMul2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size); +} + +/********************************* PacketMul2Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { + PacketMul2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return 
__riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + 
unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { + return __riscv_vreinterpret_i64m2( + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { + return __riscv_vmv_x_s_i64m2_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { + PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { + return 
predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xl>::type +predux_half_dowto4(const PacketMul4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul1Xl>::type +predux_half_dowto4(const PacketMul2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size); +} + +/********************************* PacketMul2Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd padd(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return 
__riscv_vfmul_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pdiv<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
+  return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
+  return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
+  return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) {
+  return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmin<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  PacketMul2Xd nans =
+      __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits<PacketMul2Xd>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul2Xd>::size);
+
+  return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmin<PropagateNaN, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  return pmin<PacketMul2Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmin<PropagateNumbers, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmax<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  PacketMul2Xd nans =
+      __riscv_vfmv_v_f_f64m2((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits<PacketMul2Xd>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul2Xd>::size);
+
+  return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmax<PropagateNaN, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  return pmax<PacketMul2Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pmax<PropagateNumbers, PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq<PacketMul2Xd>(const PacketMul2Xd& a, const PacketMul2Xd& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits<PacketMul2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul2Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan<PacketMul2Xd>(const PacketMul2Xd& a, const
PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul2Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) { + return __riscv_vfmv_f_s_f64m2_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) { + return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) { + const PacketMul2Xd limit = 
pset1(static_cast(1ull << 52)); + const PacketMul2Xd abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); + const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); + PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { + PacketMul2Xd tmp = print(a); + // If greater, subtract one. + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xd>::type +predux_half_dowto4(const PacketMul4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul1Xd>::type +predux_half_dowto4(const PacketMul2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + 
unpacket_traits<PacketMul1Xd>::size);
+}
+
+/********************************* PacketMul2Xs ************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pset1<PacketMul2Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs plset<PacketMul2Xs>(const numext::int16_t& a) {
+  PacketMul2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size));
+  return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pzero<PacketMul2Xs>(const PacketMul2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(0, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs padd<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vadd_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs psub<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pmul<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pdiv<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pmin<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pmax<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
+                                  unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
+                                  unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask,
+                                  unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs ptrue<PacketMul2Xs>(const PacketMul2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pand<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs por<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pxor<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vxor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pandnot<PacketMul2Xs>(const PacketMul2Xs& a, const PacketMul2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits<PacketMul2Xs>::size),
+                               unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) {
+  return __riscv_vsra_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) {
+  return __riscv_vreinterpret_i16m2(
+      __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits<PacketMul2Xs>::size));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) {
+  return __riscv_vsll_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs pload<PacketMul2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs ploadu<PacketMul2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul2Xs ploaddup<PacketMul2Xs>(const numext::int16_t* from) {
+  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size);
+  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<PacketMul2Xs>::size);
+  // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
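+  // Unlike the 32-bit variants, no extra shift is needed: with 2-byte
+  // elements, (i & ~1) is already the byte offset (i / 2) * sizeof(int16_t),
+  // so lane i loads source element i / 2.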
+ return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) { + return __riscv_vmv_x_s_i16m2_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { + PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xs>::type +predux_half_dowto4(const PacketMul4Xs& a) { + return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), 
__riscv_vget_v_i16m4_i16m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul1Xs>::type +predux_half_dowto4(const PacketMul2Xs& a) { + return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET2_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath4.h b/Eigen/src/Core/arch/RVV10/PacketMath4.h new file mode 100644 index 000000000..fbdd212ef --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMath4.h @@ -0,0 +1,1431 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET4_MATH_RVV10_H +#define EIGEN_PACKET4_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* PacketMul4Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi plset(const numext::int32_t& a) { + PacketMul4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)); + return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, 
const PacketMul4Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { + return __riscv_vreinterpret_i32m4( + __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) { + return __riscv_vsll_vx_i32m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
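+  // Same byte-offset trick as the m2 variant: (i & ~1) << 1 equals
+  // (i / 2) * sizeof(numext::int32_t), so each source element lands in two
+  // adjacent lanes.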
+ return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadquad(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) { + return __riscv_vmv_x_s_i32m4_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { + PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { + PacketMul1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), + unpacket_traits::size); + PacketMul1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xf 
************************************/
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ptrue<PacketMul4Xf>(const PacketMul4Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pzero<PacketMul4Xf>(const PacketMul4Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) {
+  return __riscv_vfabs_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pset1<PacketMul4Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits<PacketMul4Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf plset<PacketMul4Xf>(const float& a) {
+  PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(
+      __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size)),
+      unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf padd<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf psub<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) {
+  return __riscv_vfneg_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmul<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pdiv<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
+  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMul4Xf nans =
+      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+
+  return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return pmin<PacketMul4Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMul4Xf nans =
+      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+
+  return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return pmax<PacketMul4Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<PacketMul4Xf>(a), ptrue<PacketMul4Xf>(a), mask,
+                                  unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<PacketMul4Xf>(a), ptrue<PacketMul4Xf>(a), mask,
+                                  unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero<PacketMul4Xf>(a), ptrue<PacketMul4Xf>(a), mask,
+                                  unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfmerge_vfm_f32m4(ptrue<PacketMul4Xf>(a), 0.0f, mask, unpacket_traits<PacketMul4Xf>::size);
+}
+
+// Logical operations are not supported for float, so we use reinterpret casts
+// to the unsigned integer domain.
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pand<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
+                                                                  __riscv_vreinterpret_v_f32m4_u32m4(b),
+                                                                  unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf por<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
+                                                                 __riscv_vreinterpret_v_f32m4_u32m4(b),
+                                                                 unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pxor<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
+                                                                  __riscv_vreinterpret_v_f32m4_u32m4(b),
+                                                                  unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pandnot<PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+  return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(
+      __riscv_vreinterpret_v_f32m4_u32m4(a),
+      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size),
+      unpacket_traits<PacketMul4Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf pload<PacketMul4Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ploadu<PacketMul4Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xf ploaddup(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploadquad(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) { + return __riscv_vfmv_f_s_f32m4_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { + return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) { + const PacketMul4Xf limit = pset1(static_cast(1 << 23)); + const PacketMul4Xf abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); + const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); + PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { + PacketMul4Xf tmp = print(a); + // If greater, subtract one. 
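+  // print() rounds to nearest even, so the correction only fires when the
+  // rounded value overshoots: e.g. a = 2.5 rounds to tmp = 2.0 (a < tmp is
+  // false, no fixup), while a = -0.5 rounds to tmp = -0.0 and a < tmp, so one
+  // is subtracted to give floor(-0.5) = -1.0. A scalar sketch of the fixup:
+  //   float tmp = rintf(a);
+  //   return (a < tmp) ? tmp - 1.0f : tmp;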
+ PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, PacketMul4Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { + PacketMul1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), + unpacket_traits::size); + PacketMul1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul4Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { + PacketMul4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { + return __riscv_vreinterpret_i64m4( + __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, 
unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { + return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { + return __riscv_vmv_x_s_i64m4_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { + PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { + PacketMul1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), + unpacket_traits::size); + PacketMul1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t 
predux_min(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) { + return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { + PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) { + return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfnmsub_vv_f64m4(a, b, c, 
unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
+  return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmin<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMul4Xd nans =
+      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<PacketMul4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul4Xd>::size);
+
+  return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmin<PropagateNaN, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return pmin<PacketMul4Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmin<PropagateNumbers, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmax<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMul4Xd nans =
+      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<PacketMul4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul4Xd>::size);
+
+  return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNaN, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return pmax<PacketMul4Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNumbers, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
+                                  unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
+                                  unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
+                                  unpacket_traits<PacketMul4Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vfmerge_vfm_f64m4(ptrue<PacketMul4Xd>(a), 0.0, mask, unpacket_traits<PacketMul4Xd>::size);
+}
+
+// Logical operations are not supported for double, so we use reinterpret casts
+// to the unsigned integer domain.
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd pand<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a),
+                                                                  __riscv_vreinterpret_v_f64m4_u64m4(b),
+                                                                  unpacket_traits<PacketMul4Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMul4Xd por<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+  return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a),
+                                                                 __riscv_vreinterpret_v_f64m4_u64m4(b),
unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul4Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul4Xd& a) { + return __riscv_vfmv_f_s_f64m4_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { + return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { + const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul4Xd abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); + const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); + PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { + PacketMul4Xd tmp = print(a); + // If greater, subtract one. 
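+  // Same fixup as the float path. Note that print() above only routes values
+  // with |a| below 2^52 through the integer conversion; any double at or above
+  // that magnitude has no fractional bits left (the mantissa holds 52 of
+  // them), so it is already an integer and is passed through unchanged.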
+ PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, PacketMul4Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { + PacketMul1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), + unpacket_traits::size); + PacketMul1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pldexp(const PacketMul4Xd& a, const PacketMul4Xd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul4Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs plset(const numext::int16_t& a) { + PacketMul4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits::size)); + return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs padd(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs psub(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmul(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pdiv(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmin(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmax(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ptrue(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pand(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs por(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vxor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { + return __riscv_vreinterpret_i16m4( + __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, 
unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) { + return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) { + return __riscv_vmv_x_s_i16m4_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) { + PacketMul4Xsu idx = + __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { + PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { + PacketMul1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), + unpacket_traits::size); + PacketMul1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { + return 
__riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul4Xs>::size / 4),
+      unpacket_traits<PacketMul4Xs>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul4Xs>(const PacketMul4Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul4Xs>::size / 4),
+      unpacket_traits<PacketMul4Xs>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<PacketMul4Xs>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul4Xs>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits<PacketMul4Xs>::size], unpacket_traits<PacketMul4Xs>::size);
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET4_MATH_RVV10_H
--
GitLab

From fe26150cd4d59dacc53297940d65841ceaf419ed Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Fri, 14 Nov 2025 14:28:21 +0000
Subject: [PATCH 19/21] Rename packets from PacketMul1X to Packet1X, etc.
 Change error to warning and other fixes based on feedback.

---
 Eigen/Core                                   |    2 +
 Eigen/src/Core/arch/RVV10/MathFunctions.h    |   12 +-
 Eigen/src/Core/arch/RVV10/PacketMath.h       | 1503 +++++++++---------
 Eigen/src/Core/arch/RVV10/PacketMath2.h      | 1286 +++++++--------
 Eigen/src/Core/arch/RVV10/PacketMath4.h      | 1216 +++++++-------
 Eigen/src/Core/arch/RVV10/PacketMathFP16.h   |  386 ++---
 Eigen/src/Core/arch/RVV10/TypeCasting.h      |  174 +-
 Eigen/src/Core/util/ConfigureVectorization.h |    2 +-
 8 files changed, 2290 insertions(+), 2291 deletions(-)

diff --git a/Eigen/Core b/Eigen/Core
index 6968c2b09..56059af63 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -278,6 +278,8 @@ using std::ptrdiff_t;
 #include "src/Core/arch/SVE/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_RVV10
 #include "src/Core/arch/RVV10/PacketMath.h"
+#include "src/Core/arch/RVV10/PacketMath4.h"
+#include "src/Core/arch/RVV10/PacketMath2.h"
 #include "src/Core/arch/RVV10/TypeCasting.h"
 #include "src/Core/arch/RVV10/MathFunctions.h"
 #if defined EIGEN_VECTORIZE_RVV10FP16
diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h
index 6bbf8fe72..10a70c446 100644
--- a/Eigen/src/Core/arch/RVV10/MathFunctions.h
+++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h
@@ -16,13 +16,13 @@ namespace Eigen {
 namespace internal {
 
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul1Xf)
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul2Xf)
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul4Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet1Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet2Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4Xf)
 
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul1Xd)
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul2Xd)
-EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul4Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet1Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4Xd)
 
 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
index b7a6db817..f7560b8b4 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -52,27 +52,27 @@ typedef vbool8_t PacketMask8;
 typedef vbool4_t PacketMask4;
 
 /********************************* int32
**************************************/ -typedef eigen_packet_wrapper PacketMul1Xi; -typedef eigen_packet_wrapper PacketMul1Xu; +typedef eigen_packet_wrapper Packet1Xi; +typedef eigen_packet_wrapper Packet1Xu; typedef eigen_packet_wrapper - PacketMul2Xi; + Packet2Xi; typedef eigen_packet_wrapper - PacketMul2Xu; + Packet2Xu; typedef eigen_packet_wrapper - PacketMul4Xi; + Packet4Xi; typedef eigen_packet_wrapper - PacketMul4Xu; + Packet4Xu; #if EIGEN_RISCV64_DEFAULT_LMUL == 1 -typedef PacketMul1Xi PacketXi; -typedef PacketMul1Xu PacketXu; +typedef Packet1Xi PacketXi; +typedef Packet1Xu PacketXu; template <> struct packet_traits : default_packet_traits { - typedef PacketMul1Xi type; - typedef PacketMul1Xi half; // Half not implemented yet + typedef Packet1Xi type; + typedef Packet1Xi half; // Half not implemented yet enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -96,13 +96,13 @@ struct packet_traits : default_packet_traits { }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 2 -typedef PacketMul2Xi PacketXi; -typedef PacketMul2Xu PacketXu; +typedef Packet2Xi PacketXi; +typedef Packet2Xu PacketXu; template <> struct packet_traits : default_packet_traits { - typedef PacketMul2Xi type; - typedef PacketMul1Xi half; + typedef Packet2Xi type; + typedef Packet1Xi half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -126,13 +126,13 @@ struct packet_traits : default_packet_traits { }; #elif EIGEN_RISCV64_DEFAULT_LMUL == 4 -typedef PacketMul4Xi PacketXi; -typedef PacketMul4Xu PacketXu; +typedef Packet4Xi PacketXi; +typedef Packet4Xu PacketXu; template <> struct packet_traits : default_packet_traits { - typedef PacketMul4Xi type; - typedef PacketMul2Xi half; + typedef Packet4Xi type; + typedef Packet2Xi half; enum { Vectorizable = 1, AlignedOnScalar = 1, @@ -157,9 +157,9 @@ struct packet_traits : default_packet_traits { #endif template <> -struct unpacket_traits { +struct unpacket_traits { typedef numext::int32_t type; - typedef PacketMul1Xi half; // Half not yet implemented + typedef Packet1Xi half; // Half not yet implemented typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, @@ -171,9 +171,9 @@ struct unpacket_traits { }; template <> -struct unpacket_traits { +struct unpacket_traits { typedef numext::int32_t type; - typedef PacketMul1Xi half; + typedef Packet1Xi half; typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, @@ -185,9 +185,9 @@ struct unpacket_traits { }; template <> -struct unpacket_traits { +struct unpacket_traits { typedef numext::int32_t type; - typedef PacketMul2Xi half; + typedef Packet2Xi half; typedef numext::uint8_t mask_t; enum { size = rvv_packet_size_selector::size, @@ -205,285 +205,285 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) #endif } -/********************************* PacketMul1Xi ************************************/ +/********************************* Packet1Xi ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul1Xi pset1(const numext::int32_t& from) { - return __riscv_vmv_v_x_i32m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi plset(const numext::int32_t& a) { - PacketMul1Xi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)); - return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi plset(const numext::int32_t& a) { + Packet1Xi idx = 
__riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits::size)); + return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pzero(const PacketMul1Xi& /*a*/) { - return __riscv_vmv_v_x_i32m1(0, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pzero(const Packet1Xi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi padd(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vadd_vv_i32m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi padd(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vadd_vv_i32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi psub(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vsub(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi psub(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pnegate(const PacketMul1Xi& a) { - return __riscv_vneg(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pnegate(const Packet1Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pconj(const PacketMul1Xi& a) { +EIGEN_STRONG_INLINE Packet1Xi pconj(const Packet1Xi& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pmul(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vmul(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pmul(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pdiv(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pdiv(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pmadd(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pmsub(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pnmadd(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) { - return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pnmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pnmsub(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c) { - return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pnmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pmin(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vmin(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi 
pmin(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pmax(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vmax(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pmax(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pcmp_le(const PacketMul1Xi& a, const PacketMul1Xi& b) { - PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pcmp_le(const Packet1Xi& a, const Packet1Xi& b) { + PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pcmp_lt(const PacketMul1Xi& a, const PacketMul1Xi& b) { - PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pcmp_lt(const Packet1Xi& a, const Packet1Xi& b) { + PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pcmp_eq(const PacketMul1Xi& a, const PacketMul1Xi& b) { - PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pcmp_eq(const Packet1Xi& a, const Packet1Xi& b) { + PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi ptrue(const PacketMul1Xi& /*a*/) { - return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi ptrue(const Packet1Xi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pand(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vand_vv_i32m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pand(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vand_vv_i32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi por(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vor_vv_i32m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi por(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vor_vv_i32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pxor(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vxor_vv_i32m1(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pxor(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vxor_vv_i32m1(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pandnot(const PacketMul1Xi& a, const PacketMul1Xi& b) { - return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pandnot(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul1Xi 
parithmetic_shift_right(PacketMul1Xi a) { - return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi parithmetic_shift_right(Packet1Xi a) { + return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul1Xi plogical_shift_right(PacketMul1Xi a) { +EIGEN_STRONG_INLINE Packet1Xi plogical_shift_right(Packet1Xi a) { return __riscv_vreinterpret_i32m1( - __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); + __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); } template -EIGEN_STRONG_INLINE PacketMul1Xi plogical_shift_left(PacketMul1Xi a) { - return __riscv_vsll_vx_i32m1(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi plogical_shift_left(Packet1Xi a) { + return __riscv_vsll_vx_i32m1(a, N, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi pload(const numext::int32_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi ploadu(const numext::int32_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xi ploaddup(const numext::int32_t* from) { - PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xi ploaddup(const numext::int32_t* from) { + Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xi ploadquad<PacketMul1Xi>(const numext::int32_t* from) {
-  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size);
-  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketMul1Xi>::size);
-  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE Packet1Xi ploadquad<Packet1Xi>(const numext::int32_t* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul1Xi& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul1Xi& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul1Xi pgather<numext::int32_t, PacketMul1Xi>(const numext::int32_t* from, Index stride) {
-  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul1Xi>::size);
+EIGEN_DEVICE_FUNC inline Packet1Xi pgather<numext::int32_t, Packet1Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul1Xi& from,
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet1Xi& from,
                                        Index stride) {
-  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul1Xi>::size);
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul1Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet1Xi& a) {
   return __riscv_vmv_x_s_i32m1_i32(a);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xi preverse(const PacketMul1Xi& a) {
-  PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xi>::size),
-                                            unpacket_traits<PacketMul1Xi>::size - 1, unpacket_traits<PacketMul1Xi>::size);
-  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE Packet1Xi preverse(const Packet1Xi& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size),
+                                         unpacket_traits<Packet1Xi>::size - 1, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xi pabs(const PacketMul1Xi& a) {
-  PacketMul1Xi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<PacketMul1Xi>::size);
-  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<PacketMul1Xi>::size), mask,
-                               unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE Packet1Xi pabs(const Packet1Xi& a) {
+  Packet1Xi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<Packet1Xi>::size), mask,
+                               unpacket_traits<Packet1Xi>::size);
 }
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul1Xi& a) {
-  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul1Xi>::size),
-                                                      unpacket_traits<PacketMul1Xi>::size));
+EIGEN_STRONG_INLINE numext::int32_t predux(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet1Xi>::size),
+                                                      unpacket_traits<Packet1Xi>::size));
 }
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul1Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet1Xi& a) {
   // Multiply the vector by its reverse
-  PacketMul1Xi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<PacketMul1Xi>::size);
-  PacketMul1Xi half_prod;
+  Packet1Xi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<Packet1Xi>::size);
+  Packet1Xi half_prod;
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<PacketMul1Xi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<PacketMul1Xi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<PacketMul1Xi>::size);
-    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
   }
   // Last reduction
-  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<PacketMul1Xi>::size);
-  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<PacketMul1Xi>::size);
+  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<Packet1Xi>::size);
+  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
   // The reduction is done to the first element.
   return pfirst(prod);
 }
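+// Example of the reduction above: with a 128-bit vector (four int32 lanes),
+// prod = a * reverse(a) = {a0*a3, a1*a2, a2*a1, a3*a0}; none of the VL-guarded
+// halvings run, and the final slide-down-by-one multiply leaves a0*a1*a2*a3 in
+// lane 0, where pfirst picks it up.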
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul1Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_min(const Packet1Xi& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul1Xi>::size),
-      unpacket_traits<PacketMul1Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
 }
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul1Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_max(const Packet1Xi& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul1Xi>::size),
-      unpacket_traits<PacketMul1Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
 }
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xi, N>& kernel) {
-  numext::int32_t buffer[unpacket_traits<PacketMul1Xi>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet1Xi>::size * N] = {0};
   int i = 0;
   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul1Xi>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet1Xi>::size);
   }
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<PacketMul1Xi>::size], unpacket_traits<PacketMul1Xi>::size);
+        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<Packet1Xi>::size], unpacket_traits<Packet1Xi>::size);
   }
 }
 
 /********************************* float32 ************************************/
 
-typedef eigen_packet_wrapper PacketMul1Xf;
+typedef eigen_packet_wrapper Packet1Xf;
 typedef eigen_packet_wrapper
-    PacketMul2Xf;
+    Packet2Xf;
 typedef eigen_packet_wrapper
-    PacketMul4Xf;
+    Packet4Xf;
 
 #if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xf PacketXf;
+typedef Packet1Xf PacketXf;
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef PacketMul1Xf type;
-  typedef PacketMul1Xf half;
+  typedef Packet1Xf type;
+  typedef Packet1Xf half;
 
   enum {
     Vectorizable = 1,
@@ -520,12 +520,12 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 2
-typedef PacketMul2Xf PacketXf;
+typedef Packet2Xf PacketXf;
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef PacketMul2Xf type;
-  typedef PacketMul1Xf half;
+  typedef Packet2Xf type;
+  typedef Packet1Xf half;
 
   enum {
     Vectorizable = 1,
@@ -562,12 +562,12 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 4
-typedef PacketMul4Xf PacketXf;
+typedef Packet4Xf PacketXf;
 
 template <>
 struct packet_traits<float> : default_packet_traits {
-  typedef PacketMul4Xf type;
-  typedef PacketMul2Xf half;
+  typedef Packet4Xf type;
+  typedef Packet2Xf half;
 
   enum {
     Vectorizable = 1,
@@ -605,10 +605,10 @@ struct packet_traits : default_packet_traits {
 #endif
 
 template <>
-struct unpacket_traits<PacketMul1Xf> {
+struct unpacket_traits<Packet1Xf> {
   typedef float type;
-  typedef PacketMul1Xf half;  // Half not yet implemented
-  typedef PacketMul1Xi integer_packet;
+  typedef Packet1Xf half;  // Half not yet implemented
+  typedef Packet1Xi integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask32 packet_mask;
 
@@ -622,10 +622,10 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul2Xf> {
+struct unpacket_traits<Packet2Xf> {
   typedef float type;
-  typedef PacketMul1Xf half;
-  typedef PacketMul2Xi integer_packet;
+  typedef Packet1Xf half;
+  typedef Packet2Xi integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask16 packet_mask;
 
@@ -639,10 +639,10 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul4Xf> {
+struct unpacket_traits<Packet4Xf> {
   typedef float type;
-  typedef PacketMul2Xf half;
-  typedef PacketMul4Xi integer_packet;
+  typedef Packet2Xf half;
+  typedef Packet4Xi integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask8 packet_mask;
 
@@ -655,389 +655,389 @@ struct unpacket_traits {
 };
 };
 
-/********************************* PacketMul1Xf ************************************/
+/********************************* Packet1Xf ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf ptrue(const PacketMul1Xf& /*a*/) {
-  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<PacketMul1Xf>::size));
+EIGEN_STRONG_INLINE Packet1Xf ptrue(const Packet1Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pzero(const PacketMul1Xf& /*a*/) {
-  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pzero(const Packet1Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pabs(const PacketMul1Xf& a) {
-  return __riscv_vfabs_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pabs(const Packet1Xf& a) {
+  return __riscv_vfabs_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pset1<PacketMul1Xf>(const float& from) {
-  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pset1<Packet1Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pset1frombits<PacketMul1Xf>(numext::uint32_t from) {
-  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<PacketMul1Xf>::size));
+EIGEN_STRONG_INLINE Packet1Xf pset1frombits<Packet1Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf plset<PacketMul1Xf>(const float& a) {
-  PacketMul1Xf idx = __riscv_vfcvt_f_x_v_f32m1(
-      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size)),
-      unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf plset<Packet1Xf>(const float& a) {
+  Packet1Xf idx = __riscv_vfcvt_f_x_v_f32m1(
+      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size)),
+      unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf padd(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf padd(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf psub(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf psub(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pnegate(const PacketMul1Xf& a) {
-  return __riscv_vfneg_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pnegate(const Packet1Xf& a) {
+  return __riscv_vfneg_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pconj(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE Packet1Xf pconj(const Packet1Xf& a) {
   return a;
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmul(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmul(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pdiv(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pdiv(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
-  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
-  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pnmadd(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
-  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pnmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pnmsub(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c) {
-  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pnmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmin(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size);
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
-  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketMul1Xf>::size);
-  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmin(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
 
-  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmin<PropagateNaN, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return pmin(a, b);
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmin(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmin<PropagateNumbers, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
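+// The plain pmin above (and pmax below) emulate NaN-propagating semantics: the
+// mask is true only where both inputs are non-NaN, and the masked (tumu) vfmin
+// writes results only through that mask, so any lane with a NaN input keeps the
+// quiet-NaN value preloaded in `nans`.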
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmax(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMul1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size);
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
-  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<PacketMul1Xf>::size);
-  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmax(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
 
-  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmax<PropagateNaN, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return pmax(a, b);
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmax(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pmax<PropagateNumbers, PacketMul1Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pcmp_le(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pcmp_le(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pcmp_eq(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pcmp_eq(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pcmp_lt_or_nan(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt_or_nan(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<Packet1Xf>::size);
 }
 
 // Logical Operations are not supported for float, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pand(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+EIGEN_STRONG_INLINE Packet1Xf pand(const Packet1Xf& a, const Packet1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf por(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+EIGEN_STRONG_INLINE Packet1Xf por(const Packet1Xf& a, const Packet1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pxor(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+EIGEN_STRONG_INLINE Packet1Xf pxor(const Packet1Xf& a, const Packet1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(
-      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size));
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pandnot(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+EIGEN_STRONG_INLINE Packet1Xf pandnot(const Packet1Xf& a, const Packet1Xf& b) {
   return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
       __riscv_vreinterpret_v_f32m1_u32m1(a),
-      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<PacketMul1Xf>::size),
-      unpacket_traits<PacketMul1Xf>::size));
+      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size),
+      unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pload<PacketMul1Xf>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pload<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf ploadu<PacketMul1Xf>(const float* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf ploadu<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf ploaddup<PacketMul1Xf>(const float* from) {
-  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size);
-  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<PacketMul1Xf>::size), 1,
-                              unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf ploaddup<Packet1Xf>(const float* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<Packet1Xf>::size), 1,
+                              unpacket_traits<Packet1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf ploadquad<PacketMul1Xf>(const float* from) {
-  PacketMul1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size);
-  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf ploadquad<Packet1Xf>(const float* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul1Xf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE void pstore(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul1Xf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul1Xf pgather<float, PacketMul1Xf>(const float* from, Index stride) {
-  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<PacketMul1Xf>::size);
+EIGEN_DEVICE_FUNC inline Packet1Xf pgather<float, Packet1Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul1Xf& from, Index stride) {
-  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet1Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE float pfirst(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE float pfirst(const Packet1Xf& a) {
   return __riscv_vfmv_f_s_f32m1_f32(a);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf psqrt(const PacketMul1Xf& a) {
-  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf psqrt(const Packet1Xf& a) {
+  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf print(const PacketMul1Xf& a) {
-  const PacketMul1Xf limit = pset1<PacketMul1Xf>(static_cast<float>(1 << 23));
-  const PacketMul1Xf abs_a = pabs(a);
+EIGEN_STRONG_INLINE Packet1Xf print(const Packet1Xf& a) {
+  const Packet1Xf limit = pset1<Packet1Xf>(static_cast<float>(1 << 23));
+  const Packet1Xf abs_a = pabs(a);
 
-  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<PacketMul1Xf>::size);
-  const PacketMul1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<PacketMul1Xf>::size);
-  const PacketMul1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<PacketMul1Xf>::size),
-                                                       unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  const Packet1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xf>::size);
+  const Packet1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<Packet1Xf>::size),
                                                     unpacket_traits<Packet1Xf>::size);
 
-  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<PacketMul1Xf>::size);
-  PacketMul1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<PacketMul1Xf>::size);
+  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<Packet1Xf>::size);
+  Packet1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<Packet1Xf>::size);
 }
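+// print above rounds to the nearest integer: the NaN mask quiets NaNs via
+// a + a, the int32 round trip produces the rounded value, and vfsgnj copies the
+// original sign back so -0.0 and negative fractions keep their sign; inputs
+// with |a| >= 2^23 are already integral in float32 and pass through unchanged.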
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pfloor(const PacketMul1Xf& a) {
-  PacketMul1Xf tmp = print(a);
+EIGEN_STRONG_INLINE Packet1Xf pfloor(const Packet1Xf& a) {
+  Packet1Xf tmp = print(a);
   // If greater, subtract one.
-  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketMul1Xf>::size);
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf preverse(const PacketMul1Xf& a) {
-  PacketMul1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<PacketMul1Xf>::size),
-                                            unpacket_traits<PacketMul1Xf>::size - 1, unpacket_traits<PacketMul1Xf>::size);
-  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf preverse(const Packet1Xf& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size),
+                                         unpacket_traits<Packet1Xf>::size - 1, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<Packet1Xf>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pfrexp(const PacketMul1Xf& a, PacketMul1Xf& exponent) {
+EIGEN_STRONG_INLINE Packet1Xf pfrexp(const Packet1Xf& a, Packet1Xf& exponent) {
   return pfrexp_generic(a, exponent);
 }
 template <>
-EIGEN_STRONG_INLINE float predux(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE float predux(const Packet1Xf& a) {
   return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(
-      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<PacketMul1Xf>::size), unpacket_traits<PacketMul1Xf>::size));
+      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<Packet1Xf>::size), unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE float predux_mul(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE float predux_mul(const Packet1Xf& a) {
   // Multiply the vector by its reverse
-  PacketMul1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<PacketMul1Xf>::size);
-  PacketMul1Xf half_prod;
+  Packet1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<Packet1Xf>::size);
+  Packet1Xf half_prod;
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<PacketMul1Xf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<PacketMul1Xf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<PacketMul1Xf>::size);
-    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
   }
   // Last reduction
-  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<PacketMul1Xf>::size);
-  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<PacketMul1Xf>::size);
+  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<Packet1Xf>::size);
+  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
   // The reduction is done to the first element.
   return pfirst(prod);
 }
 template <>
-EIGEN_STRONG_INLINE float predux_min(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE float predux_min(const Packet1Xf& a) {
   return (
       std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(
                     a,
-                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size),
-                    unpacket_traits<PacketMul1Xf>::size)),
+                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),
+                    unpacket_traits<Packet1Xf>::size)),
                 (std::numeric_limits<float>::max)());
 }
 template <>
-EIGEN_STRONG_INLINE float predux_max(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE float predux_max(const Packet1Xf& a) {
   return (
       std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(
                     a,
-                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul1Xf>::size),
-                    unpacket_traits<PacketMul1Xf>::size)),
+                    __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),
+                    unpacket_traits<Packet1Xf>::size)),
                 -(std::numeric_limits<float>::max)());
 }
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xf, N>& kernel) {
-  float buffer[unpacket_traits<PacketMul1Xf>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xf, N>& kernel) {
+  float buffer[unpacket_traits<Packet1Xf>::size * N];
   int i = 0;
   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<PacketMul1Xf>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<Packet1Xf>::size);
  }
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<PacketMul1Xf>::size], unpacket_traits<PacketMul1Xf>::size);
+        __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<Packet1Xf>::size], unpacket_traits<Packet1Xf>::size);
   }
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pldexp(const PacketMul1Xf& a, const PacketMul1Xf& exponent) {
+EIGEN_STRONG_INLINE Packet1Xf pldexp(const Packet1Xf& a, const Packet1Xf& exponent) {
   return pldexp_generic(a, exponent);
 }
 template <>
 EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) {
-  return __riscv_vmor_mm_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmor_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
 }
 template <>
 EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) {
-  return __riscv_vmand_mm_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+  return __riscv_vmand_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
 }
 
-EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
 }
-EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
 }
-EIGEN_STRONG_INLINE PacketMul1Xf pselect(const PacketMask32& mask, const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pselect(const PacketMask32& mask, const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<Packet1Xf>::size);
 }
 
 /********************************* int64 **************************************/
 
-typedef eigen_packet_wrapper PacketMul1Xl;
-typedef eigen_packet_wrapper PacketMul1Xul;
+typedef eigen_packet_wrapper Packet1Xl;
+typedef eigen_packet_wrapper Packet1Xul;
 typedef eigen_packet_wrapper
-    PacketMul2Xl;
+    Packet2Xl;
 typedef eigen_packet_wrapper
-    PacketMul2Xul;
+    Packet2Xul;
 typedef eigen_packet_wrapper
-    PacketMul4Xl;
+    Packet4Xl;
 typedef eigen_packet_wrapper
-    PacketMul4Xul;
+    Packet4Xul;
 
 #if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xl PacketXl;
-typedef PacketMul1Xul PacketXul;
+typedef Packet1Xl PacketXl;
+typedef Packet1Xul PacketXul;
 
 template <>
 struct packet_traits<numext::int64_t> : default_packet_traits {
-  typedef PacketMul1Xl type;
-  typedef PacketMul1Xl half;  // Half not implemented yet
+  typedef Packet1Xl type;
+  typedef Packet1Xl half;  // Half not implemented yet
   enum {
     Vectorizable = 1,
    AlignedOnScalar = 1,
@@ -1061,13 +1061,13 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 2
-typedef PacketMul2Xl PacketXl;
-typedef PacketMul2Xul PacketXul;
+typedef Packet2Xl PacketXl;
+typedef Packet2Xul PacketXul;
 
 template <>
 struct packet_traits<numext::int64_t> : default_packet_traits {
-  typedef PacketMul2Xl type;
-  typedef PacketMul1Xl half;
+  typedef Packet2Xl type;
+  typedef Packet1Xl half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -1091,13 +1091,13 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 4
-typedef PacketMul4Xl PacketXl;
-typedef PacketMul4Xul PacketXul;
+typedef Packet4Xl PacketXl;
+typedef Packet4Xul PacketXul;
 
 template <>
 struct packet_traits<numext::int64_t> : default_packet_traits {
-  typedef PacketMul4Xl type;
-  typedef PacketMul2Xl half;
+  typedef Packet4Xl type;
+  typedef Packet2Xl half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -1122,9 +1122,9 @@ struct packet_traits : default_packet_traits {
 #endif
 
 template <>
-struct unpacket_traits<PacketMul1Xl> {
+struct unpacket_traits<Packet1Xl> {
   typedef numext::int64_t type;
-  typedef PacketMul1Xl half;  // Half not yet implemented
+  typedef Packet1Xl half;  // Half not yet implemented
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -1136,9 +1136,9 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul2Xl> {
+struct unpacket_traits<Packet2Xl> {
   typedef numext::int64_t type;
-  typedef PacketMul1Xl half;
+  typedef Packet1Xl half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -1150,9 +1150,9 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul4Xl> {
+struct unpacket_traits<Packet4Xl> {
  typedef numext::int64_t type;
-  typedef PacketMul2Xl half;
+  typedef Packet2Xl half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -1170,235 +1170,235 @@ EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr)
 #endif
 }
 
-/********************************* PacketMul1Xl ************************************/
+/********************************* Packet1Xl ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pset1<PacketMul1Xl>(const numext::int64_t& from) {
-  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pset1<Packet1Xl>(const numext::int64_t& from) {
+  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl plset<PacketMul1Xl>(const numext::int64_t& a) {
-  PacketMul1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size));
-  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl plset<Packet1Xl>(const numext::int64_t& a) {
+  Packet1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size));
+  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pzero(const PacketMul1Xl& /*a*/) {
-  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pzero(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl padd(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl padd(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl psub(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl psub(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pnegate(const PacketMul1Xl& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pnegate(const Packet1Xl& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pconj(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE Packet1Xl pconj(const Packet1Xl& a) {
   return a;
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pmul(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pmul(const Packet1Xl& a, const Packet1Xl& b) {
  return __riscv_vmul(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pdiv(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pdiv(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pnmadd(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
-  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pnmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pnmsub(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c) {
-  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pnmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pmin(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pmin(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pmax(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pmax(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pcmp_le(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pcmp_le(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pcmp_lt(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pcmp_lt(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pcmp_eq(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<PacketMul1Xl>::size);
-  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pcmp_eq(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl ptrue(const PacketMul1Xl& /*a*/) {
-  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl ptrue(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pand(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pand(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl por(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl por(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pxor(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pxor(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pandnot(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<PacketMul1Xl>::size),
-                               unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pandnot(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<Packet1Xl>::size),
+                               unpacket_traits<Packet1Xl>::size);
 }
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xl parithmetic_shift_right(PacketMul1Xl a) {
-  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl parithmetic_shift_right(Packet1Xl a) {
+  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
 }
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_right(PacketMul1Xl a) {
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_right(Packet1Xl a) {
   return __riscv_vreinterpret_i64m1(
-      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<PacketMul1Xl>::size));
+      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<Packet1Xl>::size));
 }
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xl plogical_shift_left(PacketMul1Xl a) {
-  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_left(Packet1Xl a) {
+  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pload<PacketMul1Xl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pload<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl ploadu<PacketMul1Xl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl ploadu<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl ploaddup<PacketMul1Xl>(const numext::int64_t* from) {
-  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul1Xl>::size), 2,
-                              unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl ploaddup<Packet1Xl>(const numext::int64_t* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<Packet1Xl>::size), 2,
+                              unpacket_traits<Packet1Xl>::size);
   // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
-  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketMul1Xl>::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<Packet1Xl>::size);
 }
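+// Same duplication trick as the int32 ploaddup, scaled for 8-byte elements:
+// clearing bit 0 of vid and shifting left by 2 gives byte offsets
+// {0, 0, 8, 8, 16, 16, ...}, so each int64 of `from` is loaded twice.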
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl ploadquad<PacketMul1Xl>(const numext::int64_t* from) {
-  PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size);
-  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul1Xl>::size), 1,
-                              unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl ploadquad<Packet1Xl>(const numext::int64_t* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<Packet1Xl>::size), 1,
+                              unpacket_traits<Packet1Xl>::size);
   ;
-  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<PacketMul1Xl>::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul1Xl& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul1Xl& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul1Xl pgather<numext::int64_t, PacketMul1Xl>(const numext::int64_t* from, Index stride) {
-  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<PacketMul1Xl>::size);
+EIGEN_DEVICE_FUNC inline Packet1Xl pgather<numext::int64_t, Packet1Xl>(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul1Xl& from,
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet1Xl& from,
                                        Index stride) {
-  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<PacketMul1Xl>::size);
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet1Xl& a) {
   return __riscv_vmv_x_s_i64m1_i64(a);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl preverse(const PacketMul1Xl& a) {
-  PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xl>::size),
-                                             unpacket_traits<PacketMul1Xl>::size - 1, unpacket_traits<PacketMul1Xl>::size);
-  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl preverse(const Packet1Xl& a) {
+  Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size),
+                                          unpacket_traits<Packet1Xl>::size - 1, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pabs(const PacketMul1Xl& a) {
-  PacketMul1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits<PacketMul1Xl>::size);
-  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits<PacketMul1Xl>::size), mask,
-                               unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pabs(const Packet1Xl& a) {
+  Packet1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits<Packet1Xl>::size), mask,
+                               unpacket_traits<Packet1Xl>::size);
 }
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul1Xl& a) {
-  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketMul1Xl>::size),
-                                                      unpacket_traits<PacketMul1Xl>::size));
+EIGEN_STRONG_INLINE numext::int64_t predux(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size),
                                                       unpacket_traits<Packet1Xl>::size));
 }
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet1Xl& a) {
   // Multiply the vector by its reverse
-  PacketMul1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits<PacketMul1Xl>::size);
-  PacketMul1Xl half_prod;
+  Packet1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits<Packet1Xl>::size);
+  Packet1Xl half_prod;
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits<PacketMul1Xl>::size);
-    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketMul1Xl>::size);
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits<PacketMul1Xl>::size);
-    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketMul1Xl>::size);
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
  }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits<PacketMul1Xl>::size);
-    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<PacketMul1Xl>::size);
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
   }
   // The reduction is done to the first element.
@@ -1406,48 +1406,48 @@ EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul1Xl&
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_min(const Packet1Xl& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<PacketMul1Xl>::size),
-      unpacket_traits<PacketMul1Xl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
 }
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_max(const Packet1Xl& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<PacketMul1Xl>::size),
-      unpacket_traits<PacketMul1Xl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
 }
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xl, N>& kernel) {
-  numext::int64_t buffer[unpacket_traits<PacketMul1Xl>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xl, N>& kernel) {
+  numext::int64_t buffer[unpacket_traits<Packet1Xl>::size * N] = {0};
   int i = 0;
   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<PacketMul1Xl>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<Packet1Xl>::size);
   }
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits<PacketMul1Xl>::size], unpacket_traits<PacketMul1Xl>::size);
+        __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits<Packet1Xl>::size], unpacket_traits<Packet1Xl>::size);
   }
 }
 
 /********************************* double ************************************/
 
-typedef eigen_packet_wrapper PacketMul1Xd;
+typedef eigen_packet_wrapper Packet1Xd;
 typedef eigen_packet_wrapper
-    PacketMul2Xd;
+    Packet2Xd;
 typedef eigen_packet_wrapper
-    PacketMul4Xd;
+    Packet4Xd;
 
 #if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xd PacketXd;
+typedef Packet1Xd PacketXd;
 
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef PacketMul1Xd type;
-  typedef PacketMul1Xd half;
+  typedef Packet1Xd type;
+  typedef Packet1Xd half;
 
   enum {
     Vectorizable = 1,
@@ -1480,12 +1480,12 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 2
-typedef PacketMul2Xd PacketXd;
+typedef Packet2Xd PacketXd;
 
 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef PacketMul2Xd type;
-  typedef PacketMul1Xd half;
+  typedef Packet2Xd type;
+  typedef Packet1Xd half;
 
   enum {
     Vectorizable = 1,
@@ -1518,12 +1518,12 @@ struct packet_traits : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 4
-typedef PacketMul4Xd PacketXd;
+typedef Packet4Xd PacketXd;

 template <>
 struct packet_traits<double> : default_packet_traits {
-  typedef PacketMul4Xd type;
-  typedef PacketMul2Xd half;
+  typedef Packet4Xd type;
+  typedef Packet2Xd half;
 
   enum {
     Vectorizable = 1,
@@ -1557,10 +1557,10 @@ struct packet_traits : default_packet_traits {
 #endif
 
 template <>
-struct unpacket_traits<PacketMul1Xd> {
+struct unpacket_traits<Packet1Xd> {
   typedef double type;
-  typedef PacketMul1Xd half;  // Half not yet implemented
-  typedef PacketMul1Xl integer_packet;
+  typedef Packet1Xd half;  // Half not yet implemented
+  typedef Packet1Xl integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask64 packet_mask;
 
@@ -1574,10 +1574,10 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul2Xd> {
+struct unpacket_traits<Packet2Xd> {
   typedef double type;
-  typedef PacketMul1Xd half;
-  typedef PacketMul2Xl integer_packet;
+  typedef Packet1Xd half;
+  typedef Packet2Xl integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask32 packet_mask;
 
@@ -1591,10 +1591,10 @@ struct unpacket_traits {
 };
 
 template <>
-struct unpacket_traits<PacketMul4Xd> {
+struct unpacket_traits<Packet4Xd> {
   typedef double type;
-  typedef PacketMul2Xd half;
-  typedef PacketMul4Xl integer_packet;
+  typedef Packet2Xd half;
+  typedef Packet4Xl integer_packet;
   typedef numext::uint8_t mask_t;
   typedef PacketMask16 packet_mask;
 
@@ -1607,297 +1607,297 @@ struct unpacket_traits {
 };
 };
 
-/********************************* PacketMul1Xd ************************************/
+/********************************* Packet1Xd ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd ptrue(const PacketMul1Xd& /*a*/) {
-  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE Packet1Xd ptrue(const Packet1Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xd>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pzero(const PacketMul1Xd& /*a*/) {
-  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pzero(const Packet1Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pabs(const PacketMul1Xd& a) {
-  return __riscv_vfabs_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pabs(const Packet1Xd& a) {
+  return __riscv_vfabs_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pset1<PacketMul1Xd>(const double& from) {
-  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pset1<Packet1Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pset1frombits<PacketMul1Xd>(numext::uint64_t from) {
-  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE Packet1Xd pset1frombits<Packet1Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<Packet1Xd>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd plset<PacketMul1Xd>(const double& a) {
-  PacketMul1Xd idx = __riscv_vfcvt_f_x_v_f64m1(
-      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<PacketMul1Xd>::size)),
-      unpacket_traits<PacketMul1Xd>::size);
-  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd plset<Packet1Xd>(const double& a) {
+  Packet1Xd idx = __riscv_vfcvt_f_x_v_f64m1(
+      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size)),
+      unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd padd(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd padd(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd psub(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd psub(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pnegate(const PacketMul1Xd& a) {
-  return __riscv_vfneg_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pnegate(const Packet1Xd& a) {
+  return __riscv_vfneg_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pconj(const PacketMul1Xd& a) {
+EIGEN_STRONG_INLINE Packet1Xd pconj(const Packet1Xd& a) {
   return a;
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmul(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmul(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pdiv(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pdiv(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
-  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
-  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pnmadd(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
-  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pnmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pnmsub(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c) {
-  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pnmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmin(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size);
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
-  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketMul1Xd>::size);
-  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmin(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
 
-  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmin<PropagateNaN, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return pmin(a, b);
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmin(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmin<PropagateNumbers, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmax(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMul1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size);
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<PacketMul1Xd>::size);
-  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<PacketMul1Xd>::size);
-  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmax(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
 
-  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmax<PropagateNaN, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return pmax(a, b);
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmax(a, b);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pmax<PropagateNumbers, PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pcmp_le(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pcmp_le(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pcmp_eq(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
-  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pcmp_eq(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pcmp_lt_or_nan(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
-  return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt_or_nan(const Packet1Xd& a, const Packet1Xd& b) { + PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); } // Logical Operations are not supported for double, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul1Xd pand(const PacketMul1Xd& a, const PacketMul1Xd& b) { +EIGEN_STRONG_INLINE Packet1Xd pand(const Packet1Xd& a, const Packet1Xd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd por(const PacketMul1Xd& a, const PacketMul1Xd& b) { +EIGEN_STRONG_INLINE Packet1Xd por(const Packet1Xd& a, const Packet1Xd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pxor(const PacketMul1Xd& a, const PacketMul1Xd& b) { +EIGEN_STRONG_INLINE Packet1Xd pxor(const Packet1Xd& a, const Packet1Xd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( - __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pandnot(const PacketMul1Xd& a, const PacketMul1Xd& b) { +EIGEN_STRONG_INLINE Packet1Xd pandnot(const Packet1Xd& a, const Packet1Xd& b) { return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), - __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), - unpacket_traits::size)); + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploaddup(const double* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd ploaddup(const double* from) { + Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd ploadquad(const 
double* from) { - PacketMul1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd ploadquad(const double* from) { + Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); ; - return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul1Xd& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(double* to, const Packet1Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul1Xd& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet1Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul1Xd pgather(const double* from, Index stride) { - return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size); +EIGEN_DEVICE_FUNC inline Packet1Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul1Xd& from, Index stride) { - __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet1Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE double pfirst(const PacketMul1Xd& a) { +EIGEN_STRONG_INLINE double pfirst(const Packet1Xd& a) { return __riscv_vfmv_f_s_f64m1_f64(a); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd psqrt(const PacketMul1Xd& a) { - return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd psqrt(const Packet1Xd& a) { + return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd print(const PacketMul1Xd& a) { - const PacketMul1Xd limit = pset1(static_cast(1ull << 52)); - const PacketMul1Xd abs_a = pabs(a); +EIGEN_STRONG_INLINE Packet1Xd print(const Packet1Xd& a) { + const Packet1Xd limit = pset1(static_cast(1ull << 52)); + const Packet1Xd abs_a = pabs(a); - PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); - const PacketMul1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), - unpacket_traits::size); + PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); + const Packet1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits::size); + const Packet1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), + unpacket_traits::size); - mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); - PacketMul1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, 
unpacket_traits::size); + mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); + Packet1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pfloor(const PacketMul1Xd& a) { - PacketMul1Xd tmp = print(a); +EIGEN_STRONG_INLINE Packet1Xd pfloor(const Packet1Xd& a) { + Packet1Xd tmp = print(a); // If greater, subtract one. - PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd preverse(const PacketMul1Xd& a) { - PacketMul1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet1Xd preverse(const Packet1Xd& a) { + Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul1Xd pfrexp(const PacketMul1Xd& a, PacketMul1Xd& exponent) { +EIGEN_STRONG_INLINE Packet1Xd pfrexp(const Packet1Xd& a, Packet1Xd& exponent) { return pfrexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE double predux(const PacketMul1Xd& a) { +EIGEN_STRONG_INLINE double predux(const Packet1Xd& a) { return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1( - a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size)); + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE double predux_mul(const PacketMul1Xd& a) { +EIGEN_STRONG_INLINE double predux_mul(const Packet1Xd& a) { // Multiply the vector by its reverse - PacketMul1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size); - PacketMul1Xd half_prod; + Packet1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size); + Packet1Xd half_prod; if (EIGEN_RISCV64_RVV_VL >= 1024) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 512) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); } if (EIGEN_RISCV64_RVV_VL >= 256) { - half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size); - prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); + half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size); } // The reduction is done to the first element. 
@@ -1905,95 +1905,95 @@ EIGEN_STRONG_INLINE double predux_mul<PacketMul1Xd>(const PacketMul1Xd& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux_min<PacketMul1Xd>(const PacketMul1Xd& a) {
+EIGEN_STRONG_INLINE double predux_min<Packet1Xd>(const Packet1Xd& a) {
   return (
       std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
                     a,
-                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size),
-                    unpacket_traits<PacketMul1Xd>::size)),
+                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+                    unpacket_traits<Packet1Xd>::size)),
                 (std::numeric_limits<double>::max)());
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux_max<PacketMul1Xd>(const PacketMul1Xd& a) {
+EIGEN_STRONG_INLINE double predux_max<Packet1Xd>(const Packet1Xd& a) {
   return (
      std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
                    a,
-                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul1Xd>::size),
-                    unpacket_traits<PacketMul1Xd>::size)),
+                    __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+                    unpacket_traits<Packet1Xd>::size)),
                 -(std::numeric_limits<double>::max)());
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xd, N>& kernel) {
-  double buffer[unpacket_traits<PacketMul1Xd>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet1Xd>::size * N];
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul1Xd>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet1Xd>::size);
   }
 
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<PacketMul1Xd>::size], unpacket_traits<PacketMul1Xd>::size);
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<Packet1Xd>::size], unpacket_traits<Packet1Xd>::size);
   }
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pldexp<PacketMul1Xd>(const PacketMul1Xd& a, const PacketMul1Xd& exponent) {
+EIGEN_STRONG_INLINE Packet1Xd pldexp<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& exponent) {
   return pldexp_generic(a, exponent);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmor_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
 }
 
 template <>
 EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
-  return __riscv_vmand_mm_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
 }
 
-EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
 }
 
-EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
 }
 
-EIGEN_STRONG_INLINE PacketMul1Xd pselect(const PacketMask64& mask, const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pselect(const PacketMask64& mask, const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits<Packet1Xd>::size);
 }
 
 /********************************* short **************************************/
 
-typedef eigen_packet_wrapper PacketMul1Xs;
-typedef eigen_packet_wrapper PacketMul1Xsu;
+typedef eigen_packet_wrapper Packet1Xs;
+typedef eigen_packet_wrapper Packet1Xsu;
 typedef eigen_packet_wrapper
-    PacketMul2Xs;
+    Packet2Xs;
 typedef eigen_packet_wrapper
-    PacketMul2Xsu;
+    Packet2Xsu;
 typedef eigen_packet_wrapper
-    PacketMul4Xs;
+    Packet4Xs;
 typedef eigen_packet_wrapper
-    PacketMul4Xsu;
+    Packet4Xsu;
 
 #if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xs PacketXs;
-typedef PacketMul1Xsu PacketXsu;
+typedef Packet1Xs PacketXs;
+typedef Packet1Xsu PacketXsu;
 
 template <>
 struct packet_traits<numext::int16_t> : default_packet_traits {
-  typedef PacketMul1Xs type;
-  typedef PacketMul1Xs half;  // Half not implemented yet
+  typedef Packet1Xs type;
+  typedef Packet1Xs half;  // Half not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -2017,13 +2017,13 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 2
-typedef PacketMul2Xs PacketXs;
-typedef PacketMul2Xsu PacketXsu;
+typedef Packet2Xs PacketXs;
+typedef Packet2Xsu PacketXsu;
 
 template <>
 struct packet_traits<numext::int16_t> : default_packet_traits {
-  typedef PacketMul2Xs type;
-  typedef PacketMul1Xs half;
+  typedef Packet2Xs type;
+  typedef Packet1Xs half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -2047,13 +2047,13 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
 };
 
 #elif EIGEN_RISCV64_DEFAULT_LMUL == 4
-typedef PacketMul4Xs PacketXs;
-typedef PacketMul4Xsu PacketXsu;
+typedef Packet4Xs PacketXs;
+typedef Packet4Xsu PacketXsu;
 
 template <>
 struct packet_traits<numext::int16_t> : default_packet_traits {
-  typedef PacketMul4Xs type;
-  typedef PacketMul2Xs half;
+  typedef Packet4Xs type;
+  typedef Packet2Xs half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
@@ -2078,9 +2078,9 @@ struct packet_traits<numext::int16_t> : default_packet_traits {
 #endif
 
 template <>
-struct unpacket_traits<PacketMul1Xs> {
+struct unpacket_traits<Packet1Xs> {
   typedef numext::int16_t type;
-  typedef PacketMul1Xs half;  // Half not yet implemented
+  typedef Packet1Xs half;  // Half not yet implemented
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -2092,9 +2092,9 @@ struct unpacket_traits<PacketMul1Xs> {
 };
 
 template <>
-struct unpacket_traits<PacketMul2Xs> {
+struct unpacket_traits<Packet2Xs> {
   typedef numext::int16_t type;
-  typedef PacketMul1Xs half;
+  typedef Packet1Xs half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -2106,9 +2106,9 @@ struct unpacket_traits<PacketMul2Xs> {
 };
 
 template <>
-struct unpacket_traits<PacketMul4Xs> {
+struct unpacket_traits<Packet4Xs> {
   typedef numext::int16_t type;
-  typedef PacketMul2Xs half;
+  typedef Packet2Xs half;
   typedef numext::uint8_t mask_t;
   enum {
     size = rvv_packet_size_selector::size,
@@ -2126,277 +2126,274 @@ EIGEN_STRONG_INLINE void prefetch<numext::int16_t>(const numext::int16_t* addr)
 #endif
 }
 
-/********************************* PacketMul1Xs ************************************/
+/********************************* Packet1Xs ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pset1<PacketMul1Xs>(const numext::int16_t& from) {
-  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pset1<Packet1Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m1(from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs plset<PacketMul1Xs>(const numext::int16_t& a) {
-  PacketMul1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size));
-  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs plset<Packet1Xs>(const numext::int16_t& a) {
+  Packet1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size));
+  return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pzero<PacketMul1Xs>(const PacketMul1Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pzero<Packet1Xs>(const Packet1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs padd<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs padd<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vadd_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs psub<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs psub<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pnegate(const PacketMul1Xs& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pnegate(const Packet1Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pconj(const PacketMul1Xs& a) {
+EIGEN_STRONG_INLINE Packet1Xs pconj(const Packet1Xs& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pmul<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pmul<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
  return __riscv_vmul(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pdiv<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pdiv<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pnmadd(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
-  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pnmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pnmsub(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c) {
-  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pnmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) {
+  return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pmin<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pmin<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pmax<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pmax<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pcmp_le<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pcmp_le<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pcmp_lt<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pcmp_lt<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pcmp_eq<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pcmp_eq<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs ptrue<PacketMul1Xs>(const PacketMul1Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m1(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs ptrue<Packet1Xs>(const Packet1Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m1(static_cast<numext::int16_t>(0xffffu), unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pand<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pand<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs por<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs por<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vor_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pxor<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pxor<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vxor_vv_i16m1(a, b, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pandnot<PacketMul1Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
-  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<PacketMul1Xs>::size),
-                               unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pandnot<Packet1Xs>(const Packet1Xs& a, const Packet1Xs& b) {
+  return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits<Packet1Xs>::size),
+                               unpacket_traits<Packet1Xs>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xs parithmetic_shift_right(PacketMul1Xs a) {
-  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs parithmetic_shift_right(Packet1Xs a) {
+  return __riscv_vsra_vx_i16m1(a, N, unpacket_traits<Packet1Xs>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_right(PacketMul1Xs a) {
+EIGEN_STRONG_INLINE Packet1Xs plogical_shift_right(Packet1Xs a) {
   return __riscv_vreinterpret_i16m1(
-      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<PacketMul1Xs>::size));
+      __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits<Packet1Xs>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul1Xs plogical_shift_left(PacketMul1Xs a) {
-  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs plogical_shift_left(Packet1Xs a) {
+  return __riscv_vsll_vx_i16m1(a, N, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pload<PacketMul1Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pload<Packet1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs ploadu<PacketMul1Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs ploadu<Packet1Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs ploaddup<PacketMul1Xs>(const numext::int16_t* from) {
-  PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size);
-  idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs ploaddup<Packet1Xs>(const numext::int16_t* from) {
+  Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size);
+  idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits<Packet1Xs>::size);
   // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
-  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketMul1Xs>::size);
+  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs ploadquad<PacketMul1Xs>(const numext::int16_t* from) {
-  PacketMul1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size);
-  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketMul1Xs>::size), 1,
-                              unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs ploadquad<Packet1Xs>(const numext::int16_t* from) {
+  Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size);
+  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<Packet1Xs>::size), 1,
+                              unpacket_traits<Packet1Xs>::size);
+  return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const PacketMul1Xs& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const Packet1Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const PacketMul1Xs& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const Packet1Xs& from) {
  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul1Xs pgather<numext::int16_t, PacketMul1Xs>(const numext::int16_t* from, Index stride) {
-  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul1Xs>::size);
+EIGEN_DEVICE_FUNC inline Packet1Xs pgather<numext::int16_t, Packet1Xs>(const numext::int16_t* from, Index stride) {
+  return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketMul1Xs>(numext::int16_t* to, const PacketMul1Xs& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, Packet1Xs>(numext::int16_t* to, const Packet1Xs& from,
                                                                       Index stride) {
-  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul1Xs>::size);
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketMul1Xs>(const PacketMul1Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t pfirst<Packet1Xs>(const Packet1Xs& a) {
   return __riscv_vmv_x_s_i16m1_i16(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs preverse(const PacketMul1Xs& a) {
-  PacketMul1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketMul1Xs>::size),
-                                             unpacket_traits<PacketMul1Xs>::size - 1, unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs preverse(const Packet1Xs& a) {
+  Packet1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<Packet1Xs>::size),
+                                          unpacket_traits<Packet1Xs>::size - 1, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul1Xs pabs(const PacketMul1Xs& a) {
-  PacketMul1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<PacketMul1Xs>::size);
-  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<PacketMul1Xs>::size), mask,
-                               unpacket_traits<PacketMul1Xs>::size);
+EIGEN_STRONG_INLINE Packet1Xs pabs(const Packet1Xs& a) {
+  Packet1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits<Packet1Xs>::size);
+  return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits<Packet1Xs>::size), mask,
+                               unpacket_traits<Packet1Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul1Xs>(const PacketMul1Xs& a) {
-  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul1Xs>::size),
-                                                      unpacket_traits<PacketMul1Xs>::size));
+EIGEN_STRONG_INLINE numext::int16_t predux<Packet1Xs>(const Packet1Xs& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet1Xs>::size),
+                                                      unpacket_traits<Packet1Xs>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul1Xs>(const PacketMul1Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<Packet1Xs>(const Packet1Xs& a) {
   // Multiply the vector by its reverse
-  PacketMul1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<PacketMul1Xs>::size);
-  PacketMul1Xs half_prod;
+  Packet1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits<Packet1Xs>::size);
+  Packet1Xs half_prod;
 
   if (EIGEN_RISCV64_RVV_VL >= 1024) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<PacketMul1Xs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
  }
  if (EIGEN_RISCV64_RVV_VL >= 512) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<PacketMul1Xs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
   }
   if (EIGEN_RISCV64_RVV_VL >= 256) {
-    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<PacketMul1Xs>::size);
-    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
+    half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits<Packet1Xs>::size);
+    prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
   }
 
   // Last reduction
-  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<PacketMul1Xs>::size);
-  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits<Packet1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
 
-  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<PacketMul1Xs>::size);
-  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<PacketMul1Xs>::size);
+  half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits<Packet1Xs>::size);
+  prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits<Packet1Xs>::size);
 
   // The reduction is done to the first element.
   return pfirst(prod);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketMul1Xs>(const PacketMul1Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_min<Packet1Xs>(const Packet1Xs& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul1Xs>::size),
-      unpacket_traits<PacketMul1Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet1Xs>::size),
+      unpacket_traits<Packet1Xs>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul1Xs>(const PacketMul1Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_max<Packet1Xs>(const Packet1Xs& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul1Xs>::size),
-      unpacket_traits<PacketMul1Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet1Xs>::size),
+      unpacket_traits<Packet1Xs>::size));
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul1Xs, N>& kernel) {
-  numext::int16_t buffer[unpacket_traits<PacketMul1Xs>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet1Xs>::size * N] = {0};
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul1Xs>::size);
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet1Xs>::size);
   }
 
   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<PacketMul1Xs>::size], unpacket_traits<PacketMul1Xs>::size);
+        __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits<Packet1Xs>::size], unpacket_traits<Packet1Xs>::size);
   }
 }
 
 }  // namespace internal
 }  // namespace Eigen
 
-#include "PacketMath4.h"
-#include "PacketMath2.h"
-
 #endif  // EIGEN_PACKET_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h
index 4e262cfe8..1fda51131 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath2.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h
@@ -16,1488 +16,1488 @@ namespace Eigen {
 
 namespace internal {
 
-/********************************* PacketMul2Xi ************************************/
+/********************************* Packet2Xi ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pset1<PacketMul2Xi>(const numext::int32_t& from) {
-  return __riscv_vmv_v_x_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pset1<Packet2Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m2(from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi plset<PacketMul2Xi>(const numext::int32_t& a) {
-  PacketMul2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size));
-  return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi plset<Packet2Xi>(const numext::int32_t& a) {
+  Packet2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size));
+  return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pzero<PacketMul2Xi>(const PacketMul2Xi& /*a*/) {
-  return __riscv_vmv_v_x_i32m2(0, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pzero<Packet2Xi>(const Packet2Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m2(0, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi padd<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vadd_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi padd<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vadd_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi psub<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi psub<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pnegate(const Packet2Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE Packet2Xi pconj(const Packet2Xi& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pmul<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pmul<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pdiv<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pdiv<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
-  return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pnmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) {
-  return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pnmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) {
+  return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pmin<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pmin<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pmax<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pmax<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pcmp_le<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pcmp_lt<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pcmp_eq<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi ptrue<PacketMul2Xi>(const PacketMul2Xi& /*a*/) {
-  return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi ptrue<Packet2Xi>(const Packet2Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pand<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vand_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pand<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vand_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi por<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vor_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi por<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vor_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pxor<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vxor_vv_i32m2(a, b, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pxor<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vxor_vv_i32m2(a, b, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pandnot<PacketMul2Xi>(const PacketMul2Xi& a, const PacketMul2Xi& b) {
-  return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits<PacketMul2Xi>::size),
-                               unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pandnot<Packet2Xi>(const Packet2Xi& a, const Packet2Xi& b) {
+  return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits<Packet2Xi>::size),
+                               unpacket_traits<Packet2Xi>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) {
-  return __riscv_vsra_vx_i32m2(a, N, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi parithmetic_shift_right(Packet2Xi a) {
+  return __riscv_vsra_vx_i32m2(a, N, unpacket_traits<Packet2Xi>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) {
+EIGEN_STRONG_INLINE Packet2Xi plogical_shift_right(Packet2Xi a) {
   return __riscv_vreinterpret_i32m2(
-      __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits<PacketMul2Xi>::size));
+      __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits<Packet2Xi>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) {
-  return __riscv_vsll_vx_i32m2(a, N, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi plogical_shift_left(Packet2Xi a) {
+  return __riscv_vsll_vx_i32m2(a, N, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pload<PacketMul2Xi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pload<Packet2Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi ploadu<PacketMul2Xi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi ploadu<Packet2Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi ploaddup<PacketMul2Xi>(const numext::int32_t* from) {
-  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size);
-  idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits<PacketMul2Xi>::size), 1,
-                              unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi ploaddup<Packet2Xi>(const numext::int32_t* from) {
+  Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size);
+  idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits<Packet2Xi>::size), 1,
+                              unpacket_traits<Packet2Xi>::size);
   // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
-  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
+  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi ploadquad<PacketMul2Xi>(const numext::int32_t* from) {
-  PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size);
-  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi ploadquad<Packet2Xi>(const numext::int32_t* from) {
+  Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size);
+  idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketMul2Xi& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const Packet2Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketMul2Xi& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const Packet2Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather<numext::int32_t, PacketMul2Xi>(const numext::int32_t* from,
+EIGEN_DEVICE_FUNC inline Packet2Xi pgather<numext::int32_t, Packet2Xi>(const numext::int32_t* from,
                                                                              Index stride) {
-  return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul2Xi>::size);
+  return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketMul2Xi>(numext::int32_t* to, const PacketMul2Xi& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, Packet2Xi>(numext::int32_t* to, const Packet2Xi& from,
                                                                       Index stride) {
-  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul2Xi>::size);
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketMul2Xi>(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t pfirst<Packet2Xi>(const Packet2Xi& a) {
   return __riscv_vmv_x_s_i32m2_i32(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) {
-  PacketMul2Xu idx =
-      __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xi>::size),
-                             unpacket_traits<PacketMul2Xi>::size - 1, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi preverse(const Packet2Xi& a) {
+  Packet2Xu idx =
+      __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xi>::size),
+                             unpacket_traits<Packet2Xi>::size - 1, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) {
-  PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits<PacketMul2Xi>::size);
-  return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits<PacketMul2Xi>::size), mask,
-                               unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pabs(const Packet2Xi& a) {
+  Packet2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits<Packet2Xi>::size);
+  return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits<Packet2Xi>::size), mask,
+                               unpacket_traits<Packet2Xi>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux<PacketMul2Xi>(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux<Packet2Xi>(const Packet2Xi& a) {
   return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1(
-      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul2Xi>::size / 2), unpacket_traits<PacketMul2Xi>::size));
+      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet2Xi>::size / 2), unpacket_traits<Packet2Xi>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketMul2Xi>(const PacketMul2Xi& a) {
-  return predux_mul<PacketMul1Xi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
-                                                        unpacket_traits<PacketMul1Xi>::size));
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<Packet2Xi>(const Packet2Xi& a) {
+  return predux_mul<Packet1Xi>(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
+                                                     unpacket_traits<Packet1Xi>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketMul2Xi>(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_min<Packet2Xi>(const Packet2Xi& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul2Xi>::size / 2),
-      unpacket_traits<PacketMul2Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet2Xi>::size / 2),
+      unpacket_traits<Packet2Xi>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketMul2Xi>(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_max<Packet2Xi>(const Packet2Xi& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul2Xi>::size / 2),
-      unpacket_traits<PacketMul2Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet2Xi>::size / 2),
+      unpacket_traits<Packet2Xi>::size));
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xi, N>& kernel) {
-  numext::int32_t buffer[unpacket_traits<PacketMul2Xi>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet2Xi>::size * N] = {0};
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul2Xi>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet2Xi>::size);
  }
 
  for (i = 0; i < N; i++) {
    kernel.packet[i] =
-        __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits<PacketMul2Xi>::size], unpacket_traits<PacketMul2Xi>::size);
+        __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits<Packet2Xi>::size], unpacket_traits<Packet2Xi>::size);
  }
 }
 
-template <typename Packet = PacketMul4Xi>
+template <typename Packet = Packet4Xi>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul4Xi>::value && (unpacket_traits<PacketMul4Xi>::size % 8) == 0,
-                        PacketMul2Xi>::type
-predux_half_dowto4(const PacketMul4Xi& a) {
+typename std::enable_if<std::is_same<Packet, Packet4Xi>::value && (unpacket_traits<Packet4Xi>::size % 8) == 0,
+                        Packet2Xi>::type
+predux_half_dowto4(const Packet4Xi& a) {
   return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1),
-                               unpacket_traits<PacketMul2Xi>::size);
+                               unpacket_traits<Packet2Xi>::size);
 }
 
-template <typename Packet = PacketMul2Xi>
+template <typename Packet = Packet2Xi>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul2Xi>::value && (unpacket_traits<PacketMul2Xi>::size % 8) == 0,
-                        PacketMul1Xi>::type
-predux_half_dowto4(const PacketMul2Xi& a) {
+typename std::enable_if<std::is_same<Packet, Packet2Xi>::value && (unpacket_traits<Packet2Xi>::size % 8) == 0,
+                        Packet1Xi>::type
+predux_half_dowto4(const Packet2Xi& a) {
   return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1),
-                               unpacket_traits<PacketMul1Xi>::size);
+                               unpacket_traits<Packet1Xi>::size);
 }
 
-/********************************* PacketMul2Xf ************************************/
+/********************************* Packet2Xf ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf ptrue<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
-  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<PacketMul2Xf>::size));
+EIGEN_STRONG_INLINE Packet2Xf ptrue<Packet2Xf>(const Packet2Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits<Packet2Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pzero<PacketMul2Xf>(const PacketMul2Xf& /*a*/) {
-  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pzero<Packet2Xf>(const Packet2Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) {
-  return __riscv_vfabs_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pabs(const Packet2Xf& a) {
+  return __riscv_vfabs_v_f32m2(a, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pset1<PacketMul2Xf>(const float& from) {
-  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pset1<Packet2Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m2(from, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits<PacketMul2Xf>(numext::uint32_t from) {
-  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<PacketMul2Xf>::size));
+EIGEN_STRONG_INLINE Packet2Xf pset1frombits<Packet2Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits<Packet2Xf>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf plset<PacketMul2Xf>(const float& a) {
-  PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(
-      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<PacketMul2Xf>::size)),
-      unpacket_traits<PacketMul2Xf>::size);
-  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf plset<Packet2Xf>(const float& a) {
+  Packet2Xf idx = __riscv_vfcvt_f_x_v_f32m2(
+      __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits<Packet2Xf>::size)),
+      unpacket_traits<Packet2Xf>::size);
+  return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf padd<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf padd<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf psub<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf psub<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) {
-  return __riscv_vfneg_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pnegate(const Packet2Xf& a) {
+  return __riscv_vfneg_v_f32m2(a, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) {
+EIGEN_STRONG_INLINE Packet2Xf pconj(const Packet2Xf& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmul<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pmul<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pdiv<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pdiv<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pnmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) {
-  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pnmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) {
+  return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits<Packet2Xf>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pmin<PacketMul2Xf>(const PacketMul2Xf& a, const PacketMul2Xf& b) {
-  PacketMul2Xf nans =
-      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<PacketMul2Xf>::size);
-  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<PacketMul2Xf>::size);
-  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pmin<Packet2Xf>(const Packet2Xf& a, const Packet2Xf& b) {
+  Packet2Xf nans =
+      __riscv_vfmv_v_f_f32m2((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet2Xf>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits<Packet2Xf>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits<Packet2Xf>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet2Xf>::size);
 
-  return
__riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); + return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return pmin(a, b); +EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const Packet2Xf& b) { + return pmin(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMul2Xf nans = - __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); - PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + Packet2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); - return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); + return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return pmax(a, b); +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + return pmax(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { - return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pcmp_le(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pcmp_lt(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pcmp_eq(const Packet2Xf& a, 
const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf& a, const PacketMul2Xf& b) { - PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pcmp_lt_or_nan(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); } // Logical Operations are not supported for float, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) { +EIGEN_STRONG_INLINE Packet2Xf pand(const Packet2Xf& a, const Packet2Xf& b) { return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) { +EIGEN_STRONG_INLINE Packet2Xf por(const Packet2Xf& a, const Packet2Xf& b) { return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) { +EIGEN_STRONG_INLINE Packet2Xf pxor(const Packet2Xf& a, const Packet2Xf& b) { return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), __riscv_vreinterpret_v_f32m2_u32m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) { +EIGEN_STRONG_INLINE Packet2Xf pandnot(const Packet2Xf& a, const Packet2Xf& b) { return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( __riscv_vreinterpret_v_f32m2_u32m2(a), - __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), - unpacket_traits::size)); + __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pload(const float* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { - PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf ploaddup(const float* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 
0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { - PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); - idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); - return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf ploadquad(const float* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul2Xf& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(float* to, const Packet2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul2Xf& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather(const float* from, Index stride) { - return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +EIGEN_DEVICE_FUNC inline Packet2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul2Xf& from, Index stride) { - __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE float pfirst(const PacketMul2Xf& a) { +EIGEN_STRONG_INLINE float pfirst(const Packet2Xf& a) { return __riscv_vfmv_f_s_f32m2_f32(a); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { - return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf psqrt(const Packet2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { - const PacketMul2Xf limit = pset1(static_cast(1 << 23)); - const PacketMul2Xf abs_a = pabs(a); +EIGEN_STRONG_INLINE Packet2Xf print(const Packet2Xf& a) { + const Packet2Xf limit = pset1(static_cast(1 << 23)); + const Packet2Xf abs_a = pabs(a); - PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); - const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); - const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( - __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const Packet2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); + const Packet2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); - mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); - PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); - return 
__riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + Packet2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { - PacketMul2Xf tmp = print(a); +EIGEN_STRONG_INLINE Packet2Xf pfloor(const Packet2Xf& a) { + Packet2Xf tmp = print(a); // If greater, subtract one. - PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); - return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { - PacketMul2Xu idx = - __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xf preverse(const Packet2Xf& a) { + Packet2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { +EIGEN_STRONG_INLINE Packet2Xf pfrexp(const Packet2Xf& a, Packet2Xf& exponent) { return pfrexp_generic(a, exponent); } template <> -EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { +EIGEN_STRONG_INLINE float predux(const Packet2Xf& a) { return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( - a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { - return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), - unpacket_traits::size)); +EIGEN_STRONG_INLINE float predux_mul(const Packet2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { +EIGEN_STRONG_INLINE float predux_min(const Packet2Xf& a) { return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), + unpacket_traits::size / 2), + unpacket_traits::size)), (std::numeric_limits::max)()); } template <> -EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { +EIGEN_STRONG_INLINE float predux_max(const Packet2Xf& a) { return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), - unpacket_traits::size / 2), - unpacket_traits::size)), + unpacket_traits::size / 2), + unpacket_traits::size)), -(std::numeric_limits::max)()); } template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - float buffer[unpacket_traits::size * N]; +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse32(&buffer[i], N * sizeof(float), 
kernel.packet[i], unpacket_traits::size); + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = - __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template <> -EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { +EIGEN_STRONG_INLINE Packet2Xf pldexp(const Packet2Xf& a, const Packet2Xf& exponent) { return pldexp_generic(a, exponent); } -template +template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xf>::type -predux_half_dowto4(const PacketMul4Xf& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xf>::type +predux_half_dowto4(const Packet4Xf& a) { return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), - unpacket_traits::size); + unpacket_traits::size); } -template +template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xf>::type -predux_half_dowto4(const PacketMul2Xf& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xf>::type +predux_half_dowto4(const Packet2Xf& a) { return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); } -/********************************* PacketMul2Xl ************************************/ +/********************************* Packet2Xl ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { - return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { - PacketMul2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); - return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl plset(const numext::int64_t& a) { + Packet2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { - return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pzero(const Packet2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl padd(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vsub(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl psub(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { - return __riscv_vneg(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pnegate(const Packet2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl 
pconj(const PacketMul2Xl& a) { +EIGEN_STRONG_INLINE Packet2Xl pconj(const Packet2Xl& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmul(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pmul(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pdiv(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pnmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { - return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pnmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmin(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pmin(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vmax(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pmax(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pcmp_le(const Packet2Xl& a, const Packet2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pcmp_lt(const Packet2Xl& a, const Packet2Xl& b) { + 
PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { - PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pcmp_eq(const Packet2Xl& a, const Packet2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { - return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl ptrue(const Packet2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pand(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl por(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pxor(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { - return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pandnot(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { - return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl parithmetic_shift_right(Packet2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); } template -EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { +EIGEN_STRONG_INLINE Packet2Xl plogical_shift_right(Packet2Xl a) { return __riscv_vreinterpret_i64m2( - __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); } template -EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { - return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl plogical_shift_left(Packet2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); } template <> 
-EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl ploaddup(const numext::int64_t* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... - return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, - unpacket_traits::size); - return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl ploadquad(const numext::int64_t* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { - EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const Packet2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { - EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Packet2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, +EIGEN_DEVICE_FUNC inline Packet2Xl pgather(const numext::int64_t* from, Index stride) { - return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); } template <> -EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet2Xl& from, Index stride) { - __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { +EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet2Xl& a) { return __riscv_vmv_x_s_i64m2_i64(a); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { - PacketMul2Xul idx = - 
__riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), - unpacket_traits::size - 1, unpacket_traits::size); - return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl preverse(const Packet2Xl& a) { + Packet2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { - PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); - return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xl pabs(const Packet2Xl& a) { + Packet2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { +EIGEN_STRONG_INLINE numext::int64_t predux(const Packet2Xl& a) { return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { - return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), - unpacket_traits::size)); +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet2Xl& a) { + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { +EIGEN_STRONG_INLINE numext::int64_t predux_min(const Packet2Xl& a) { return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), - unpacket_traits::size)); + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { +EIGEN_STRONG_INLINE numext::int64_t predux_max(const Packet2Xl& a) { return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( - a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), - unpacket_traits::size)); + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), + unpacket_traits::size)); } template -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - numext::int64_t buffer[unpacket_traits::size * N] = {0}; +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; int i = 0; for (i = 0; i < N; i++) { - __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = - __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } -template +template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul2Xl>::type -predux_half_dowto4(const PacketMul4Xl& a) { +typename 
std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xl>::type +predux_half_dowto4(const Packet4Xl& a) { return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), - unpacket_traits::size); + unpacket_traits::size); } -template +template EIGEN_STRONG_INLINE -typename std::enable_if::value && (unpacket_traits::size % 8) == 0, - PacketMul1Xl>::type -predux_half_dowto4(const PacketMul2Xl& a) { +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xl>::type +predux_half_dowto4(const Packet2Xl& a) { return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), - unpacket_traits::size); + unpacket_traits::size); } -/********************************* PacketMul2Xd ************************************/ +/********************************* Packet2Xd ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { - return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +EIGEN_STRONG_INLINE Packet2Xd ptrue(const Packet2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { - return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pzero(const Packet2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { - return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pabs(const Packet2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { - return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { - return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +EIGEN_STRONG_INLINE Packet2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { - PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2( - __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), - unpacket_traits::size); - return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd plset(const double& a) { + Packet2Xd idx = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd padd(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd padd(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd psub(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); } template <> 
-EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { - return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pnegate(const Packet2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { +EIGEN_STRONG_INLINE Packet2Xd pconj(const Packet2Xd& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmul(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pdiv(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pdiv(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmsub(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pnmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { - return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pnmsub(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + Packet2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); + return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return pmin(a, b); 
+EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + return pmin(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMul2Xd nans = - __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); - PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); - mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + Packet2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); - return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); + return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return pmax(a, b); +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + return pmax(a, b); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { - return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pcmp_le(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pcmp_lt(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, - unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pcmp_eq(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd& 
a, const PacketMul2Xd& b) { - PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); - return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pcmp_lt_or_nan(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); } // Logical Operations are not supported for double, so reinterpret casts template <> -EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { +EIGEN_STRONG_INLINE Packet2Xd pand(const Packet2Xd& a, const Packet2Xd& b) { return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { +EIGEN_STRONG_INLINE Packet2Xd por(const Packet2Xd& a, const Packet2Xd& b) { return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { +EIGEN_STRONG_INLINE Packet2Xd pxor(const Packet2Xd& a, const Packet2Xd& b) { return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), __riscv_vreinterpret_v_f64m2_u64m2(b), - unpacket_traits::size)); + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { +EIGEN_STRONG_INLINE Packet2Xd pandnot(const Packet2Xd& a, const Packet2Xd& b) { return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( __riscv_vreinterpret_v_f64m2_u64m2(a), - __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), - unpacket_traits::size)); + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { - EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { - PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); - idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, - unpacket_traits::size); - return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet2Xd ploaddup(const double* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { - PacketMul2Xul idx = 
__riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size);
-  idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul2Xd>::size), 1,
-                              unpacket_traits<PacketMul2Xd>::size);
-  return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE Packet2Xd ploadquad<Packet2Xd>(const double* from) {
+  Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits<Packet2Xd>::size);
+  idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits<Packet2Xd>::size), 1,
+                              unpacket_traits<Packet2Xd>::size);
+  return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE void pstore(double* to, const Packet2Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather<double, PacketMul2Xd>(const double* from, Index stride) {
-  return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits<PacketMul2Xd>::size);
+EIGEN_DEVICE_FUNC inline Packet2Xd pgather<double, Packet2Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul2Xd& from, Index stride) {
-  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) {
+EIGEN_STRONG_INLINE double pfirst(const Packet2Xd& a) {
   return __riscv_vfmv_f_s_f64m2_f64(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) {
-  return __riscv_vfsqrt_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE Packet2Xd psqrt(const Packet2Xd& a) {
+  return __riscv_vfsqrt_v_f64m2(a, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) {
-  const PacketMul2Xd limit = pset1<PacketMul2Xd>(static_cast<double>(1ull << 52));
-  const PacketMul2Xd abs_a = pabs(a);
+EIGEN_STRONG_INLINE Packet2Xd print(const Packet2Xd& a) {
+  const Packet2Xd limit = pset1<Packet2Xd>(static_cast<double>(1ull << 52));
+  const Packet2Xd abs_a = pabs(a);

-  PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits<PacketMul2Xd>::size);
-  const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits<PacketMul2Xd>::size);
-  const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2(
-      __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits<PacketMul2Xd>::size), unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits<Packet2Xd>::size);
+  const Packet2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits<Packet2Xd>::size);
+  const Packet2Xd new_x = __riscv_vfcvt_f_x_v_f64m2(
+      __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits<Packet2Xd>::size), unpacket_traits<Packet2Xd>::size);

-  mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits<PacketMul2Xd>::size);
-  PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits<PacketMul2Xd>::size);
-  return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits<PacketMul2Xd>::size);
+  mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits<Packet2Xd>::size);
+  Packet2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) {
-  PacketMul2Xd tmp = print(a);
+EIGEN_STRONG_INLINE Packet2Xd pfloor(const Packet2Xd& a) {
+  Packet2Xd tmp = print(a);
   // If greater, subtract one.
-  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits<PacketMul2Xd>::size);
-  return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul2Xd>::size);
+  PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) {
-  PacketMul2Xul idx =
-      __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits<PacketMul2Xd>::size),
-                             unpacket_traits<PacketMul2Xd>::size - 1, unpacket_traits<PacketMul2Xd>::size);
-  return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE Packet2Xd preverse(const Packet2Xd& a) {
+  Packet2Xul idx =
+      __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits<Packet2Xd>::size),
+                             unpacket_traits<Packet2Xd>::size - 1, unpacket_traits<Packet2Xd>::size);
+  return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) {
+EIGEN_STRONG_INLINE Packet2Xd pfrexp(const Packet2Xd& a, Packet2Xd& exponent) {
   return pfrexp_generic(a, exponent);
 }

 template <>
-EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) {
+EIGEN_STRONG_INLINE double predux(const Packet2Xd& a) {
   return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1(
-      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul2Xd>::size / 2), unpacket_traits<PacketMul2Xd>::size));
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet2Xd>::size / 2), unpacket_traits<Packet2Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) {
-  return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
-                                           unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE double predux_mul(const Packet2Xd& a) {
+  return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
+                                           unpacket_traits<Packet1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) {
+EIGEN_STRONG_INLINE double predux_min(const Packet2Xd& a) {
   return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1(
                         a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(),
-                                                  unpacket_traits<PacketMul2Xd>::size / 2),
-                        unpacket_traits<PacketMul2Xd>::size)),
+                                                  unpacket_traits<Packet2Xd>::size / 2),
+                        unpacket_traits<Packet2Xd>::size)),
                     (std::numeric_limits<double>::max)());
 }

 template <>
-EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) {
+EIGEN_STRONG_INLINE double predux_max(const Packet2Xd& a) {
   return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1(
                         a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(),
-                                                  unpacket_traits<PacketMul2Xd>::size / 2),
-                        unpacket_traits<PacketMul2Xd>::size)),
+                                                  unpacket_traits<Packet2Xd>::size / 2),
+                        unpacket_traits<Packet2Xd>::size)),
                     -(std::numeric_limits<double>::max)());
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xd, N>& kernel) {
-  double buffer[unpacket_traits<PacketMul2Xd>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet2Xd>::size * N];
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul2Xd>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet2Xd>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits<PacketMul2Xd>::size], unpacket_traits<PacketMul2Xd>::size);
+        __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits<Packet2Xd>::size], unpacket_traits<Packet2Xd>::size);
   }
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) {
+EIGEN_STRONG_INLINE Packet2Xd pldexp(const Packet2Xd& a, const Packet2Xd& exponent) {
   return pldexp_generic(a, exponent);
 }

-template <typename Packet = PacketMul4Xd>
+template <typename Packet = Packet4Xd>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul4Xd>::value && (unpacket_traits<PacketMul4Xd>::size % 8) == 0,
-                        PacketMul2Xd>::type
-predux_half_dowto4(const PacketMul4Xd& a) {
+typename std::enable_if<std::is_same<Packet, Packet4Xd>::value && (unpacket_traits<Packet4Xd>::size % 8) == 0,
+                        Packet2Xd>::type
+predux_half_dowto4(const Packet4Xd& a) {
   return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1),
-                                unpacket_traits<PacketMul2Xd>::size);
+                                unpacket_traits<Packet2Xd>::size);
 }

-template <typename Packet = PacketMul2Xd>
+template <typename Packet = Packet2Xd>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul2Xd>::value && (unpacket_traits<PacketMul2Xd>::size % 8) == 0,
-                        PacketMul1Xd>::type
-predux_half_dowto4(const PacketMul2Xd& a) {
+typename std::enable_if<std::is_same<Packet, Packet2Xd>::value && (unpacket_traits<Packet2Xd>::size % 8) == 0,
+                        Packet1Xd>::type
+predux_half_dowto4(const Packet2Xd& a) {
   return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1),
-                                unpacket_traits<PacketMul1Xd>::size);
+                                unpacket_traits<Packet1Xd>::size);
 }
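(Aside, not part of the patch.) The predux_half_dowto4 overloads above fold an
LMUL=4 register group into an LMUL=2 group by adding its two halves, which
__riscv_vget extracts. A minimal scalar sketch of that halving step, assuming a
hypothetical fixed width of 8 doubles (all names here are illustrative):

#include <array>

std::array<double, 4> predux_half_sketch(const std::array<double, 8>& a) {
  std::array<double, 4> r{};
  for (int i = 0; i < 4; ++i) r[i] = a[i] + a[i + 4];  // lower half + upper half
  return r;
}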

-/********************************* PacketMul2Xs ************************************/
+/********************************* Packet2Xs ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pset1<PacketMul2Xs>(const numext::int16_t& from) {
-  return __riscv_vmv_v_x_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pset1<Packet2Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m2(from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs plset<PacketMul2Xs>(const numext::int16_t& a) {
-  PacketMul2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size));
-  return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs plset<Packet2Xs>(const numext::int16_t& a) {
+  Packet2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size));
+  return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pzero(const PacketMul2Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m2(0, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pzero(const Packet2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(0, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs padd(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vadd_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs padd(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vadd_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs psub(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs psub(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pnegate(const Packet2Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE Packet2Xs pconj(const Packet2Xs& a) {
   return a;
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pmul(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pmul(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pdiv(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pdiv(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
-  return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pnmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) {
-  return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pnmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) {
+  return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pmin(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pmin(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pmax(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pmax(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pcmp_le(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pcmp_lt(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pcmp_eq(const Packet2Xs& a, const Packet2Xs& b) {
+  PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs ptrue(const PacketMul2Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m2(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs ptrue(const Packet2Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m2(static_cast<numext::int16_t>(0xffffu), unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pand(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vand_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pand(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs por(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs por(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vor_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vxor_vv_i16m2(a, b, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pxor(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vxor_vv_i16m2(a, b, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) {
-  return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits<PacketMul2Xs>::size),
-                               unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pandnot(const Packet2Xs& a, const Packet2Xs& b) {
+  return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits<Packet2Xs>::size),
+                               unpacket_traits<Packet2Xs>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) {
-  return __riscv_vsra_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs parithmetic_shift_right(Packet2Xs a) {
+  return __riscv_vsra_vx_i16m2(a, N, unpacket_traits<Packet2Xs>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) {
+EIGEN_STRONG_INLINE Packet2Xs plogical_shift_right(Packet2Xs a) {
   return __riscv_vreinterpret_i16m2(
-      __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits<PacketMul2Xs>::size));
+      __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits<Packet2Xs>::size));
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) {
-  return __riscv_vsll_vx_i16m2(a, N, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs plogical_shift_left(Packet2Xs a) {
+  return __riscv_vsll_vx_i16m2(a, N, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pload<PacketMul2Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pload<Packet2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs ploadu<PacketMul2Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs ploadu<Packet2Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs ploaddup<PacketMul2Xs>(const numext::int16_t* from) {
-  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size);
-  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs ploaddup<Packet2Xs>(const numext::int16_t* from) {
+  Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size);
+  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<Packet2Xs>::size);
   // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
-  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<PacketMul2Xs>::size);
+  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs ploadquad<PacketMul2Xs>(const numext::int16_t* from) {
-  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size);
-  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<PacketMul2Xs>::size), 1,
-                              unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs ploadquad<Packet2Xs>(const numext::int16_t* from) {
+  Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size);
+  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<Packet2Xs>::size), 1,
+                              unpacket_traits<Packet2Xs>::size);
+  return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const Packet2Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Packet2Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather<numext::int16_t, PacketMul2Xs>(const numext::int16_t* from,
+EIGEN_DEVICE_FUNC inline Packet2Xs pgather<numext::int16_t, Packet2Xs>(const numext::int16_t* from,
                                                                         Index stride) {
-  return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul2Xs>::size);
+  return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from,
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet2Xs& from,
                                        Index stride) {
-  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul2Xs>::size);
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet2Xs& a) {
   return __riscv_vmv_x_s_i16m2_i16(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) {
-  PacketMul2Xsu idx =
-      __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xs>::size),
-                             unpacket_traits<PacketMul2Xs>::size - 1, unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs preverse(const Packet2Xs& a) {
+  Packet2Xsu idx =
+      __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xs>::size),
+                             unpacket_traits<Packet2Xs>::size - 1, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) {
-  PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits<PacketMul2Xs>::size);
-  return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits<PacketMul2Xs>::size), mask,
-                               unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pabs(const Packet2Xs& a) {
+  Packet2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits<Packet2Xs>::size);
+  return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits<Packet2Xs>::size), mask,
+                               unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux(const Packet2Xs& a) {
   return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1(
-      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul2Xs>::size / 2), unpacket_traits<PacketMul2Xs>::size));
+      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet2Xs>::size / 2), unpacket_traits<Packet2Xs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) {
-  return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
-                                          unpacket_traits<PacketMul1Xs>::size));
+EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet2Xs& a) {
+  return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
+                                          unpacket_traits<Packet1Xs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_min(const Packet2Xs& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul2Xs>::size / 2),
-      unpacket_traits<PacketMul2Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet2Xs>::size / 2),
+      unpacket_traits<Packet2Xs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_max(const Packet2Xs& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul2Xs>::size / 2),
-      unpacket_traits<PacketMul2Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet2Xs>::size / 2),
+      unpacket_traits<Packet2Xs>::size));
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xs, N>& kernel) {
-  numext::int16_t buffer[unpacket_traits<PacketMul2Xs>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet2Xs>::size * N] = {0};
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul2Xs>::size);
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet2Xs>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits<PacketMul2Xs>::size], unpacket_traits<PacketMul2Xs>::size);
+        __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits<Packet2Xs>::size], unpacket_traits<Packet2Xs>::size);
   }
 }
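(Aside, not part of the patch.) The ptranspose specializations above use a
strided store so that element j of packet i lands at buffer[j * N + i], then
reload each packet contiguously. The same permutation for a square block in
plain C++, under the assumption that the block has N packets of N elements:

#include <vector>

void transpose_square_block(std::vector<std::vector<int>>& packets) {
  const int N = static_cast<int>(packets.size());
  std::vector<int> buffer(static_cast<size_t>(N) * N);
  // Strided store: element j of packet i goes to buffer[j * N + i].
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++) buffer[j * N + i] = packets[i][j];
  // Contiguous reload: packet i becomes row i of the buffer.
  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++) packets[i][j] = buffer[i * N + j];
}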

-template <typename Packet = PacketMul4Xs>
+template <typename Packet = Packet4Xs>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul4Xs>::value && (unpacket_traits<PacketMul4Xs>::size % 8) == 0,
-                        PacketMul2Xs>::type
-predux_half_dowto4(const PacketMul4Xs& a) {
+typename std::enable_if<std::is_same<Packet, Packet4Xs>::value && (unpacket_traits<Packet4Xs>::size % 8) == 0,
+                        Packet2Xs>::type
+predux_half_dowto4(const Packet4Xs& a) {
   return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1),
-                               unpacket_traits<PacketMul2Xs>::size);
+                               unpacket_traits<Packet2Xs>::size);
 }

-template <typename Packet = PacketMul2Xs>
+template <typename Packet = Packet2Xs>
 EIGEN_STRONG_INLINE
-typename std::enable_if<std::is_same<Packet, PacketMul2Xs>::value && (unpacket_traits<PacketMul2Xs>::size % 8) == 0,
-                        PacketMul1Xs>::type
-predux_half_dowto4(const PacketMul2Xs& a) {
+typename std::enable_if<std::is_same<Packet, Packet2Xs>::value && (unpacket_traits<Packet2Xs>::size % 8) == 0,
+                        Packet1Xs>::type
+predux_half_dowto4(const Packet2Xs& a) {
   return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
-                               unpacket_traits<PacketMul1Xs>::size);
+                               unpacket_traits<Packet1Xs>::size);
 }

 }  // namespace internal

diff --git a/Eigen/src/Core/arch/RVV10/PacketMath4.h b/Eigen/src/Core/arch/RVV10/PacketMath4.h
index fbdd212ef..30f5ca33d 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath4.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath4.h
@@ -16,1412 +16,1412 @@
 namespace Eigen {

 namespace internal {

-/********************************* PacketMul4Xi ************************************/
+/********************************* Packet4Xi ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pset1<PacketMul4Xi>(const numext::int32_t& from) {
-  return __riscv_vmv_v_x_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pset1<Packet4Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m4(from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi plset<PacketMul4Xi>(const numext::int32_t& a) {
-  PacketMul4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size));
-  return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi plset<Packet4Xi>(const numext::int32_t& a) {
+  Packet4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size));
+  return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) {
-  return __riscv_vmv_v_x_i32m4(0, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pzero(const Packet4Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m4(0, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vadd_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi padd(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vadd_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi psub(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pnegate(const Packet4Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE Packet4Xi pconj(const Packet4Xi& a) {
   return a;
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pmul(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pdiv(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
-  return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pnmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) {
-  return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pnmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) {
+  return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pmin(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pmax(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pcmp_le(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pcmp_lt(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pcmp_eq(const Packet4Xi& a, const Packet4Xi& b) {
+  PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) {
-  return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi ptrue(const Packet4Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vand_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pand(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vand_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vor_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi por(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vor_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vxor_vv_i32m4(a, b, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pxor(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vxor_vv_i32m4(a, b, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) {
-  return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits<PacketMul4Xi>::size),
-                               unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pandnot(const Packet4Xi& a, const Packet4Xi& b) {
+  return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits<Packet4Xi>::size),
+                               unpacket_traits<Packet4Xi>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) {
-  return __riscv_vsra_vx_i32m4(a, N, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi parithmetic_shift_right(Packet4Xi a) {
+  return __riscv_vsra_vx_i32m4(a, N, unpacket_traits<Packet4Xi>::size);
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) {
+EIGEN_STRONG_INLINE Packet4Xi plogical_shift_right(Packet4Xi a) {
   return __riscv_vreinterpret_i32m4(
-      __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits<PacketMul4Xi>::size));
+      __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits<Packet4Xi>::size));
 }

 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) {
-  return __riscv_vsll_vx_i32m4(a, N, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi plogical_shift_left(Packet4Xi a) {
+  return __riscv_vsll_vx_i32m4(a, N, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pload<PacketMul4Xi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pload<Packet4Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi ploadu<PacketMul4Xi>(const numext::int32_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi ploadu<Packet4Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi ploaddup<PacketMul4Xi>(const numext::int32_t* from) {
-  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size);
-  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<PacketMul4Xi>::size), 1,
-                              unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi ploaddup<Packet4Xi>(const numext::int32_t* from) {
+  Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size);
+  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<Packet4Xi>::size), 1,
+                              unpacket_traits<Packet4Xi>::size);
   // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
-  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<PacketMul4Xi>::size);
+  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi ploadquad<PacketMul4Xi>(const numext::int32_t* from) {
-  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size);
-  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi ploadquad<Packet4Xi>(const numext::int32_t* from) {
+  Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size);
+  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits<Packet4Xi>::size);
 }
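(Aside, not part of the patch.) ploaddup and ploadquad build byte-offset index
vectors for an indexed load: for int32, (vid & ~1) << 1 repeats each source
element twice and vid & ~3 repeats it four times. A scalar model of the
ploaddup offsets (function name and bounds are illustrative):

#include <cstdint>
#include <vector>

std::vector<int32_t> loaddup_model(const int32_t* from, int n_out) {
  std::vector<int32_t> out(n_out);
  for (int i = 0; i < n_out; ++i) {
    uint32_t byte_off = (static_cast<uint32_t>(i) & ~1u) << 1;  // 0,0,4,4,8,8,...
    out[i] = from[byte_off / sizeof(int32_t)];                  // 0,0,1,1,2,2,...
  }
  return out;
}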

 template <>
-EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const Packet4Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Packet4Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather<numext::int32_t, PacketMul4Xi>(const numext::int32_t* from,
+EIGEN_DEVICE_FUNC inline Packet4Xi pgather<numext::int32_t, Packet4Xi>(const numext::int32_t* from,
                                                                         Index stride) {
-  return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits<PacketMul4Xi>::size);
+  return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from,
+EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet4Xi& from,
                                        Index stride) {
-  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<PacketMul4Xi>::size);
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet4Xi& a) {
   return __riscv_vmv_x_s_i32m4_i32(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) {
-  PacketMul4Xu idx =
-      __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xi>::size),
-                             unpacket_traits<PacketMul4Xi>::size - 1, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi preverse(const Packet4Xi& a) {
+  Packet4Xu idx =
+      __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xi>::size),
+                             unpacket_traits<Packet4Xi>::size - 1, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) {
-  PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits<PacketMul4Xi>::size);
-  return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits<PacketMul4Xi>::size), mask,
-                               unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pabs(const Packet4Xi& a) {
+  Packet4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits<Packet4Xi>::size);
+  return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits<Packet4Xi>::size), mask,
+                               unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux(const Packet4Xi& a) {
   return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1(
-      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<PacketMul4Xi>::size / 4), unpacket_traits<PacketMul4Xi>::size));
+      a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet4Xi>::size / 4), unpacket_traits<Packet4Xi>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) {
-  PacketMul1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
-                                             unpacket_traits<PacketMul1Xi>::size);
-  PacketMul1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
-                                             unpacket_traits<PacketMul1Xi>::size);
-  return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<PacketMul1Xi>::size));
+EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet4Xi& a) {
+  Packet1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1),
+                                          unpacket_traits<Packet1Xi>::size);
+  Packet1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3),
+                                          unpacket_traits<Packet1Xi>::size);
+  return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits<Packet1Xi>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_min(const Packet4Xi& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<PacketMul4Xi>::size / 4),
-      unpacket_traits<PacketMul4Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet4Xi>::size / 4),
+      unpacket_traits<Packet4Xi>::size));
 }

 template <>
-EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE numext::int32_t predux_max(const Packet4Xi& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1(
-      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<PacketMul4Xi>::size / 4),
-      unpacket_traits<PacketMul4Xi>::size));
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet4Xi>::size / 4),
+      unpacket_traits<Packet4Xi>::size));
 }

 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xi, N>& kernel) {
-  numext::int32_t buffer[unpacket_traits<PacketMul4Xi>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet4Xi>::size * N] = {0};
   int i = 0;

   for (i = 0; i < N; i++) {
-    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<PacketMul4Xi>::size);
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet4Xi>::size);
  }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits<PacketMul4Xi>::size], unpacket_traits<PacketMul4Xi>::size);
+        __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits<Packet4Xi>::size], unpacket_traits<Packet4Xi>::size);
   }
 }
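(Aside, not part of the patch.) predux_mul above splits the LMUL=4 register
group into four LMUL=1 registers, multiplies them pairwise, and only then runs
the reduction on the single remaining register. The halving idea in scalar
form, assuming a nonzero power-of-two element count:

#include <vector>

int product_by_halving(std::vector<int> v) {  // v.size() must be a power of two
  for (size_t n = v.size() / 2; n >= 1; n /= 2)
    for (size_t i = 0; i < n; ++i) v[i] *= v[i + n];  // multiply the two halves
  return v[0];
}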

-/********************************* PacketMul4Xf ************************************/
+/********************************* Packet4Xf ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf ptrue(const PacketMul4Xf& /*a*/) {
-  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits<PacketMul4Xf>::size));
+EIGEN_STRONG_INLINE Packet4Xf ptrue(const Packet4Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pzero(const PacketMul4Xf& /*a*/) {
-  return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pzero(const Packet4Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) {
-  return __riscv_vfabs_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pabs(const Packet4Xf& a) {
+  return __riscv_vfabs_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pset1<PacketMul4Xf>(const float& from) {
-  return __riscv_vfmv_v_f_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pset1<Packet4Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m4(from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits<PacketMul4Xf>(numext::uint32_t from) {
-  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<PacketMul4Xf>::size));
+EIGEN_STRONG_INLINE Packet4Xf pset1frombits<Packet4Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf plset<PacketMul4Xf>(const float& a) {
-  PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(
-      __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size)),
-      unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf plset<Packet4Xf>(const float& a) {
+  Packet4Xf idx = __riscv_vfcvt_f_x_v_f32m4(
+      __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits<Packet4Xf>::size)),
+      unpacket_traits<Packet4Xf>::size);
+  return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf padd(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf padd(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf psub(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf psub(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) {
-  return __riscv_vfneg_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pnegate(const Packet4Xf& a) {
+  return __riscv_vfneg_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pconj(const PacketMul4Xf& a) {
+EIGEN_STRONG_INLINE Packet4Xf pconj(const Packet4Xf& a) {
   return a;
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmul(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmul(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pdiv(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pdiv(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
-  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
-  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
-  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pnmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) {
-  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pnmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) {
+  return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMul4Xf nans =
-      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
-  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
-  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
-  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& a, const Packet4Xf& b) {
+  Packet4Xf nans =
+      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<Packet4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet4Xf>::size);

-  return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return pmin(a, b);
+EIGEN_STRONG_INLINE Packet4Xf pmin<PropagateNaN, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return pmin(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmin<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmin<PropagateNumbers, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMul4Xf nans =
-      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<PacketMul4Xf>::size);
-  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
-  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<PacketMul4Xf>::size);
-  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& a, const Packet4Xf& b) {
+  Packet4Xf nans =
+      __riscv_vfmv_v_f_f32m4((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits<Packet4Xf>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet4Xf>::size);

-  return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xf>::size);
+  return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNaN, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return pmax(a, b);
+EIGEN_STRONG_INLINE Packet4Xf pmax<PropagateNaN, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return pmax(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pmax<PropagateNumbers, PacketMul4Xf>(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pmax<PropagateNumbers, Packet4Xf>(const Packet4Xf& a, const Packet4Xf& b) {
+  return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits<Packet4Xf>::size);
 }
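(Aside, not part of the patch.) The masked vfmin/vfmax above start from a
NaN-filled destination and compute only the lanes where both inputs are
ordered, because RVV's plain vfmin/vfmax return the non-NaN operand while
PropagateNaN semantics require the opposite. Per-lane model:

#include <cmath>
#include <limits>

float pmin_propagate_nan_sketch(float a, float b) {
  const bool both_ordered = (a == a) && (b == b);  // false if either is NaN
  return both_ordered ? std::fmin(a, b) : std::numeric_limits<float>::quiet_NaN();
}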

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pcmp_le(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
+                                  unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pcmp_lt(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
+                                  unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
-                                  unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pcmp_eq(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask,
+                                  unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) {
-  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pcmp_lt_or_nan(const Packet4Xf& a, const Packet4Xf& b) {
+  PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits<Packet4Xf>::size);
 }

 // Logical Operations are not supported for float, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+EIGEN_STRONG_INLINE Packet4Xf pand(const Packet4Xf& a, const Packet4Xf& b) {
   return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                   __riscv_vreinterpret_v_f32m4_u32m4(b),
-                                                                  unpacket_traits<PacketMul4Xf>::size));
+                                                                  unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+EIGEN_STRONG_INLINE Packet4Xf por(const Packet4Xf& a, const Packet4Xf& b) {
   return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                  __riscv_vreinterpret_v_f32m4_u32m4(b),
-                                                                 unpacket_traits<PacketMul4Xf>::size));
+                                                                 unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+EIGEN_STRONG_INLINE Packet4Xf pxor(const Packet4Xf& a, const Packet4Xf& b) {
   return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a),
                                                                   __riscv_vreinterpret_v_f32m4_u32m4(b),
-                                                                  unpacket_traits<PacketMul4Xf>::size));
+                                                                  unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) {
+EIGEN_STRONG_INLINE Packet4Xf pandnot(const Packet4Xf& a, const Packet4Xf& b) {
   return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(
       __riscv_vreinterpret_v_f32m4_u32m4(a),
-      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<PacketMul4Xf>::size),
-      unpacket_traits<PacketMul4Xf>::size));
+      __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits<Packet4Xf>::size),
+      unpacket_traits<Packet4Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pload<PacketMul4Xf>(const float* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pload<Packet4Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf ploadu<PacketMul4Xf>(const float* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf ploadu<Packet4Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf ploaddup<PacketMul4Xf>(const float* from) {
-  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
-  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<PacketMul4Xf>::size), 1,
-                              unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf ploaddup<Packet4Xf>(const float* from) {
+  Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<Packet4Xf>::size);
+  idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits<Packet4Xf>::size), 1,
+                              unpacket_traits<Packet4Xf>::size);
+  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf ploadquad<PacketMul4Xf>(const float* from) {
-  PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<PacketMul4Xf>::size);
-  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf ploadquad<Packet4Xf>(const float* from) {
+  Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits<Packet4Xf>::size);
+  idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul4Xf& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE void pstore(float* to, const Packet4Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather<float, PacketMul4Xf>(const float* from, Index stride) {
-  return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits<PacketMul4Xf>::size);
+EIGEN_DEVICE_FUNC inline Packet4Xf pgather<float, Packet4Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) {
-  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) {
+EIGEN_STRONG_INLINE float pfirst(const Packet4Xf& a) {
   return __riscv_vfmv_f_s_f32m4_f32(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) {
-  return __riscv_vfsqrt_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf psqrt(const Packet4Xf& a) {
+  return __riscv_vfsqrt_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) {
-  const PacketMul4Xf limit = pset1<PacketMul4Xf>(static_cast<float>(1 << 23));
-  const PacketMul4Xf abs_a = pabs(a);
+EIGEN_STRONG_INLINE Packet4Xf print(const Packet4Xf& a) {
+  const Packet4Xf limit = pset1<Packet4Xf>(static_cast<float>(1 << 23));
+  const Packet4Xf abs_a = pabs(a);

-  PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits<PacketMul4Xf>::size);
-  const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits<PacketMul4Xf>::size);
-  const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4(
-      __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits<PacketMul4Xf>::size), unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits<Packet4Xf>::size);
+  const Packet4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits<Packet4Xf>::size);
+  const Packet4Xf new_x = __riscv_vfcvt_f_x_v_f32m4(
+      __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits<Packet4Xf>::size), unpacket_traits<Packet4Xf>::size);

-  mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits<PacketMul4Xf>::size);
-  PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits<PacketMul4Xf>::size);
+  mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits<Packet4Xf>::size);
+  Packet4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) {
-  PacketMul4Xf tmp = print(a);
+EIGEN_STRONG_INLINE Packet4Xf pfloor(const Packet4Xf& a) {
+  Packet4Xf tmp = print(a);
   // If greater, subtract one.
-  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits<PacketMul4Xf>::size);
-  return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<PacketMul4Xf>::size);
+  PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits<Packet4Xf>::size);
+  return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<Packet4Xf>::size);
 }
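(Aside, not part of the patch.) pfloor is derived from print: round to the
nearest integer, then subtract one in the lanes that rounded upward, i.e.
where a < print(a). Scalar model:

#include <cmath>

double floor_from_rint_sketch(double a) {
  const double t = std::rint(a);   // round to nearest, ties to even
  return (a < t) ? t - 1.0 : t;    // step back down if rounding went up
}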
kernel.packet[i], unpacket_traits::size); } for (i = 0; i < N; i++) { kernel.packet[i] = - __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); } } template <> -EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) { +EIGEN_STRONG_INLINE Packet4Xf pldexp(const Packet4Xf& a, const Packet4Xf& exponent) { return pldexp_generic(a, exponent); } -/********************************* PacketMul4Xl ************************************/ +/********************************* Packet4Xl ************************************/ template <> -EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { - return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { - PacketMul4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); - return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl plset(const numext::int64_t& a) { + Packet4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { - return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pzero(const Packet4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl padd(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vsub(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl psub(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { - return __riscv_vneg(a, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pnegate(const Packet4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { +EIGEN_STRONG_INLINE Packet4Xl pconj(const Packet4Xl& a) { return a; } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vmul(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pmul(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { - return __riscv_vdiv(a, b, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pdiv(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); } template <> -EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { - return __riscv_vmadd(a, b, c, unpacket_traits::size); +EIGEN_STRONG_INLINE Packet4Xl pmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); } 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) {
-  return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pnmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) {
-  return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pnmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) {
+  return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pmin<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pmin<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pmax<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pmax<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits<PacketMul4Xl>::size);
-  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pcmp_le(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits<PacketMul4Xl>::size);
-  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pcmp_lt(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits<PacketMul4Xl>::size);
-  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pcmp_eq(const Packet4Xl& a, const Packet4Xl& b) {
+  PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl ptrue<PacketMul4Xl>(const PacketMul4Xl& /*a*/) {
-  return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl ptrue<Packet4Xl>(const Packet4Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits<Packet4Xl>::size);
 }
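+// The integer comparisons above materialize their vbool mask into a packet of
+// all-ones/all-zero lanes via vmerge, since Eigen expects boolean packets it
+// can feed to select operations; e.g. pcmp_lt on lanes (1, 5) vs (2, 2) yields
+// (0xffffffffffffffff, 0).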
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pand<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vand_vv_i64m4(a, b, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pand<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vand_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl por<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vor_vv_i64m4(a, b, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl por<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vor_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pxor<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vxor_vv_i64m4(a, b, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pxor<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vxor_vv_i64m4(a, b, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pandnot<PacketMul4Xl>(const PacketMul4Xl& a, const PacketMul4Xl& b) {
-  return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits<PacketMul4Xl>::size),
-                               unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pandnot<Packet4Xl>(const Packet4Xl& a, const Packet4Xl& b) {
+  return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits<Packet4Xl>::size),
+                               unpacket_traits<Packet4Xl>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) {
-  return __riscv_vsra_vx_i64m4(a, N, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl parithmetic_shift_right(Packet4Xl a) {
+  return __riscv_vsra_vx_i64m4(a, N, unpacket_traits<Packet4Xl>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) {
+EIGEN_STRONG_INLINE Packet4Xl plogical_shift_right(Packet4Xl a) {
   return __riscv_vreinterpret_i64m4(
-      __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits<PacketMul4Xl>::size));
+      __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits<Packet4Xl>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) {
-  return __riscv_vsll_vx_i64m4(a, N, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl plogical_shift_left(Packet4Xl a) {
+  return __riscv_vsll_vx_i64m4(a, N, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pload<PacketMul4Xl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pload<Packet4Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl ploadu<PacketMul4Xl>(const numext::int64_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl ploadu<Packet4Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl ploaddup<PacketMul4Xl>(const numext::int64_t* from) {
-  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size);
-  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul4Xul>::size), 2,
-                              unpacket_traits<PacketMul4Xul>::size);
+EIGEN_STRONG_INLINE Packet4Xl ploaddup<Packet4Xl>(const numext::int64_t* from) {
+  Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size);
+  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits<Packet4Xul>::size), 2,
+                              unpacket_traits<Packet4Xul>::size);
   // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
-  return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits<PacketMul4Xl>::size);
+  return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl ploadquad<PacketMul4Xl>(const numext::int64_t* from) {
-  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size);
-  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul4Xul>::size), 1,
-                              unpacket_traits<PacketMul4Xul>::size);
-  return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl ploadquad<Packet4Xl>(const numext::int64_t* from) {
+  Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size);
+  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits<Packet4Xul>::size), 1,
+                              unpacket_traits<Packet4Xul>::size);
+  return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const PacketMul4Xl& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int64_t>(numext::int64_t* to, const Packet4Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const PacketMul4Xl& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int64_t>(numext::int64_t* to, const Packet4Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather<numext::int64_t, PacketMul4Xl>(const numext::int64_t* from,
+EIGEN_DEVICE_FUNC inline Packet4Xl pgather<numext::int64_t, Packet4Xl>(const numext::int64_t* from,
                                                                         Index stride) {
-  return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits<PacketMul4Xl>::size);
+  return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, PacketMul4Xl>(numext::int64_t* to, const PacketMul4Xl& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, Packet4Xl>(numext::int64_t* to, const Packet4Xl& from,
                                                                    Index stride) {
-  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<PacketMul4Xl>::size);
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t pfirst<PacketMul4Xl>(const PacketMul4Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t pfirst<Packet4Xl>(const Packet4Xl& a) {
   return __riscv_vmv_x_s_i64m4_i64(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) {
-  PacketMul4Xul idx =
-      __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size),
-                             unpacket_traits<PacketMul4Xul>::size - 1, unpacket_traits<PacketMul4Xul>::size);
-  return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl preverse(const Packet4Xl& a) {
+  Packet4Xul idx =
+      __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size),
+                             unpacket_traits<Packet4Xul>::size - 1, unpacket_traits<Packet4Xul>::size);
+  return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits<Packet4Xl>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) {
-  PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits<PacketMul4Xl>::size);
-  return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits<PacketMul4Xl>::size), mask,
-                               unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pabs(const Packet4Xl& a) {
+  Packet4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits<Packet4Xl>::size);
+  return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits<Packet4Xl>::size), mask,
+                               unpacket_traits<Packet4Xl>::size);
 }
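+// pabs uses the branch-free idiom |a| = (a ^ (a >> 63)) - (a >> 63): the
+// arithmetic shift produces 0 for non-negative lanes and all-ones for negative
+// ones, e.g. a = -5 gives mask = -1 and (a ^ mask) - mask = 4 + 1 = 5.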
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux<PacketMul4Xl>(const PacketMul4Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux<Packet4Xl>(const Packet4Xl& a) {
   return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1(
-      a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<PacketMul4Xl>::size / 4), unpacket_traits<PacketMul4Xl>::size));
+      a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet4Xl>::size / 4), unpacket_traits<Packet4Xl>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_mul<PacketMul4Xl>(const PacketMul4Xl& a) {
-  PacketMul1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1),
-                                             unpacket_traits<PacketMul1Xl>::size);
-  PacketMul1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3),
-                                             unpacket_traits<PacketMul1Xl>::size);
-  return predux_mul<PacketMul1Xl>(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits<PacketMul1Xl>::size));
+EIGEN_STRONG_INLINE numext::int64_t predux_mul<Packet4Xl>(const Packet4Xl& a) {
+  Packet1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1),
+                                          unpacket_traits<Packet1Xl>::size);
+  Packet1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3),
+                                          unpacket_traits<Packet1Xl>::size);
+  return predux_mul<Packet1Xl>(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits<Packet1Xl>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_min<PacketMul4Xl>(const PacketMul4Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_min<Packet4Xl>(const Packet4Xl& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<PacketMul4Xl>::size / 4),
-      unpacket_traits<PacketMul4Xl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<Packet4Xl>::size / 4),
+      unpacket_traits<Packet4Xl>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int64_t predux_max<PacketMul4Xl>(const PacketMul4Xl& a) {
+EIGEN_STRONG_INLINE numext::int64_t predux_max<Packet4Xl>(const Packet4Xl& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1(
-      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<PacketMul4Xl>::size / 4),
-      unpacket_traits<PacketMul4Xl>::size));
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<Packet4Xl>::size / 4),
+      unpacket_traits<Packet4Xl>::size));
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xl, N>& kernel) {
-  numext::int64_t buffer[unpacket_traits<PacketMul4Xl>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xl, N>& kernel) {
+  numext::int64_t buffer[unpacket_traits<Packet4Xl>::size * N] = {0};
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<PacketMul4Xl>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<Packet4Xl>::size);
   }
 
   for (i = 0; i < N; i++) {
    kernel.packet[i] =
-        __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits<PacketMul4Xl>::size], unpacket_traits<PacketMul4Xl>::size);
+        __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits<Packet4Xl>::size], unpacket_traits<Packet4Xl>::size);
   }
 }
 
-/********************************* PacketMul4Xd ************************************/
+/********************************* Packet4Xd ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd ptrue<PacketMul4Xd>(const PacketMul4Xd& /*a*/) {
-  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits<PacketMul4Xd>::size));
+EIGEN_STRONG_INLINE Packet4Xd ptrue<Packet4Xd>(const Packet4Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) {
-  return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pzero(const Packet4Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) {
-  return __riscv_vfabs_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pabs(const Packet4Xd& a) {
+  return __riscv_vfabs_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pset1<PacketMul4Xd>(const double& from) {
-  return __riscv_vfmv_v_f_f64m4(from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pset1<Packet4Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m4(from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits<PacketMul4Xd>(numext::uint64_t from) {
-  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits<PacketMul4Xd>::size));
+EIGEN_STRONG_INLINE Packet4Xd pset1frombits<Packet4Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd plset<PacketMul4Xd>(const double& a) {
-  PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(
-      __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xd>::size)),
-      unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd plset<Packet4Xd>(const double& a) {
+  Packet4Xd idx = __riscv_vfcvt_f_x_v_f64m4(
+      __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xd>::size)),
+      unpacket_traits<Packet4Xd>::size);
+  return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd padd<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd padd<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd psub<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd psub<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) {
-  return __riscv_vfneg_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pnegate(const Packet4Xd& a) {
+  return __riscv_vfneg_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE Packet4Xd pconj(const Packet4Xd& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmul<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmul<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pdiv<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pdiv<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
-  return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
-  return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
-  return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pnmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) {
-  return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pnmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) {
+  return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmin<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMul4Xd nans =
-      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul4Xd>::size);
-  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
-  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<PacketMul4Xd>::size);
-  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmin<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  Packet4Xd nans =
+      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<Packet4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet4Xd>::size);
 
-  return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmin<PropagateNaN, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return pmin<PacketMul4Xd>(a, b);
+EIGEN_STRONG_INLINE Packet4Xd pmin<PropagateNaN, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return pmin<Packet4Xd>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmin<PropagateNumbers, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmin<PropagateNumbers, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmax<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMul4Xd nans =
-      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<PacketMul4Xd>::size);
-  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
-  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<PacketMul4Xd>::size);
-  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmax<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  Packet4Xd nans =
+      __riscv_vfmv_v_f_f64m4((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);
+  PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits<Packet4Xd>::size);
+  mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits<Packet4Xd>::size);
 
-  return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<PacketMul4Xd>::size);
+  return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits<Packet4Xd>::size);
 }
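+// The unsuffixed pmin/pmax above implement NaN-propagating semantics: the mask
+// keeps only lanes where both inputs equal themselves (i.e. neither is NaN),
+// and the mask-undisturbed (_tumu) min/max leaves the preloaded quiet NaNs in
+// the masked-off lanes.  PropagateNumbers can map onto the bare vfmin/vfmax,
+// which already return the non-NaN operand when exactly one input is NaN.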
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNaN, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return pmax<PacketMul4Xd>(a, b);
+EIGEN_STRONG_INLINE Packet4Xd pmax<PropagateNaN, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return pmax<Packet4Xd>(a, b);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pmax<PropagateNumbers, PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pmax<PropagateNumbers, Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
+  return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
-                                  unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pcmp_le(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask,
+                                  unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
-                                  unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pcmp_lt(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask,
+                                  unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vmerge_vvm_f64m4(pzero<PacketMul4Xd>(a), ptrue<PacketMul4Xd>(a), mask,
-                                  unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pcmp_eq(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(pzero<Packet4Xd>(a), ptrue<Packet4Xd>(a), mask,
+                                  unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) {
-  PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vfmerge_vfm_f64m4(ptrue<PacketMul4Xd>(a), 0.0, mask, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pcmp_lt_or_nan(const Packet4Xd& a, const Packet4Xd& b) {
+  PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vfmerge_vfm_f64m4(ptrue<Packet4Xd>(a), 0.0, mask, unpacket_traits<Packet4Xd>::size);
 }
 
 // Logical Operations are not supported for double, so reinterpret casts
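+// E.g. pandnot(a, b) below computes a & ~b on the underlying uint64 bit
+// patterns; an illustrative use is clearing each lane's sign bit by and-not'ing
+// with a packet of 0x8000000000000000 masks.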
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pand<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+EIGEN_STRONG_INLINE Packet4Xd pand<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
   return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a),
                                                                   __riscv_vreinterpret_v_f64m4_u64m4(b),
-                                                                  unpacket_traits<PacketMul4Xd>::size));
+                                                                  unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd por<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+EIGEN_STRONG_INLINE Packet4Xd por<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
   return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a),
                                                                  __riscv_vreinterpret_v_f64m4_u64m4(b),
-                                                                 unpacket_traits<PacketMul4Xd>::size));
+                                                                 unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pxor<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+EIGEN_STRONG_INLINE Packet4Xd pxor<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
   return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a),
                                                                   __riscv_vreinterpret_v_f64m4_u64m4(b),
-                                                                  unpacket_traits<PacketMul4Xd>::size));
+                                                                  unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pandnot<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& b) {
+EIGEN_STRONG_INLINE Packet4Xd pandnot<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& b) {
   return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(
       __riscv_vreinterpret_v_f64m4_u64m4(a),
-      __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<PacketMul4Xd>::size),
-      unpacket_traits<PacketMul4Xd>::size));
+      __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits<Packet4Xd>::size),
+      unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pload<PacketMul4Xd>(const double* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pload<Packet4Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd ploadu<PacketMul4Xd>(const double* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd ploadu<Packet4Xd>(const double* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd ploaddup<PacketMul4Xd>(const double* from) {
-  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size);
-  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits<PacketMul4Xul>::size), 2,
-                              unpacket_traits<PacketMul4Xul>::size);
-  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd ploaddup<Packet4Xd>(const double* from) {
+  Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size);
+  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits<Packet4Xul>::size), 2,
+                              unpacket_traits<Packet4Xul>::size);
+  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd ploadquad<PacketMul4Xd>(const double* from) {
-  PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size);
-  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits<PacketMul4Xul>::size), 1,
-                              unpacket_traits<PacketMul4Xul>::size);
-  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd ploadquad<Packet4Xd>(const double* from) {
+  Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size);
+  idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits<Packet4Xul>::size), 1,
+                              unpacket_traits<Packet4Xul>::size);
+  return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<double>(double* to, const PacketMul4Xd& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const PacketMul4Xd& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather<double, PacketMul4Xd>(const double* from, Index stride) {
-  return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits<PacketMul4Xd>::size);
+EIGEN_DEVICE_FUNC inline Packet4Xd pgather<double, Packet4Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, PacketMul4Xd>(double* to, const PacketMul4Xd& from, Index stride) {
-  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4Xd>(double* to, const Packet4Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE double pfirst<PacketMul4Xd>(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE double pfirst<Packet4Xd>(const Packet4Xd& a) {
   return __riscv_vfmv_f_s_f64m4_f64(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd psqrt<PacketMul4Xd>(const PacketMul4Xd& a) {
-  return __riscv_vfsqrt_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd psqrt<Packet4Xd>(const Packet4Xd& a) {
+  return __riscv_vfsqrt_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd print<PacketMul4Xd>(const PacketMul4Xd& a) {
-  const PacketMul4Xd limit = pset1<PacketMul4Xd>(static_cast<double>(1ull << 52));
-  const PacketMul4Xd abs_a = pabs(a);
+EIGEN_STRONG_INLINE Packet4Xd print<Packet4Xd>(const Packet4Xd& a) {
+  const Packet4Xd limit = pset1<Packet4Xd>(static_cast<double>(1ull << 52));
+  const Packet4Xd abs_a = pabs(a);
 
-  PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits<PacketMul4Xd>::size);
-  const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits<PacketMul4Xd>::size);
-  const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4(
-      __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits<PacketMul4Xd>::size), unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits<Packet4Xd>::size);
+  const Packet4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits<Packet4Xd>::size);
+  const Packet4Xd new_x = __riscv_vfcvt_f_x_v_f64m4(
+      __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits<Packet4Xd>::size), unpacket_traits<Packet4Xd>::size);
 
-  mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits<PacketMul4Xd>::size);
-  PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits<PacketMul4Xd>::size);
+  mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits<Packet4Xd>::size);
+  Packet4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits<Packet4Xd>::size);
 }
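+// print rounds by converting f64 -> i64 -> f64 with the current rounding mode.
+// Lanes with |a| >= 2^52 are already integral (and NaN lanes are regenerated by
+// the masked a + a), so they keep x; vfsgnj copies the sign of the input onto
+// the rounded value so that e.g. -0.4 rounds to -0.0 rather than +0.0.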
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pfloor<PacketMul4Xd>(const PacketMul4Xd& a) {
-  PacketMul4Xd tmp = print<PacketMul4Xd>(a);
+EIGEN_STRONG_INLINE Packet4Xd pfloor<Packet4Xd>(const Packet4Xd& a) {
+  Packet4Xd tmp = print<Packet4Xd>(a);
   // If greater, subtract one.
-  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits<PacketMul4Xd>::size);
-  return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits<PacketMul4Xd>::size);
+  PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits<Packet4Xd>::size);
+  return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) {
-  PacketMul4Xul idx =
-      __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<PacketMul4Xul>::size),
-                             unpacket_traits<PacketMul4Xul>::size - 1, unpacket_traits<PacketMul4Xul>::size);
-  return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd preverse(const Packet4Xd& a) {
+  Packet4Xul idx =
+      __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits<Packet4Xul>::size),
+                             unpacket_traits<Packet4Xul>::size - 1, unpacket_traits<Packet4Xul>::size);
+  return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits<Packet4Xd>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pfrexp<PacketMul4Xd>(const PacketMul4Xd& a, PacketMul4Xd& exponent) {
+EIGEN_STRONG_INLINE Packet4Xd pfrexp<Packet4Xd>(const Packet4Xd& a, Packet4Xd& exponent) {
   return pfrexp_generic(a, exponent);
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux<PacketMul4Xd>(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE double predux<Packet4Xd>(const Packet4Xd& a) {
   return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1(
-      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<PacketMul4Xd>::size / 4), unpacket_traits<PacketMul4Xd>::size));
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet4Xd>::size / 4), unpacket_traits<Packet4Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux_mul<PacketMul4Xd>(const PacketMul4Xd& a) {
-  PacketMul1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
-                                              unpacket_traits<PacketMul1Xd>::size);
-  PacketMul1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
-                                              unpacket_traits<PacketMul1Xd>::size);
-  return predux_mul<PacketMul1Xd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE double predux_mul<Packet4Xd>(const Packet4Xd& a) {
+  Packet1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1),
+                                           unpacket_traits<Packet1Xd>::size);
+  Packet1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3),
+                                           unpacket_traits<Packet1Xd>::size);
+  return predux_mul<Packet1Xd>(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits<Packet1Xd>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux_min<PacketMul4Xd>(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE double predux_min<Packet4Xd>(const Packet4Xd& a) {
   return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1(
                         a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(),
-                                                  unpacket_traits<PacketMul4Xd>::size / 4),
-                        unpacket_traits<PacketMul4Xd>::size)),
+                                                  unpacket_traits<Packet4Xd>::size / 4),
+                        unpacket_traits<Packet4Xd>::size)),
                     (std::numeric_limits<double>::max)());
 }
 
 template <>
-EIGEN_STRONG_INLINE double predux_max<PacketMul4Xd>(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE double predux_max<Packet4Xd>(const Packet4Xd& a) {
   return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1(
                         a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(),
-                                                  unpacket_traits<PacketMul4Xd>::size / 4),
-                        unpacket_traits<PacketMul4Xd>::size)),
+                                                  unpacket_traits<Packet4Xd>::size / 4),
+                        unpacket_traits<Packet4Xd>::size)),
                     -(std::numeric_limits<double>::max)());
 }
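+// As in the float and int64_t variants above, ptranspose stores each packet as
+// a strided column of a small scratch buffer (vsse64 with a byte stride of
+// N * sizeof(double)) and then reloads the transposed rows contiguously.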
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xd, N>& kernel) {
-  double buffer[unpacket_traits<PacketMul4Xd>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet4Xd>::size * N];
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<PacketMul4Xd>::size);
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet4Xd>::size);
  }
 
   for (i = 0; i < N; i++) {
    kernel.packet[i] =
-        __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits<PacketMul4Xd>::size], unpacket_traits<PacketMul4Xd>::size);
+        __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits<Packet4Xd>::size], unpacket_traits<Packet4Xd>::size);
   }
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pldexp<PacketMul4Xd>(const PacketMul4Xd& a, const PacketMul4Xd& exponent) {
+EIGEN_STRONG_INLINE Packet4Xd pldexp<Packet4Xd>(const Packet4Xd& a, const Packet4Xd& exponent) {
   return pldexp_generic(a, exponent);
 }
 
-/********************************* PacketMul4Xs ************************************/
+/********************************* Packet4Xs ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pset1<PacketMul4Xs>(const numext::int16_t& from) {
-  return __riscv_vmv_v_x_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pset1<Packet4Xs>(const numext::int16_t& from) {
+  return __riscv_vmv_v_x_i16m4(from, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs plset<PacketMul4Xs>(const numext::int16_t& a) {
-  PacketMul4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xs>::size));
-  return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs plset<Packet4Xs>(const numext::int16_t& a) {
+  Packet4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits<Packet4Xs>::size));
+  return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m4(0, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pzero(const Packet4Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m4(0, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs padd<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vadd_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs padd<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vadd_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs psub<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vsub(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs psub<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) {
-  return __riscv_vneg(a, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pnegate(const Packet4Xs& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) {
+EIGEN_STRONG_INLINE Packet4Xs pconj(const Packet4Xs& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmul<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vmul(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pmul<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pdiv<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vdiv(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pdiv<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
-  return __riscv_vmadd(a, b, c, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
-  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
-  return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pnmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) {
-  return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pnmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) {
+  return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmin<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vmin(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pmin<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pmax<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vmax(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pmax<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
  return __riscv_vmax(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pcmp_le(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pcmp_lt(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pcmp_eq(const Packet4Xs& a, const Packet4Xs& b) {
+  PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast<numext::int16_t>(0xffff), mask, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs ptrue<PacketMul4Xs>(const PacketMul4Xs& /*a*/) {
-  return __riscv_vmv_v_x_i16m4(static_cast<numext::int16_t>(0xffffu), unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs ptrue<Packet4Xs>(const Packet4Xs& /*a*/) {
+  return __riscv_vmv_v_x_i16m4(static_cast<numext::int16_t>(0xffffu), unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pand<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vand_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pand<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vand_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs por<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs por<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vor_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pxor<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vxor_vv_i16m4(a, b, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pxor<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vxor_vv_i16m4(a, b, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pandnot<PacketMul4Xs>(const PacketMul4Xs& a, const PacketMul4Xs& b) {
-  return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits<PacketMul4Xs>::size),
-                               unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pandnot<Packet4Xs>(const Packet4Xs& a, const Packet4Xs& b) {
+  return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits<Packet4Xs>::size),
+                               unpacket_traits<Packet4Xs>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) {
-  return __riscv_vsra_vx_i16m4(a, N, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs parithmetic_shift_right(Packet4Xs a) {
+  return __riscv_vsra_vx_i16m4(a, N, unpacket_traits<Packet4Xs>::size);
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) {
+EIGEN_STRONG_INLINE Packet4Xs plogical_shift_right(Packet4Xs a) {
   return __riscv_vreinterpret_i16m4(
-      __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits<PacketMul4Xs>::size));
+      __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits<Packet4Xs>::size));
 }
 
 template <int N>
-EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) {
-  return __riscv_vsll_vx_i16m4(a, N, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs plogical_shift_left(Packet4Xs a) {
+  return __riscv_vsll_vx_i16m4(a, N, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pload<PacketMul4Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pload<Packet4Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs ploadu<PacketMul4Xs>(const numext::int16_t* from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs ploadu<Packet4Xs>(const numext::int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits<Packet4Xs>::size);
 }
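+// ploaddup/ploadquad below build byte-offset index vectors for an indexed
+// load: vid yields 0,1,2,3,..., and masking with 0xfffe gives byte offsets
+// 0,0,2,2,... so every int16_t is loaded twice; ploadquad's 0,0,0,0,2,2,2,2,...
+// pattern loads every element four times.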
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs ploaddup<PacketMul4Xs>(const numext::int16_t* from) {
-  PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xsu>::size);
-  idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits<PacketMul4Xsu>::size);
+EIGEN_STRONG_INLINE Packet4Xs ploaddup<Packet4Xs>(const numext::int16_t* from) {
+  Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<Packet4Xsu>::size);
+  idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits<Packet4Xsu>::size);
   // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ...
-  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<PacketMul4Xs>::size);
+  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs ploadquad<PacketMul4Xs>(const numext::int16_t* from) {
-  PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xsu>::size);
-  idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits<PacketMul4Xsu>::size), 1,
-                              unpacket_traits<PacketMul4Xsu>::size);
-  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs ploadquad<Packet4Xs>(const numext::int16_t* from) {
+  Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits<Packet4Xsu>::size);
+  idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits<Packet4Xsu>::size), 1,
+                              unpacket_traits<Packet4Xsu>::size);
+  return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const PacketMul4Xs& from) {
-  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE void pstore<numext::int16_t>(numext::int16_t* to, const Packet4Xs& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const PacketMul4Xs& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t>(numext::int16_t* to, const Packet4Xs& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather<numext::int16_t, PacketMul4Xs>(const numext::int16_t* from,
+EIGEN_DEVICE_FUNC inline Packet4Xs pgather<numext::int16_t, Packet4Xs>(const numext::int16_t* from,
                                                                         Index stride) {
-  return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits<PacketMul4Xs>::size);
+  return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, PacketMul4Xs>(numext::int16_t* to, const PacketMul4Xs& from,
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int16_t, Packet4Xs>(numext::int16_t* to, const Packet4Xs& from,
                                                                    Index stride) {
-  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<PacketMul4Xs>::size);
+  __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t pfirst<PacketMul4Xs>(const PacketMul4Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t pfirst<Packet4Xs>(const Packet4Xs& a) {
   return __riscv_vmv_x_s_i16m4_i16(a);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) {
-  PacketMul4Xsu idx =
-      __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits<PacketMul4Xsu>::size),
-                             unpacket_traits<PacketMul4Xsu>::size - 1, unpacket_traits<PacketMul4Xsu>::size);
-  return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs preverse(const Packet4Xs& a) {
+  Packet4Xsu idx =
+      __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits<Packet4Xsu>::size),
+                             unpacket_traits<Packet4Xsu>::size - 1, unpacket_traits<Packet4Xsu>::size);
+  return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits<Packet4Xs>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) {
-  PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits<PacketMul4Xs>::size);
-  return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits<PacketMul4Xs>::size), mask,
-                               unpacket_traits<PacketMul4Xs>::size);
+EIGEN_STRONG_INLINE Packet4Xs pabs(const Packet4Xs& a) {
+  Packet4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits<Packet4Xs>::size);
+  return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits<Packet4Xs>::size), mask,
+                               unpacket_traits<Packet4Xs>::size);
 }
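+// The reductions below fold the m4 register into an m1 accumulator seeded with
+// the operation's identity (0 for the sum, INT16_MAX for the min, INT16_MIN for
+// the max); predux_mul has no single reduction instruction, so it multiplies
+// the four m1 subregisters together and then reduces the resulting m1 packet.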
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux<PacketMul4Xs>(const PacketMul4Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux<Packet4Xs>(const Packet4Xs& a) {
   return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1(
-      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<PacketMul4Xs>::size / 4), unpacket_traits<PacketMul4Xs>::size));
+      a, __riscv_vmv_v_x_i16m1(0, unpacket_traits<Packet4Xs>::size / 4), unpacket_traits<Packet4Xs>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_mul<PacketMul4Xs>(const PacketMul4Xs& a) {
-  PacketMul1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
-                                             unpacket_traits<PacketMul1Xs>::size);
-  PacketMul1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
-                                             unpacket_traits<PacketMul1Xs>::size);
-  return predux_mul<PacketMul1Xs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<PacketMul1Xs>::size));
+EIGEN_STRONG_INLINE numext::int16_t predux_mul<Packet4Xs>(const Packet4Xs& a) {
+  Packet1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1),
+                                          unpacket_traits<Packet1Xs>::size);
+  Packet1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3),
+                                          unpacket_traits<Packet1Xs>::size);
+  return predux_mul<Packet1Xs>(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits<Packet1Xs>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_min<PacketMul4Xs>(const PacketMul4Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_min<Packet4Xs>(const Packet4Xs& a) {
   return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<PacketMul4Xs>::size / 4),
-      unpacket_traits<PacketMul4Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::max)(), unpacket_traits<Packet4Xs>::size / 4),
+      unpacket_traits<Packet4Xs>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE numext::int16_t predux_max<PacketMul4Xs>(const PacketMul4Xs& a) {
+EIGEN_STRONG_INLINE numext::int16_t predux_max<Packet4Xs>(const Packet4Xs& a) {
   return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1(
-      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<PacketMul4Xs>::size / 4),
-      unpacket_traits<PacketMul4Xs>::size));
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet4Xs>::size / 4),
+      unpacket_traits<Packet4Xs>::size));
 }
 
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul4Xs, N>& kernel) {
-  numext::int16_t buffer[unpacket_traits<PacketMul4Xs>::size * N] = {0};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet4Xs>::size * N] = {0};
   int i = 0;
 
   for (i = 0; i < N; i++) {
-    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<PacketMul4Xs>::size);
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet4Xs>::size);
   }
 
   for (i = 0; i < N; i++) {
    kernel.packet[i] =
-        __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits<PacketMul4Xs>::size], unpacket_traits<PacketMul4Xs>::size);
+        __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits<Packet4Xs>::size], unpacket_traits<Packet4Xs>::size);
   }
 }
diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h
index 3801b858b..fbda19138 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h
@@ -16,16 +16,16 @@ namespace Eigen {
 
 namespace internal {
 
-typedef vfloat16m1_t PacketMul1Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL)));
-typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2)));
+typedef vfloat16m1_t Packet1Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL)));
+typedef vfloat16m2_t Packet2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2)));
 
 #if EIGEN_RISCV64_DEFAULT_LMUL == 1
-typedef PacketMul1Xh PacketXh;
+typedef Packet1Xh PacketXh;
 
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef PacketMul1Xh type;
-  typedef PacketMul1Xh half;
+  typedef Packet1Xh type;
+  typedef Packet1Xh half;
 
   enum {
     Vectorizable = 1,
@@ -62,12 +62,12 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 };
 
 #else
 
-typedef PacketMul2Xh PacketXh;
+typedef Packet2Xh PacketXh;
 
 template <>
 struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef PacketMul2Xh type;
-  typedef PacketMul1Xh half;
+  typedef Packet2Xh type;
+  typedef Packet1Xh half;
 
   enum {
     Vectorizable = 1,
@@ -105,9 +105,9 @@ struct packet_traits<Eigen::half> : default_packet_traits {
 #endif
 
 template <>
-struct unpacket_traits<PacketMul1Xh> {
+struct unpacket_traits<Packet1Xh> {
   typedef Eigen::half type;
-  typedef PacketMul1Xh half;  // Half not yet implemented
+  typedef Packet1Xh half;  // Half not yet implemented
   typedef PacketXs integer_packet;
 
   typedef numext::uint8_t mask_t;
@@ -121,10 +121,10 @@ struct unpacket_traits<PacketMul1Xh> {
 };
 
 template <>
-struct unpacket_traits<PacketMul2Xh> {
+struct unpacket_traits<Packet2Xh> {
   typedef Eigen::half type;
-  typedef PacketMul1Xh half;
-  typedef PacketMul2Xs integer_packet;
+  typedef Packet1Xh half;
+  typedef Packet2Xs integer_packet;
   typedef numext::uint8_t mask_t;
 
   enum {
@@ -470,373 +470,373 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXh, N>& kernel) {
   }
 }
 
-EIGEN_STRONG_INLINE PacketMul2Xf half2float(const PacketXh& a) {
-  return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf half2float(const PacketXh& a) {
+  return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits<Packet2Xf>::size);
 }
 
-EIGEN_STRONG_INLINE PacketXh float2half(const PacketMul2Xf& a) {
+EIGEN_STRONG_INLINE PacketXh float2half(const Packet2Xf& a) {
   return __riscv_vfncvt_f_f_w_f16m1(a, unpacket_traits<PacketXh>::size);
 }
 
-/********************************* PacketMul2Xh ************************************/
+/********************************* Packet2Xh ************************************/
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh ptrue<PacketMul2Xh>(const PacketMul2Xh& /*a*/) {
-  return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits<PacketMul2Xh>::size));
+EIGEN_STRONG_INLINE Packet2Xh ptrue<Packet2Xh>(const Packet2Xh& /*a*/) {
+  return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits<Packet2Xh>::size));
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pzero(const PacketMul2Xh& /*a*/) {
-  return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(0.0), unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pzero(const Packet2Xh& /*a*/) {
+  return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(0.0), unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pabs(const PacketMul2Xh& a) {
-  return __riscv_vfabs_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pabs(const Packet2Xh& a) {
+  return __riscv_vfabs_v_f16m2(a, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pset1<PacketMul2Xh>(const Eigen::half& from) {
-  return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pset1<Packet2Xh>(const Eigen::half& from) {
+  return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pset1frombits<PacketMul2Xh>(numext::uint16_t from) {
-  return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits<PacketMul2Xh>::size));
+EIGEN_STRONG_INLINE Packet2Xh pset1frombits<Packet2Xh>(numext::uint16_t from) {
+  return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits<Packet2Xh>::size));
 }
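+// Half-precision packets hold native _Float16 lanes, so Eigen::half values are
+// converted with static_cast<_Float16> before use (as in pset1 above); this
+// header is only included when EIGEN_VECTORIZE_RVV10FP16 is defined, which
+// presumes hardware half-precision vector support (e.g. the Zvfh extension).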
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh plset<PacketMul2Xh>(const Eigen::half& a) {
-  PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits<PacketMul2Xh>::size),
-                                               unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh plset<Packet2Xh>(const Eigen::half& a) {
+  Packet2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits<Packet2Xh>::size),
+                                            unpacket_traits<Packet2Xh>::size);
+  return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh padd<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh padd<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh psub<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh psub<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pnegate(const PacketMul2Xh& a) {
-  return __riscv_vfneg_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pnegate(const Packet2Xh& a) {
+  return __riscv_vfneg_v_f16m2(a, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pconj(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Packet2Xh pconj(const Packet2Xh& a) {
   return a;
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmul<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmul<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pdiv<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pdiv<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) {
-  return __riscv_vfmadd_vv_f16m2(a, b, c, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  return __riscv_vfmadd_vv_f16m2(a, b, c, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) {
-  return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmsub(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pnmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) {
-  return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pnmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pnmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) {
-  return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pnmsub(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) {
+  return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits<Packet2Xh>::size);
 }
 
 template <>
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmin<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMul2Xh nans =
-      __riscv_vfmv_v_f_f16m2((std::numeric_limits<_Float16>::quiet_NaN)(), unpacket_traits<PacketMul2Xh>::size);
-  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits<PacketMul2Xh>::size);
-  PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits<PacketMul2Xh>::size);
-  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmin<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  Packet2Xh nans =
+      __riscv_vfmv_v_f_f16m2((std::numeric_limits<_Float16>::quiet_NaN)(), unpacket_traits<Packet2Xh>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits<Packet2Xh>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits<Packet2Xh>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet2Xh>::size);

-  return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmin<PropagateNaN, PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return pmin<PacketMul2Xh>(a, b);
+EIGEN_STRONG_INLINE Packet2Xh pmin<PropagateNaN, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return pmin<Packet2Xh>(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmin<PropagateNumbers, PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmin<PropagateNumbers, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmax<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMul2Xh nans =
-      __riscv_vfmv_v_f_f16m2((std::numeric_limits<_Float16>::quiet_NaN)(), unpacket_traits<PacketMul2Xh>::size);
-  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits<PacketMul2Xh>::size);
-  PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits<PacketMul2Xh>::size);
-  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmax<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  Packet2Xh nans =
+      __riscv_vfmv_v_f_f16m2((std::numeric_limits<_Float16>::quiet_NaN)(), unpacket_traits<Packet2Xh>::size);
+  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits<Packet2Xh>::size);
+  PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits<Packet2Xh>::size);
+  mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits<Packet2Xh>::size);

-  return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits<PacketMul2Xh>::size);
+  return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmax<PropagateNaN, PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return pmax<PacketMul2Xh>(a, b);
+EIGEN_STRONG_INLINE Packet2Xh pmax<PropagateNaN, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return pmax<Packet2Xh>(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pmax<PropagateNumbers, PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pmax<PropagateNumbers, Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
+  return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits<Packet2Xh>::size);
 }
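The hand-rolled min/max above exist because RVV's vfmin/vfmax implement IEEE minimumNumber/maximumNumber semantics (a single NaN input yields the other operand), which matches the PropagateNumbers policy but not NaN propagation; the masked _tum form therefore keeps a preloaded quiet NaN in every lane where either input is unordered. A scalar reference of the NaN-propagating behavior (illustrative):

  #include <algorithm>
  #include <cmath>
  #include <limits>

  // Lane-wise behavior of the NaN-propagating pmin above.
  inline float pmin_propagate_nan_ref(float a, float b) {
    if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<float>::quiet_NaN();
    return std::min(a, b);
  }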
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcmp_le(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vmerge_vvm_f16m2(pzero<PacketMul2Xh>(a), ptrue<PacketMul2Xh>(a), mask,
-                                  unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pcmp_le(const Packet2Xh& a, const Packet2Xh& b) {
+  PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vmerge_vvm_f16m2(pzero<Packet2Xh>(a), ptrue<Packet2Xh>(a), mask,
+                                  unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vmerge_vvm_f16m2(pzero<PacketMul2Xh>(a), ptrue<PacketMul2Xh>(a), mask,
-                                  unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pcmp_lt(const Packet2Xh& a, const Packet2Xh& b) {
+  PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vmerge_vvm_f16m2(pzero<Packet2Xh>(a), ptrue<Packet2Xh>(a), mask,
+                                  unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vmerge_vvm_f16m2(pzero<PacketMul2Xh>(a), ptrue<PacketMul2Xh>(a), mask,
-                                  unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pcmp_eq(const Packet2Xh& a, const Packet2Xh& b) {
+  PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vmerge_vvm_f16m2(pzero<Packet2Xh>(a), ptrue<Packet2Xh>(a), mask,
+                                  unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh& a, const PacketMul2Xh& b) {
-  PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vfmerge_vfm_f16m2(ptrue<PacketMul2Xh>(a), static_cast<_Float16>(0.0), mask,
-                                   unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pcmp_lt_or_nan(const Packet2Xh& a, const Packet2Xh& b) {
+  PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vfmerge_vfm_f16m2(ptrue<Packet2Xh>(a), static_cast<_Float16>(0.0), mask,
+                                   unpacket_traits<Packet2Xh>::size);
 }

 // Logical Operations are not supported for half, so reinterpret casts
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pand<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
+EIGEN_STRONG_INLINE Packet2Xh pand<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
   return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a),
                                                                   __riscv_vreinterpret_v_f16m2_u16m2(b),
-                                                                  unpacket_traits<PacketMul2Xh>::size));
+                                                                  unpacket_traits<Packet2Xh>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh por<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
+EIGEN_STRONG_INLINE Packet2Xh por<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
   return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a),
                                                                  __riscv_vreinterpret_v_f16m2_u16m2(b),
-                                                                 unpacket_traits<PacketMul2Xh>::size));
+                                                                 unpacket_traits<Packet2Xh>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pxor<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
+EIGEN_STRONG_INLINE Packet2Xh pxor<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
   return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a),
                                                                   __riscv_vreinterpret_v_f16m2_u16m2(b),
-                                                                  unpacket_traits<PacketMul2Xh>::size));
+                                                                  unpacket_traits<Packet2Xh>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pandnot<PacketMul2Xh>(const PacketMul2Xh& a, const PacketMul2Xh& b) {
+EIGEN_STRONG_INLINE Packet2Xh pandnot<Packet2Xh>(const Packet2Xh& a, const Packet2Xh& b) {
   return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(
       __riscv_vreinterpret_v_f16m2_u16m2(a),
-      __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits<PacketMul2Xh>::size),
-      unpacket_traits<PacketMul2Xh>::size));
+      __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits<Packet2Xh>::size),
+      unpacket_traits<Packet2Xh>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pload<PacketMul2Xh>(const Eigen::half* from) {
+EIGEN_STRONG_INLINE Packet2Xh pload<Packet2Xh>(const Eigen::half* from) {
   EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16*>(from),
-                                                        unpacket_traits<PacketMul2Xh>::size);
+                                                        unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh ploadu<PacketMul2Xh>(const Eigen::half* from) {
+EIGEN_STRONG_INLINE Packet2Xh ploadu<Packet2Xh>(const Eigen::half* from) {
   EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast<const _Float16*>(from),
-                                                          unpacket_traits<PacketMul2Xh>::size);
+                                                          unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh ploaddup<PacketMul2Xh>(const Eigen::half* from) {
-  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xh>::size);
-  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vloxei16_v_f16m2(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh ploaddup<Packet2Xh>(const Eigen::half* from) {
+  Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<Packet2Xh>::size);
+  idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vloxei16_v_f16m2(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh ploadquad<PacketMul2Xh>(const Eigen::half* from) {
-  PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xh>::size);
-  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<PacketMul2Xh>::size), 1,
-                              unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vloxei16_v_f16m2(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh ploadquad<Packet2Xh>(const Eigen::half* from) {
+  Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits<Packet2Xh>::size);
+  idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits<Packet2Xh>::size), 1,
+                              unpacket_traits<Packet2Xh>::size);
  return __riscv_vloxei16_v_f16m2(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<Packet2Xh>::size);
 }
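The duplication in ploaddup/ploadquad above comes from index arithmetic: __riscv_vid yields lane numbers 0,1,2,..., and because indexed loads take byte offsets and sizeof(Eigen::half) == 2, masking with 0xfffe maps lanes pairwise onto the same element, while the extra shift in ploadquad maps four lanes per element. A small scalar model of the offsets (illustrative):

  #include <cstdint>
  #include <cstdio>

  int main() {
    for (uint16_t lane = 0; lane < 8; ++lane) {
      uint16_t dup = lane & 0xfffeu;          // byte offsets 0,0,2,2,... -> a[0],a[0],a[1],a[1],...
      uint16_t quad = (lane & 0xfffcu) >> 1;  // byte offsets 0,0,0,0,2,2,2,2 -> a[0] x4, a[1] x4
      std::printf("lane %u: dup byte offset %u, quad byte offset %u\n", lane, dup, quad);
    }
    return 0;
  }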
 template <>
-EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const PacketMul2Xh& from) {
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet2Xh& from) {
   EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from,
-                                                  unpacket_traits<PacketMul2Xh>::size);
+                                                  unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const PacketMul2Xh& from) {
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet2Xh& from) {
   EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from,
-                                                    unpacket_traits<PacketMul2Xh>::size);
+                                                    unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline PacketMul2Xh pgather<Eigen::half, PacketMul2Xh>(const Eigen::half* from, Index stride) {
+EIGEN_DEVICE_FUNC inline Packet2Xh pgather<Eigen::half, Packet2Xh>(const Eigen::half* from, Index stride) {
   return __riscv_vlse16_v_f16m2(reinterpret_cast<const _Float16*>(from), stride * sizeof(Eigen::half),
-                                unpacket_traits<PacketMul2Xh>::size);
+                                unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, PacketMul2Xh>(Eigen::half* to, const PacketMul2Xh& from,
-                                                                  Index stride) {
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, Packet2Xh>(Eigen::half* to, const Packet2Xh& from,
+                                                               Index stride) {
   __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from,
-                 unpacket_traits<PacketMul2Xh>::size);
+                 unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE Eigen::half pfirst<PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet2Xh>(const Packet2Xh& a) {
   return static_cast<Eigen::half>(__riscv_vfmv_f_s_f16m2_f16(a));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh psqrt(const PacketMul2Xh& a) {
-  return __riscv_vfsqrt_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh psqrt(const Packet2Xh& a) {
+  return __riscv_vfsqrt_v_f16m2(a, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh print<PacketMul2Xh>(const PacketMul2Xh& a) {
-  const PacketMul2Xh limit = pset1<PacketMul2Xh>(static_cast<Eigen::half>(1 << 10));
-  const PacketMul2Xh abs_a = pabs(a);
+EIGEN_STRONG_INLINE Packet2Xh print<Packet2Xh>(const Packet2Xh& a) {
+  const Packet2Xh limit = pset1<Packet2Xh>(static_cast<Eigen::half>(1 << 10));
+  const Packet2Xh abs_a = pabs(a);

-  PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits<PacketMul2Xh>::size);
-  const PacketMul2Xh x = __riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits<PacketMul2Xh>::size);
-  const PacketMul2Xh new_x = __riscv_vfcvt_f_x_v_f16m2(
-      __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits<PacketMul2Xh>::size), unpacket_traits<PacketMul2Xh>::size);
+  PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits<Packet2Xh>::size);
+  const Packet2Xh x = __riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits<Packet2Xh>::size);
+  const Packet2Xh new_x = __riscv_vfcvt_f_x_v_f16m2(
+      __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits<Packet2Xh>::size), unpacket_traits<Packet2Xh>::size);

-  mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits<PacketMul2Xh>::size);
-  PacketMul2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits<PacketMul2Xh>::size);
+  mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits<Packet2Xh>::size);
+  Packet2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pfloor<PacketMul2Xh>(const PacketMul2Xh& a) {
-  PacketMul2Xh tmp = print(a);
+EIGEN_STRONG_INLINE Packet2Xh pfloor<Packet2Xh>(const Packet2Xh& a) {
+  Packet2Xh tmp = print(a);
   // If greater, subtract one.
-  PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<PacketMul2Xh>::size);
+  PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) {
-  PacketMul2Xsu idx =
-      __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<PacketMul2Xh>::size),
-                             unpacket_traits<PacketMul2Xh>::size - 1, unpacket_traits<PacketMul2Xh>::size);
-  return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh preverse(const Packet2Xh& a) {
+  Packet2Xsu idx =
+      __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits<Packet2Xh>::size),
+                             unpacket_traits<Packet2Xh>::size - 1, unpacket_traits<Packet2Xh>::size);
+  return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE Eigen::half predux<PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Eigen::half predux<Packet2Xh>(const Packet2Xh& a) {
   return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1(
-      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<PacketMul2Xh>::size / 4),
-      unpacket_traits<PacketMul2Xh>::size)));
+      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<Packet2Xh>::size / 4),
+      unpacket_traits<Packet2Xh>::size)));
 }

 template <>
-EIGEN_STRONG_INLINE Eigen::half predux_mul<PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet2Xh>(const Packet2Xh& a) {
   return predux_mul<PacketXh>(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1),
                                                      unpacket_traits<PacketXh>::size));
 }

 template <>
-EIGEN_STRONG_INLINE Eigen::half predux_min<PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet2Xh>(const Packet2Xh& a) {
   return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1(
-      a, __riscv_vfmv_v_f_f16m1((std::numeric_limits<_Float16>::max)(), unpacket_traits<PacketMul2Xh>::size / 4),
-      unpacket_traits<PacketMul2Xh>::size)));
+      a, __riscv_vfmv_v_f_f16m1((std::numeric_limits<_Float16>::max)(), unpacket_traits<Packet2Xh>::size / 4),
+      unpacket_traits<Packet2Xh>::size)));
 }

 template <>
-EIGEN_STRONG_INLINE Eigen::half predux_max<PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet2Xh>(const Packet2Xh& a) {
   return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1(
-      a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits<_Float16>::max)(), unpacket_traits<PacketMul2Xh>::size / 4),
-      unpacket_traits<PacketMul2Xh>::size)));
+      a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits<_Float16>::max)(), unpacket_traits<Packet2Xh>::size / 4),
+      unpacket_traits<Packet2Xh>::size)));
 }
 template <int N>
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketMul2Xh, N>& kernel) {
-  Eigen::half buffer[unpacket_traits<PacketMul2Xh>::size * N];
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xh, N>& kernel) {
+  Eigen::half buffer[unpacket_traits<Packet2Xh>::size * N];
   int i = 0;

   for (i = 0; i < N; i++) {
     __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i],
-                   unpacket_traits<PacketMul2Xh>::size);
+                   unpacket_traits<Packet2Xh>::size);
   }

   for (i = 0; i < N; i++) {
     kernel.packet[i] =
-        __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits<PacketMul2Xh>::size]),
-                              unpacket_traits<PacketMul2Xh>::size);
+        __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits<Packet2Xh>::size]),
+                              unpacket_traits<Packet2Xh>::size);
   }
 }

-EIGEN_STRONG_INLINE PacketMul4Xf half2float(const PacketMul2Xh& a) {
-  return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf half2float(const Packet2Xh& a) {
+  return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
 }

-EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) {
-  return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh float2half(const Packet4Xf& a) {
+  return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits<Packet2Xh>::size);
 }

-template <typename Packet = PacketMul2Xh>
+template <typename Packet = Packet2Xh>
 EIGEN_STRONG_INLINE
-    typename std::enable_if<std::is_same<Packet, PacketMul2Xh>::value && (unpacket_traits<PacketMul2Xh>::size % 8) == 0,
+    typename std::enable_if<std::is_same<Packet, Packet2Xh>::value && (unpacket_traits<Packet2Xh>::size % 8) == 0,
                             PacketXh>::type
-    predux_half_dowto4(const PacketMul2Xh& a) {
+    predux_half_dowto4(const Packet2Xh& a) {
   return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1),
                                 unpacket_traits<PacketXh>::size);
 }

-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pcos)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexp)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexpm1)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog1p)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog2)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, preciprocal)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, prsqrt)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, psin)
-F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, ptanh)
-
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pcos)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexp)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexpm1)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog1p)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog2)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, preciprocal)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, prsqrt)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, psin)
-F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, ptanh)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pcos)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pexp)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pexpm1)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog1p)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog2)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, preciprocal)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, prsqrt)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, psin)
+F16_PACKET_FUNCTION(Packet2Xf, PacketXh, ptanh)
+
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pcos)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexp)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexpm1)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog1p)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog2)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, preciprocal)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, prsqrt)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, psin)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, ptanh)
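Each F16_PACKET_FUNCTION line above generates a half-precision math function that widens to float, evaluates there, and narrows back through the half2float/float2half helpers defined earlier. A sketch of the assumed expansion for one of them (the actual macro is defined elsewhere in Eigen and may differ in details):

  // Assumed shape of F16_PACKET_FUNCTION(Packet2Xf, PacketXh, psin):
  template <>
  EIGEN_STRONG_INLINE PacketXh psin<PacketXh>(const PacketXh& a) {
    return float2half(psin<Packet2Xf>(half2float(a)));  // compute in f32, narrow to f16
  }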

 /********************************* casting ************************************/

@@ -871,27 +871,27 @@ EIGEN_STRONG_INLINE PacketXs preinterpret<PacketXs, PacketXh>(const PacketXh& a)
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketMul2Xs, PacketMul2Xh>(const PacketMul2Xs& a) {
-  return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits<PacketMul2Xh>::size);
+EIGEN_STRONG_INLINE Packet2Xh pcast<Packet2Xs, Packet2Xh>(const Packet2Xs& a) {
+  return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits<Packet2Xh>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketMul2Xh, PacketMul2Xs>(const PacketMul2Xh& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits<PacketMul2Xs>::size);
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet2Xh, Packet2Xs>(const Packet2Xh& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits<Packet2Xs>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh preinterpret<PacketMul2Xh, PacketMul2Xs>(const PacketMul2Xs& a) {
+EIGEN_STRONG_INLINE Packet2Xh preinterpret<Packet2Xh, Packet2Xs>(const Packet2Xs& a) {
   return __riscv_vreinterpret_v_i16m2_f16m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs preinterpret<PacketMul2Xs, PacketMul2Xh>(const PacketMul2Xh& a) {
+EIGEN_STRONG_INLINE Packet2Xs preinterpret<Packet2Xs, Packet2Xh>(const Packet2Xh& a) {
   return __riscv_vreinterpret_v_f16m2_i16m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketXh, PacketMul4Xs>(const PacketXh& a, const PacketXh& b, const PacketXh& c,
-                                                               const PacketXh& d) {
+EIGEN_STRONG_INLINE Packet4Xs pcast<PacketXh, Packet4Xs>(const PacketXh& a, const PacketXh& b, const PacketXh& c,
+                                                         const PacketXh& d) {
   return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
                                        __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size),
@@ -900,18 +900,18 @@ EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketXh, PacketMul4Xs>(const PacketXh& a
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXs, PacketMul2Xh>(const PacketXs& a, const PacketXs& b) {
+EIGEN_STRONG_INLINE Packet2Xh pcast<PacketXs, Packet2Xh>(const PacketXs& a, const PacketXs& b) {
   return __riscv_vcreate_v_f16m1_f16m2(__riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXs>::size),
                                        __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits<PacketXs>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xh pcast<PacketXh, PacketMul2Xh>(const PacketXh& a, const PacketXh& b) {
+EIGEN_STRONG_INLINE Packet2Xh pcast<PacketXh, Packet2Xh>(const PacketXh& a, const PacketXh& b) {
   return __riscv_vcreate_v_f16m1_f16m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketXh, PacketMul2Xs>(const PacketXh& a, const PacketXh& b) {
+EIGEN_STRONG_INLINE Packet2Xs pcast<PacketXh, Packet2Xs>(const PacketXh& a, const PacketXh& b) {
   return __riscv_vcreate_v_i16m1_i16m2(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
                                        __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
 }

diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h
index eeb9141b4..2b0d3db47 100644
--- a/Eigen/src/Core/arch/RVV10/TypeCasting.h
+++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h
@@ -29,115 +29,115 @@ struct type_casting_traits {
 };

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf pcast<PacketMul1Xi, PacketMul1Xf>(const PacketMul1Xi& a) {
-  return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size);
+EIGEN_STRONG_INLINE Packet1Xf pcast<Packet1Xi, Packet1Xf>(const Packet1Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xi pcast<PacketMul1Xf, PacketMul1Xi>(const PacketMul1Xf& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketMul1Xi>::size);
+EIGEN_STRONG_INLINE Packet1Xi pcast<Packet1Xf, Packet1Xi>(const Packet1Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<Packet1Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xf preinterpret<PacketMul1Xf, PacketMul1Xi>(const PacketMul1Xi& a) {
+EIGEN_STRONG_INLINE Packet1Xf preinterpret<Packet1Xf, Packet1Xi>(const Packet1Xi& a) {
   return __riscv_vreinterpret_v_i32m1_f32m1(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xi preinterpret<PacketMul1Xi, PacketMul1Xf>(const PacketMul1Xf& a) {
+EIGEN_STRONG_INLINE Packet1Xi preinterpret<Packet1Xi, Packet1Xf>(const Packet1Xf& a) {
   return __riscv_vreinterpret_v_f32m1_i32m1(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketMul4Xi, PacketMul4Xf>(const PacketMul4Xi& a) {
-  return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits<PacketMul4Xf>::size);
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet4Xi, Packet4Xf>(const Packet4Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits<Packet4Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketMul4Xf, PacketMul4Xi>(const PacketMul4Xf& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits<PacketMul4Xi>::size);
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet4Xf, Packet4Xi>(const Packet4Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits<Packet4Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf preinterpret<PacketMul4Xf, PacketMul4Xi>(const PacketMul4Xi& a) {
+EIGEN_STRONG_INLINE Packet4Xf preinterpret<Packet4Xf, Packet4Xi>(const Packet4Xi& a) {
   return __riscv_vreinterpret_v_i32m4_f32m4(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi preinterpret<PacketMul4Xi, PacketMul4Xf>(const PacketMul4Xf& a) {
+EIGEN_STRONG_INLINE Packet4Xi preinterpret<Packet4Xi, Packet4Xf>(const Packet4Xf& a) {
   return __riscv_vreinterpret_v_f32m4_i32m4(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketMul2Xi, PacketMul2Xf>(const PacketMul2Xi& a) {
-  return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits<PacketMul2Xf>::size);
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet2Xi, Packet2Xf>(const Packet2Xi& a) {
+  return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits<Packet2Xf>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketMul2Xf, PacketMul2Xi>(const PacketMul2Xf& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits<PacketMul2Xi>::size);
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet2Xf, Packet2Xi>(const Packet2Xf& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits<Packet2Xi>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf preinterpret<PacketMul2Xf, PacketMul2Xi>(const PacketMul2Xi& a) {
+EIGEN_STRONG_INLINE Packet2Xf preinterpret<Packet2Xf, Packet2Xi>(const Packet2Xi& a) {
   return __riscv_vreinterpret_v_i32m2_f32m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi preinterpret<PacketMul2Xi, PacketMul2Xf>(const PacketMul2Xf& a) {
+EIGEN_STRONG_INLINE Packet2Xi preinterpret<Packet2Xi, Packet2Xf>(const Packet2Xf& a) {
   return __riscv_vreinterpret_v_f32m2_i32m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketMul1Xi, PacketMul4Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c,
-                                                                   const PacketMul1Xi& d) {
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet1Xi, Packet4Xi>(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c,
+                                                          const Packet1Xi& d) {
   return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketMul1Xi, PacketMul4Xf>(const PacketMul1Xi& a, const PacketMul1Xi& b, const PacketMul1Xi& c,
-                                                                   const PacketMul1Xi& d) {
-  return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size),
-                                       __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<PacketMul1Xf>::size),
-                                       __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits<PacketMul1Xf>::size),
-                                       __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits<PacketMul1Xf>::size));
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet1Xi, Packet4Xf>(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c,
+                                                          const Packet1Xi& d) {
+  return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<Packet1Xf>::size),
+                                       __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<Packet1Xf>::size),
+                                       __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits<Packet1Xf>::size),
+                                       __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits<Packet1Xf>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xf pcast<PacketMul1Xf, PacketMul4Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c,
-                                                                   const PacketMul1Xf& d) {
+EIGEN_STRONG_INLINE Packet4Xf pcast<Packet1Xf, Packet4Xf>(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c,
+                                                          const Packet1Xf& d) {
   return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xi pcast<PacketMul1Xf, PacketMul4Xi>(const PacketMul1Xf& a, const PacketMul1Xf& b, const PacketMul1Xf& c,
-                                                                   const PacketMul1Xf& d) {
-  return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketMul1Xi>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<PacketMul1Xi>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits<PacketMul1Xi>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits<PacketMul1Xi>::size));
+EIGEN_STRONG_INLINE Packet4Xi pcast<Packet1Xf, Packet4Xi>(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c,
+                                                          const Packet1Xf& d) {
+  return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<Packet1Xi>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<Packet1Xi>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits<Packet1Xi>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits<Packet1Xi>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketMul1Xi, PacketMul2Xi>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet1Xi, Packet2Xi>(const Packet1Xi& a, const Packet1Xi& b) {
   return __riscv_vcreate_v_i32m1_i32m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketMul1Xi, PacketMul2Xf>(const PacketMul1Xi& a, const PacketMul1Xi& b) {
-  return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<PacketMul1Xf>::size),
-                                       __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<PacketMul1Xf>::size));
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet1Xi, Packet2Xf>(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits<Packet1Xf>::size),
+                                       __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits<Packet1Xf>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xf pcast<PacketMul1Xf, PacketMul2Xf>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
+EIGEN_STRONG_INLINE Packet2Xf pcast<Packet1Xf, Packet2Xf>(const Packet1Xf& a, const Packet1Xf& b) {
   return __riscv_vcreate_v_f32m1_f32m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xi pcast<PacketMul1Xf, PacketMul2Xi>(const PacketMul1Xf& a, const PacketMul1Xf& b) {
-  return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<PacketMul1Xi>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<PacketMul1Xi>::size));
+EIGEN_STRONG_INLINE Packet2Xi pcast<Packet1Xf, Packet2Xi>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits<Packet1Xi>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits<Packet1Xi>::size));
 }

 /********************************* 64 bits ************************************/

@@ -153,128 +153,128 @@ struct type_casting_traits {
 };

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd pcast<PacketMul1Xl, PacketMul1Xd>(const PacketMul1Xl& a) {
-  return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size);
+EIGEN_STRONG_INLINE Packet1Xd pcast<Packet1Xl, Packet1Xd>(const Packet1Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl pcast<PacketMul1Xd, PacketMul1Xl>(const PacketMul1Xd& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketMul1Xl>::size);
+EIGEN_STRONG_INLINE Packet1Xl pcast<Packet1Xd, Packet1Xl>(const Packet1Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<Packet1Xl>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xd preinterpret<PacketMul1Xd, PacketMul1Xl>(const PacketMul1Xl& a) {
+EIGEN_STRONG_INLINE Packet1Xd preinterpret<Packet1Xd, Packet1Xl>(const Packet1Xl& a) {
   return __riscv_vreinterpret_v_i64m1_f64m1(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul1Xl preinterpret<PacketMul1Xl, PacketMul1Xd>(const PacketMul1Xd& a) {
+EIGEN_STRONG_INLINE Packet1Xl preinterpret<Packet1Xl, Packet1Xd>(const Packet1Xd& a) {
   return __riscv_vreinterpret_v_f64m1_i64m1(a);
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketMul4Xl, PacketMul4Xd>(const PacketMul4Xl& a) {
-  return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits<PacketMul4Xd>::size);
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet4Xl, Packet4Xd>(const Packet4Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits<Packet4Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketMul4Xd, PacketMul4Xl>(const PacketMul4Xd& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits<PacketMul4Xl>::size);
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet4Xd, Packet4Xl>(const Packet4Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits<Packet4Xl>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd preinterpret<PacketMul4Xd, PacketMul4Xl>(const PacketMul4Xl& a) {
+EIGEN_STRONG_INLINE Packet4Xd preinterpret<Packet4Xd, Packet4Xl>(const Packet4Xl& a) {
   return __riscv_vreinterpret_v_i64m4_f64m4(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl preinterpret<PacketMul4Xl, PacketMul4Xd>(const PacketMul4Xd& a) {
+EIGEN_STRONG_INLINE Packet4Xl preinterpret<Packet4Xl, Packet4Xd>(const Packet4Xd& a) {
   return __riscv_vreinterpret_v_f64m4_i64m4(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketMul2Xl, PacketMul2Xd>(const PacketMul2Xl& a) {
-  return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits<PacketMul2Xd>::size);
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet2Xl, Packet2Xd>(const Packet2Xl& a) {
+  return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits<Packet2Xd>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketMul2Xd, PacketMul2Xl>(const PacketMul2Xd& a) {
-  return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits<PacketMul2Xl>::size);
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet2Xd, Packet2Xl>(const Packet2Xd& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits<Packet2Xl>::size);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd preinterpret<PacketMul2Xd, PacketMul2Xl>(const PacketMul2Xl& a) {
+EIGEN_STRONG_INLINE Packet2Xd preinterpret<Packet2Xd, Packet2Xl>(const Packet2Xl& a) {
   return __riscv_vreinterpret_v_i64m2_f64m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xl preinterpret<PacketMul2Xl, PacketMul2Xd>(const PacketMul2Xd& a) {
+EIGEN_STRONG_INLINE Packet2Xl preinterpret<Packet2Xl, Packet2Xd>(const Packet2Xd& a) {
   return __riscv_vreinterpret_v_f64m2_i64m2(a);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketMul1Xl, PacketMul4Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c,
-                                                                   const PacketMul1Xl& d) {
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet1Xl, Packet4Xl>(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c,
+                                                          const Packet1Xl& d) {
   return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketMul1Xl, PacketMul4Xd>(const PacketMul1Xl& a, const PacketMul1Xl& b, const PacketMul1Xl& c,
-                                                                   const PacketMul1Xl& d) {
-  return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size),
-                                       __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<PacketMul1Xd>::size),
-                                       __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits<PacketMul1Xd>::size),
-                                       __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet1Xl, Packet4Xd>(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c,
+                                                          const Packet1Xl& d) {
+  return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<Packet1Xd>::size),
+                                       __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<Packet1Xd>::size),
+                                       __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits<Packet1Xd>::size),
+                                       __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits<Packet1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xd pcast<PacketMul1Xd, PacketMul4Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c,
-                                                                   const PacketMul1Xd& d) {
+EIGEN_STRONG_INLINE Packet4Xd pcast<Packet1Xd, Packet4Xd>(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c,
+                                                          const Packet1Xd& d) {
   return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xl pcast<PacketMul1Xd, PacketMul4Xl>(const PacketMul1Xd& a, const PacketMul1Xd& b, const PacketMul1Xd& c,
-                                                                   const PacketMul1Xd& d) {
-  return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketMul1Xl>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<PacketMul1Xl>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits<PacketMul1Xl>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits<PacketMul1Xl>::size));
+EIGEN_STRONG_INLINE Packet4Xl pcast<Packet1Xd, Packet4Xl>(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c,
+                                                          const Packet1Xd& d) {
+  return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<Packet1Xl>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<Packet1Xl>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits<Packet1Xl>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits<Packet1Xl>::size));
 }
 template <>
-EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketMul1Xl, PacketMul2Xl>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet1Xl, Packet2Xl>(const Packet1Xl& a, const Packet1Xl& b) {
   return __riscv_vcreate_v_i64m1_i64m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketMul1Xl, PacketMul2Xd>(const PacketMul1Xl& a, const PacketMul1Xl& b) {
-  return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<PacketMul1Xd>::size),
-                                       __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<PacketMul1Xd>::size));
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet1Xl, Packet2Xd>(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits<Packet1Xd>::size),
+                                       __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits<Packet1Xd>::size));
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xd pcast<PacketMul1Xd, PacketMul2Xd>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
+EIGEN_STRONG_INLINE Packet2Xd pcast<Packet1Xd, Packet2Xd>(const Packet1Xd& a, const Packet1Xd& b) {
   return __riscv_vcreate_v_f64m1_f64m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xl pcast<PacketMul1Xd, PacketMul2Xl>(const PacketMul1Xd& a, const PacketMul1Xd& b) {
-  return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<PacketMul1Xl>::size),
-                                       __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<PacketMul1Xl>::size));
+EIGEN_STRONG_INLINE Packet2Xl pcast<Packet1Xd, Packet2Xl>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits<Packet1Xl>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits<Packet1Xl>::size));
 }

 /********************************* 16 bits ************************************/

 template <>
-EIGEN_STRONG_INLINE PacketMul2Xs pcast<PacketMul1Xs, PacketMul2Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b) {
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet1Xs, Packet2Xs>(const Packet1Xs& a, const Packet1Xs& b) {
   return __riscv_vcreate_v_i16m1_i16m2(a, b);
 }

 template <>
-EIGEN_STRONG_INLINE PacketMul4Xs pcast<PacketMul1Xs, PacketMul4Xs>(const PacketMul1Xs& a, const PacketMul1Xs& b, const PacketMul1Xs& c,
-                                                                   const PacketMul1Xs& d) {
+EIGEN_STRONG_INLINE Packet4Xs pcast<Packet1Xs, Packet4Xs>(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c,
+                                                          const Packet1Xs& d) {
   return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d);
 }

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 5cd244dd3..f9fa5c9cf 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -460,7 +460,7 @@ extern "C" {
 #if defined(__riscv_zvfh) && defined(__riscv_zfh)
 #define EIGEN_VECTORIZE_RVV10FP16
 #elif defined(__riscv_zvfh)
-#error "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#warning "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
 #endif

 #endif  // defined(EIGEN_ARCH_RISCV)
--
GitLab
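Note that #warning is a GCC/Clang extension (it only became standard in C++23), so the follow-up patch below wraps it in a compiler check and falls back to #pragma message for MSVC. The same guard in isolation:

  // Portable compile-time diagnostic: #warning on GCC/Clang, #pragma message on MSVC.
  #if defined(__GNUC__) || defined(__clang__)
  #warning "Eigen::half vectorization disabled: Zfh and Zvfh extensions required."
  #elif defined(_MSC_VER)
  #pragma message("Eigen::half vectorization disabled: Zfh and Zvfh extensions required.")
  #endif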
From 6453027a4eee8dcc46a6f99b8a8bf81db52431e2 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Mon, 17 Nov 2025 14:35:41 +0000
Subject: [PATCH 20/21] Make warning message compiler independent.

---
 Eigen/src/Core/util/ConfigureVectorization.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index f9fa5c9cf..80ad82ef1 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -460,7 +460,11 @@ extern "C" {
 #if defined(__riscv_zvfh) && defined(__riscv_zfh)
 #define EIGEN_VECTORIZE_RVV10FP16
 #elif defined(__riscv_zvfh)
+#if defined(__GNUC__) || defined(__clang__)
 #warning "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#elif defined(_MSC_VER)
+#pragma message("The Eigen::Half vectorization requires Zfh and Zvfh extensions.")
+#endif
 #endif

 #endif  // defined(EIGEN_ARCH_RISCV)
--
GitLab

From 169bc10cbc1e6f6def99be604f7765d5e8419756 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Thu, 20 Nov 2025 15:05:19 +0000
Subject: [PATCH 21/21] Remove temp override for EIGEN_RISCV64_DEFAULT_LMUL

---
 Eigen/src/Core/arch/RVV10/PacketMath.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
index f7560b8b4..54db62634 100644
--- a/Eigen/src/Core/arch/RVV10/PacketMath.h
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -25,10 +25,6 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32

-// Temporarily make LMUL = 1
-#undef EIGEN_RISCV64_DEFAULT_LMUL
-#define EIGEN_RISCV64_DEFAULT_LMUL 1
-
 template <typename Scalar, int VectorLength, int VectorLMul>
 struct rvv_packet_size_selector {
   enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) };
--
GitLab
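With the temporary LMUL = 1 override gone, packet sizes again follow rvv_packet_size_selector in the last hunk: size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT); e.g. with VLEN = 128 bits and LMUL = 2, a float packet holds 128 * 2 / 32 = 8 lanes. A compile-time check of that arithmetic (template-parameter kinds are assumed; the formula is the patch's own):

  #include <climits>

  template <typename Scalar, int VectorLength, int VectorLMul>
  struct rvv_packet_size_selector {
    enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) };
  };

  // With VLEN = 128 bits: an LMUL=1 float packet has 4 lanes, LMUL=2 has 8,
  // and an LMUL=2 double packet has 4.
  static_assert(rvv_packet_size_selector<float, 128, 1>::size == 4, "");
  static_assert(rvv_packet_size_selector<float, 128, 2>::size == 8, "");
  static_assert(rvv_packet_size_selector<double, 128, 2>::size == 4, "");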