diff --git a/Eigen/Core b/Eigen/Core
index 6ae069a92c08ba6dc26493e20e70e7cd08297ecc..3a238407f22de5992f046f148ab92b07d37d461a 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -249,6 +249,14 @@ using std::ptrdiff_t;
 #include "src/Core/arch/SVE/PacketMath.h"
 #include "src/Core/arch/SVE/TypeCasting.h"
 #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_RVV10
+#include "src/Core/arch/RVV10/PacketMath.h"
+#include "src/Core/arch/RVV10/TypeCasting.h"
+#include "src/Core/arch/RVV10/MathFunctions.h"
+#include "src/Core/arch/RVV10/Complex.h"
+#if defined EIGEN_VECTORIZE_RVV10FP16
+#include "src/Core/arch/RVV10/PacketMathFP16.h"
+#endif
 #elif defined EIGEN_VECTORIZE_ZVECTOR
 #include "src/Core/arch/ZVector/PacketMath.h"
 #include "src/Core/arch/ZVector/MathFunctions.h"
@@ -396,6 +404,10 @@ using std::ptrdiff_t;
 #include "src/Core/arch/AVX512/GemmKernel.h"
 #endif
 
+#if defined(EIGEN_VECTORIZE_RVV10)
+#include "src/Core/arch/RVV10/GeneralBlockPanelKernel.h"
+#endif
+
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
 #include "src/Core/PartialReduxEvaluator.h"
diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h
index 36f0a9d74de66b87babdaaf83c78523f50c50961..093ceb43538e6ed7ad77bc6795ce5f6aea0c6641 100644
--- a/Eigen/src/Core/AssignEvaluator.h
+++ b/Eigen/src/Core/AssignEvaluator.h
@@ -64,8 +64,13 @@ struct copy_using_evaluator_traits {
   static constexpr int OuterStride = outer_stride_at_compile_time<Dst>::ret;
 
   // TODO distinguish between linear traversal and inner-traversals
+#ifdef EIGEN_RISCV64_USE_RVV10
+  using LinearPacketType = typename find_best_packet::type;
+  using InnerPacketType = typename find_best_packet::type;
+#else
   using LinearPacketType = typename find_best_packet::type;
   using InnerPacketType = typename find_best_packet::type;
+#endif
   static constexpr int LinearPacketSize = unpacket_traits<LinearPacketType>::size;
   static constexpr int InnerPacketSize = unpacket_traits<InnerPacketType>::size;
 
diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h
index e3af2d2020bd938707a95bf69e102c7e1b957084..9fa8e4286426a82d787b7225c74b0d291912ecb4 100644
--- a/Eigen/src/Core/CoreEvaluators.h
+++ b/Eigen/src/Core/CoreEvaluators.h
@@ -1367,7 +1367,11 @@ struct evaluator>
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
   typedef typename XprType::Scalar Scalar;
   // TODO: should check for smaller packet types once we can handle multi-sized packet types
+#ifdef EIGEN_RISCV64_USE_RVV10
+  typedef typename packet_traits::type PacketScalar;
+#else
   typedef typename packet_traits<Scalar>::type PacketScalar;
+#endif
 
   enum {
     CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h
index d45cb4bf4a49b9cc5e4ceb5e692c5c2a04f7fbdd..e42baf75d45fd5ce4349a7d72c87734bc068ddc1 100644
--- a/Eigen/src/Core/GenericPacketMath.h
+++ b/Eigen/src/Core/GenericPacketMath.h
@@ -105,7 +105,7 @@ struct default_packet_traits {
   };
 };
 
-template <typename T>
+template
 struct packet_traits : default_packet_traits {
   typedef T type;
   typedef T half;
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index ce8d954bffc4892f0eb398299bd0161e06586c42..db820ba6d89ef8b8d99006073b8c401436bcefb7 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -533,8 +533,13 @@ struct product_evaluator, ProductTag, DenseShape,
     MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
   };
 
+#ifdef EIGEN_RISCV64_USE_RVV10
+  typedef typename find_best_packet::type LhsVecPacketType;
+  typedef typename find_best_packet::type
RhsVecPacketType; +#else typedef typename find_best_packet::type LhsVecPacketType; typedef typename find_best_packet::type RhsVecPacketType; +#endif enum { diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 4e9ab0e4f89191ac10a2c0b76c963b20179fcd30..716a7c00e8ef976c7342f6bbfcfc5bd820f36d8c 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -29,7 +29,11 @@ namespace internal { template struct redux_traits { public: +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename find_best_packet::type PacketType; +#else typedef typename find_best_packet::type PacketType; +#endif enum { PacketSize = unpacket_traits::size, InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime : Evaluator::MaxRowsAtCompileTime, diff --git a/Eigen/src/Core/arch/RVV10/Complex.h b/Eigen/src/Core/arch/RVV10/Complex.h new file mode 100644 index 0000000000000000000000000000000000000000..73ef50cc57d5a1dda6647c226988dc806264db18 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/Complex.h @@ -0,0 +1,713 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_COMPLEX_RVV10_H +#define EIGEN_COMPLEX_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { + +namespace internal { + +/********************************* float32 ************************************/ + +struct PacketXcf { + EIGEN_STRONG_INLINE PacketXcf() {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketXf& _real, const PacketXf& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcf(const PacketMul2Xf& a) + : real(__riscv_vget_v_f32m2_f32m1(a, 0)), imag(__riscv_vget_v_f32m2_f32m1(a, 1)) {} + PacketXf real; + PacketXf imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + typedef PacketXcf type; + typedef PacketXcf half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcf half; + typedef PacketMul2Xf as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcf pcast(const PacketMul2Xf& a) { + return PacketXcf(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXcf& a) { + return __riscv_vcreate_v_f32m1_f32m2(a.real, a.imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pset1(const std::complex& from) { + PacketXf real = pset1(from.real()); + PacketXf imag = pset1(from.imag()); + return PacketXcf(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf padd(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psub(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pnegate(const 
PacketXcf& a) { + return PacketXcf(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pconj(const PacketXcf& a) { + return PacketXcf( + a.real, __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vx_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(a.imag), + 0x80000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& a, const PacketXcf& b) { + PacketXf v1 = pmul(a.real, b.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmul(a.real, b.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& a, const PacketXcf& b, const PacketXcf& c) { + PacketXf v1 = pmadd(a.real, b.real, c.real); + PacketXf v2 = pmul(a.imag, b.imag); + PacketXf v3 = pmadd(a.real, b.imag, c.imag); + PacketXf v4 = pmul(a.imag, b.real); + return PacketXcf(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcmp_eq(const PacketXcf& a, const PacketXcf& b) { + PacketMask32 eq_both = pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXf res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcf(res, res); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pand(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pand(a.real, b.real), pand(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf por(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(por(a.real, b.real), por(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pxor(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pxor(a.real, b.real), pxor(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pandnot(const PacketXcf& a, const PacketXcf& b) { + return PacketXcf(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pload(const std::complex* from) { + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadu(const std::complex* from) { + vfloat32m1x2_t res = __riscv_vlseg2e32_v_f32m1x2((const float*)from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), + __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploaddup(const std::complex* from) { + PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... 
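+  // imag_idx = real_idx + sizeof(float): byte offsets of the imaginary parts of the duplicated elements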
+ return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf ploadquad(const std::complex* from) { + PacketXu real_idx = __riscv_vid_v_u32m1(unpacket_traits::size); + real_idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(real_idx, 0xfffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + PacketXu imag_idx = __riscv_vadd_vx_u32m1(real_idx, sizeof(float), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(float) 2*sizeof(float) 4*sizeof(float) 4*sizeof(float) ... + return PacketXcf(__riscv_vloxei32_v_f32m1((const float*)from, real_idx, unpacket_traits::size), + __riscv_vloxei32_v_f32m1((const float*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcf& from) { + vfloat32m1x2_t vx2 = __riscv_vundefined_f32m1x2(); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f32m1_f32m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e32_v_f32m1x2((float*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcf pgather, PacketXcf>(const std::complex* from, + Index stride) { + vfloat32m1x2_t res = + __riscv_vlsseg2e32_v_f32m1x2((const float*)from, 2 * stride * sizeof(float), unpacket_traits::size); + return PacketXcf(__riscv_vget_v_f32m1x2_f32m1(res, 0), __riscv_vget_v_f32m1x2_f32m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcf>(std::complex* to, const PacketXcf& from, + Index stride) { + vfloat32m1x2_t from_rvv_type = __riscv_vundefined_f32m1x2(); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f32m1_f32m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e32_v_f32m1x2((float*)to, 2 * stride * sizeof(float), from_rvv_type, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcf& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf preverse(const PacketXcf& a) { + return PacketXcf(preverse(a.real), preverse(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pcplxflip(const PacketXcf& a) { + return PacketXcf(a.imag, a.real); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcf& a) { + return std::complex(predux(a.real), predux(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf pdiv(const PacketXcf& a, const PacketXcf& b) { + PacketXcf b_conj = pconj(b); + PacketXcf dividend = pmul(a, b_conj); + PacketXf divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcf(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer_real[unpacket_traits::size * N]; + float buffer_imag[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer_real[i], N * sizeof(float), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse32(&buffer_imag[i], N 
* sizeof(float), kernel.packet[i].imag, unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i].real = + __riscv_vle32_v_f32m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle32_v_f32m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE Packet psqrt_complex_rvv(const Packet& a) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; + + // Computes the principal sqrt of the complex numbers in the input. + // + // For example, for packets containing 2 complex numbers stored in + // [real0, real1, imag0, imag1] format + // a = [a0, a1] = [x0, x1, y0, y1], + // where x0 = real(a0), y0 = imag(a0) etc., this function returns + // b = [b0, b1] = [u0, u1, v0, v1], + // such that b0^2 = a0, b1^2 = a1. + // + // To derive the formula for the complex square roots, let's consider the equation for + // a single complex square root of the number x + i*y. We want to find real numbers + // u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*v. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y. + // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = 0.5 * (y / u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) + // + // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as + // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , + + // In the following, without lack of generality, we have annotated the code, assuming + // that the input is a packet of 2 complex numbers. + // + // Step 1. Compute l = [l0, l1], where + // l0 = sqrt(x0^2 + y0^2), l1 = sqrt(x1^2 + y1^2) + // To avoid over- and underflow, we use the stable formula for each hypotenuse + // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), + // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. + + Packet a_abs = Packet(pabs(a.real), pabs(a.imag)); + RealPacket a_max = pmax(a_abs.real, a_abs.imag); + RealPacket a_min = pmin(a_abs.real, a_abs.imag); + + PacketMask a_min_zero_mask = pcmp_eq_mask(a_min, pzero(a_min)); + PacketMask a_max_zero_mask = pcmp_eq_mask(a_max, pzero(a_max)); + RealPacket r = pdiv(a_min, a_max); + + const RealPacket cst_one = pset1(RealScalar(1)); + const RealPacket cst_true = ptrue(cst_one); + RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r)))); + // Set l to a_max if a_min is zero. + l = pselect(a_min_zero_mask, a_max, l); + + // Step 2. Compute [rho0, rho1], where + // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 = sqrt(0.5 * (l1 + |x1|)) + // We don't care about the imaginary parts computed here. They will be overwritten later. + const RealPacket cst_half = pset1(RealScalar(0.5)); + RealPacket rho = psqrt(pmul(cst_half, padd(a_abs.real, l))); + + // Step 3. Compute [rho0, rho1, eta0, eta1], where + // eta0 = (y0 / rho0) / 2, and eta1 = (y1 / rho1) / 2. + // set eta = 0 of input is 0 + i0. + RealPacket eta = pselect(a_max_zero_mask, pzero(cst_one), pmul(cst_half, pdiv(a.imag, rho))); + // Compute result for inputs with positive real part. + Packet positive_real_result = Packet(rho, eta); + + // Step 4. 
Compute solution for inputs with negative real part: + // [|eta0| |eta1|, sign(y0)*rho0, sign(y1)*rho1] + const RealPacket cst_imag_sign_mask = pset1(RealScalar(-0.0)); + RealPacket imag_signs = pand(a.imag, cst_imag_sign_mask); + Packet negative_real_result = Packet(pabs(eta), por(rho, imag_signs)); + + // Step 5. Select solution branch based on the sign of the real parts. + PacketMask negative_real_mask_half = pcmp_lt_mask(a.real, pzero(a.real)); + Packet result = Packet(pselect(negative_real_mask_half, negative_real_result.real, positive_real_result.real), + pselect(negative_real_mask_half, negative_real_result.imag, positive_real_result.imag)); + + // Step 6. Handle special cases for infinities: + // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN + // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN + // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y + // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + PacketMask is_real_inf = pcmp_eq_mask(a_abs.real, cst_pos_inf); + // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part. + const Packet cst_one_zero = pset1(Scalar(RealScalar(1.0), RealScalar(0.0))); + Packet real_inf_result = Packet(pmul(a_abs.real, cst_one_zero.real), pmul(a_abs.imag, cst_one_zero.imag)); + real_inf_result = Packet(pselect(negative_real_mask_half, real_inf_result.imag, real_inf_result.real), + pselect(negative_real_mask_half, real_inf_result.real, real_inf_result.imag)); + // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part. + PacketMask is_imag_inf = pcmp_eq_mask(a_abs.imag, cst_pos_inf); + // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan + result = Packet(pselect(pcmp_eq_mask(result.real, result.real), result.real, cst_true), + pselect(pcmp_eq_mask(result.imag, result.imag), result.imag, cst_true)); + + result = Packet(pselect(is_real_inf, real_inf_result.real, result.real), + pselect(is_real_inf, real_inf_result.imag, result.imag)); + + return Packet(pselect(is_imag_inf, cst_pos_inf, result.real), pselect(is_imag_inf, a.imag, result.imag)); +} + +template +EIGEN_STRONG_INLINE Packet plog_complex_rvv(const Packet& x) { + typedef typename unpacket_traits::type Scalar; + typedef typename Scalar::value_type RealScalar; + typedef typename packet_traits::type RealPacket; + typedef typename unpacket_traits::packet_mask PacketMask; + + // log(sqrt(a^2 + b^2)), atan2(b, a) + RealPacket xlogr = plog(psqrt(padd(pmul(x.real, x.real), pmul(x.imag, x.imag)))); + RealPacket ximg = patan2(x.imag, x.real); + + const RealPacket cst_pos_inf = pset1(NumTraits::infinity()); + RealPacket r_abs = pabs(x.real); + RealPacket i_abs = pabs(x.imag); + PacketMask is_r_pos_inf = pcmp_eq_mask(r_abs, cst_pos_inf); + PacketMask is_i_pos_inf = pcmp_eq_mask(i_abs, cst_pos_inf); + PacketMask is_any_inf = por(is_r_pos_inf, is_i_pos_inf); + RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr); + + return Packet(xreal, ximg); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf psqrt(const PacketXcf& a) { + return psqrt_complex_rvv(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXcf plog(const PacketXcf& a) { + return plog_complex_rvv(a); +} + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketMul2Xf& x, const PacketXcf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE 
PacketXcf pmul(const PacketMul2Xf& x, const PacketXcf& y) const { + return PacketXcf(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcf pmadd(const PacketXcf& x, const PacketMul2Xf& y, const PacketXcf& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcf pmul(const PacketXcf& x, const PacketMul2Xf& y) const { + return PacketXcf(Eigen::internal::pmul(pcast(x), y)); + } +}; + +/********************************* double ************************************/ + +struct PacketXcd { + EIGEN_STRONG_INLINE PacketXcd() {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketXd& _real, const PacketXd& _imag) : real(_real), imag(_imag) {} + EIGEN_STRONG_INLINE explicit PacketXcd(const PacketMul2Xd& a) + : real(__riscv_vget_v_f64m2_f64m1(a, 0)), imag(__riscv_vget_v_f64m2_f64m1(a, 1)) {} + PacketXd real; + PacketXd imag; +}; + +template +struct packet_traits, LMul> : default_packet_traits { + typedef PacketXcd type; + typedef PacketXcd half; + enum { + Vectorizable = 1, + AlignedOnScalar = 0, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasSqrt = 1, + HasSign = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasLog = 0, + HasSetLinear = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef std::complex type; + typedef PacketXcd half; + typedef PacketMul2Xd as_real; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXcd pcast(const PacketMul2Xd& a) { + return PacketXcd(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXcd& a) { + return __riscv_vcreate_v_f64m1_f64m2(a.real, a.imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pset1(const std::complex& from) { + PacketXd real = pset1(from.real()); + PacketXd imag = pset1(from.imag()); + return PacketXcd(real, imag); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd padd(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(padd(a.real, b.real), padd(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd psub(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(psub(a.real, b.real), psub(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pnegate(const PacketXcd& a) { + return PacketXcd(pnegate(a.real), pnegate(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pconj(const PacketXcd& a) { + return PacketXcd( + a.real, __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vx_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a.imag), 0x8000000000000000, unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& a, const PacketXcd& b) { + PacketXd v1 = pmul(a.real, b.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmul(a.real, b.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& a, const PacketXcd& b, const PacketXcd& c) { + PacketXd v1 = pmadd(a.real, b.real, c.real); + PacketXd v2 = pmul(a.imag, b.imag); + PacketXd v3 = pmadd(a.real, b.imag, c.imag); + PacketXd v4 = pmul(a.imag, b.real); + return PacketXcd(psub(v1, v2), padd(v3, v4)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pcmp_eq(const PacketXcd& a, const PacketXcd& b) { + PacketMask64 eq_both = 
pand(pcmp_eq_mask(a.real, b.real), pcmp_eq_mask(a.imag, b.imag)); + PacketXd res = pselect(eq_both, ptrue(a.real), pzero(a.real)); + return PacketXcd(res, res); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pand(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pand(a.real, b.real), pand(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd por(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(por(a.real, b.real), por(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pxor(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pxor(a.real, b.real), pxor(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pandnot(const PacketXcd& a, const PacketXcd& b) { + return PacketXcd(pandnot(a.real, b.real), pandnot(a.imag, b.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pload(const std::complex* from) { + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + EIGEN_DEBUG_ALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploadu(const std::complex* from) { + vfloat64m1x2_t res = __riscv_vlseg2e64_v_f64m1x2((const double*)from, unpacket_traits::size); + EIGEN_DEBUG_UNALIGNED_LOAD return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), + __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploaddup(const std::complex* from) { + PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + real_idx = + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffeu, unpacket_traits::size), 3, + unpacket_traits::size); + PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... + return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd ploadquad(const std::complex* from) { + PacketXul real_idx = __riscv_vid_v_u64m1(unpacket_traits::size); + real_idx = + __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(real_idx, 0xfffffffffffffffcu, unpacket_traits::size), 2, + unpacket_traits::size); + PacketXul imag_idx = __riscv_vadd_vx_u64m1(real_idx, sizeof(double), unpacket_traits::size); + // real_idx = 0 0 2*sizeof(double) 2*sizeof(double) 4*sizeof(double) 4*sizeof(double) ... 
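+  // imag_idx = real_idx + sizeof(double): byte offsets of the imaginary parts of the selected elements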
+ return PacketXcd(__riscv_vloxei64_v_f64m1((const double*)from, real_idx, unpacket_traits::size), + __riscv_vloxei64_v_f64m1((const double*)from, imag_idx, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE void pstore >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_ALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const PacketXcd& from) { + vfloat64m1x2_t vx2 = __riscv_vundefined_f64m1x2(); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 0, from.real); + vx2 = __riscv_vset_v_f64m1_f64m1x2(vx2, 1, from.imag); + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vsseg2e64_v_f64m1x2((double*)to, vx2, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXcd pgather, PacketXcd>(const std::complex* from, + Index stride) { + vfloat64m1x2_t res = + __riscv_vlsseg2e64_v_f64m1x2((const double*)from, 2 * stride * sizeof(double), unpacket_traits::size); + return PacketXcd(__riscv_vget_v_f64m1x2_f64m1(res, 0), __riscv_vget_v_f64m1x2_f64m1(res, 1)); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter, PacketXcd>(std::complex* to, const PacketXcd& from, + Index stride) { + vfloat64m1x2_t from_rvv_type = __riscv_vundefined_f64m1x2(); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 0, from.real); + from_rvv_type = __riscv_vset_v_f64m1_f64m1x2(from_rvv_type, 1, from.imag); + __riscv_vssseg2e64_v_f64m1x2((double*)to, 2 * stride * sizeof(double), from_rvv_type, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE std::complex pfirst(const PacketXcd& a) { + return std::complex(pfirst(a.real), pfirst(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd preverse(const PacketXcd& a) { + return PacketXcd(preverse(a.real), preverse(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pcplxflip(const PacketXcd& a) { + return PacketXcd(a.imag, a.real); +} + +template <> +EIGEN_STRONG_INLINE std::complex predux(const PacketXcd& a) { + return std::complex(predux(a.real), predux(a.imag)); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd pdiv(const PacketXcd& a, const PacketXcd& b) { + PacketXcd b_conj = pconj(b); + PacketXcd dividend = pmul(a, b_conj); + PacketXd divider = psub(pmul(b.real, b_conj.real), pmul(b.imag, b_conj.imag)); + return PacketXcd(pdiv(dividend.real, divider), pdiv(dividend.imag, divider)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer_real[unpacket_traits::size * N]; + double buffer_imag[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer_real[i], N * sizeof(double), kernel.packet[i].real, unpacket_traits::size); + __riscv_vsse64(&buffer_imag[i], N * sizeof(double), kernel.packet[i].imag, unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i].real = + __riscv_vle64_v_f64m1(&buffer_real[i * unpacket_traits::size], unpacket_traits::size); + kernel.packet[i].imag = + __riscv_vle64_v_f64m1(&buffer_imag[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXcd psqrt(const PacketXcd& a) { + return psqrt_complex_rvv(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXcd plog(const PacketXcd& a) { + return plog_complex_rvv(a); +} + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const 
PacketMul2Xd& x, const PacketXcd& y, const PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketMul2Xd& x, const PacketXcd& y) const { + return PacketXcd(Eigen::internal::pmul(x, pcast(y))); + } +}; + +template <> +struct conj_helper { + EIGEN_STRONG_INLINE PacketXcd pmadd(const PacketXcd& x, const PacketMul2Xd& y, const PacketXcd& c) const { + return padd(c, this->pmul(x, y)); + } + EIGEN_STRONG_INLINE PacketXcd pmul(const PacketXcd& x, const PacketMul2Xd& y) const { + return PacketXcd(Eigen::internal::pmul(pcast(x), y)); + } +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_COMPLEX_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..212df434f3a54b5e2c20ca5edf3837cd98d8836e --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/GeneralBlockPanelKernel.h @@ -0,0 +1,491 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#define EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* real ************************************/ + +template <> +struct gebp_traits + : gebp_traits { + typedef float RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f32m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f32m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +template <> +struct gebp_traits + : gebp_traits { + typedef double RhsPacket; + typedef QuadPacket RhsPacketx4; + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f64m1(a, b, c, unpacket_traits::size); + } + + 
template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f64m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#if defined(EIGEN_VECTORIZE_RVV10FP16) + +template <> +struct gebp_traits + : gebp_traits { + typedef half RhsPacket; + typedef PacketXh LhsPacket; + typedef PacketXh AccPacket; + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = pset1(*b); } + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = pload(b); } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, + const FixedInt<0>&) const { + c = __riscv_vfmadd_vf_f16m1(a, b, c, unpacket_traits::size); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, + const LaneIdType& lane) const { + c = __riscv_vfmadd_vf_f16m1(a, b.get(lane), c, unpacket_traits::size); + } +}; + +#endif + +/********************************* complex ************************************/ + +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type name##Packet##postfix + +#define RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type ScalarPacket + +template +struct gebp_traits, std::complex, ConjLhs_, ConjRhs_, Architecture::RVV10, + PacketSize_> : gebp_traits, std::complex, ConjLhs_, ConjRhs_, + Architecture::Generic, PacketSize_> { + typedef std::complex Scalar; + typedef std::complex LhsScalar; + typedef std::complex RhsScalar; + typedef std::complex ResScalar; + typedef typename packet_traits>::type RealPacket; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + RISCV_COMPLEX_PACKET_DECL_COND_SCALAR(PacketSize_); +#undef RISCV_COMPLEX_PACKET_DECL_COND_SCALAR + + enum { + ConjLhs = ConjLhs_, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RealPacketSize = Vectorizable ? 
unpacket_traits::size : 1, + + nr = 4, + mr = ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef DoublePacket DoublePacketType; + + typedef std::conditional_t LhsPacket4Packing; + typedef std::conditional_t LhsPacket; + typedef std::conditional_t, Scalar> RhsPacket; + typedef std::conditional_t ResPacket; + typedef std::conditional_t AccPacket; + + typedef QuadPacket RhsPacketx4; + + EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } + + EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) { + p.first = pset1(RealScalar(0)); + p.second = pset1(RealScalar(0)); + } + + // Scalar path + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1(*b); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const { + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + loadRhs(b, dest.B_0); + loadRhs(b + 1, dest.B1); + loadRhs(b + 2, dest.B2); + loadRhs(b + 3, dest.B3); + } + + // Scalar path + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); } + + // Vectorized path + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); } + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacket& dest) const { + loadQuadToDoublePacket(b, dest); + } + + // nothing special here + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { + dest = pload((const typename unpacket_traits::type*)(a)); + } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)(a)); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v4 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v4); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v4 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v4); + } + + template + EIGEN_STRONG_INLINE std::enable_if_t::value> madd(const LhsPacketType& a, + const RhsPacketType& b, + DoublePacket& c, + TmpType& /*tmp*/, + const LaneIdType&) const { + c.first = pmadd_scalar(a, b.first, c.first); + c.second = pmadd_scalar(a, b.second, c.second); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, + const LaneIdType&) const { + c = cj.pmadd(a, b, c); + } + + protected: + conj_helper cj; +}; + +#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size) \ + typedef typename packet_conditional< \ + packet_size, typename packet_traits::type, typename packet_traits::half, \ + typename 
unpacket_traits::half>::half>::type ScalarPacket##postfix + +template +class gebp_traits, false, ConjRhs_, Architecture::RVV10, PacketSize_> + : public gebp_traits, false, ConjRhs_, Architecture::Generic, PacketSize_> { + public: + typedef std::complex Scalar; + typedef RealScalar LhsScalar; + typedef Scalar RhsScalar; + typedef Scalar ResScalar; + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_); + PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_); +#undef PACKET_DECL_COND_SCALAR_POSTFIX + + enum { + ConjLhs = false, + ConjRhs = ConjRhs_, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + + NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, + // FIXME: should depend on NumberOfRegisters + nr = 4, + mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize, + + LhsProgress = ResPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + typedef QuadPacket RhsPacketx4; + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu((const typename unpacket_traits::type*)a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXf& a, std::complex b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a, b.real(), c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXd& a, std::complex b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a, b.real(), c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a, b.imag(), c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c 
+= a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(alpha, c, r); + } +}; + +template +class gebp_traits, RealScalar, ConjLhs_, false, Architecture::RVV10, PacketSize_> + : public gebp_traits, ConjLhs_, false, Architecture::Generic, PacketSize_> { + public: + typedef std::complex LhsScalar; + typedef RealScalar RhsScalar; + typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; + + PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_); + PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_); +#undef PACKET_DECL_COND_POSTFIX + + enum { + ConjLhs = ConjLhs_, + ConjRhs = false, + Vectorizable = unpacket_traits::vectorizable && unpacket_traits::vectorizable, + LhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, + ResPacketSize = Vectorizable ? unpacket_traits::size : 1, + + nr = 4, + mr = 3 * LhsPacketSize, + + LhsProgress = LhsPacketSize, + RhsProgress = 1 + }; + + typedef std::conditional_t LhsPacket; + typedef RhsScalar RhsPacket; + typedef std::conditional_t ResPacket; + typedef LhsPacket LhsPacket4Packing; + + typedef QuadPacket RhsPacketx4; + + typedef ResPacket AccPacket; + + EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } + + template + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { + pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + } + + template + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const { + loadRhs(b, dest); + } + + EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} + + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { + loadRhsQuad_impl(b, dest, std::conditional_t()); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const { + // FIXME we can do better! 
+ // what we want here is a ploadheight + RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]}; + dest = ploadquad(tmp); + } + + EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const { + eigen_internal_assert(RhsPacketSize <= 8); + dest = pset1(*b); + } + + EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload(a); } + + template + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const { + dest = ploadu(a); + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, + const LaneIdType&) const { + madd_impl(a, b, c, tmp, std::conditional_t()); + } + + EIGEN_STRONG_INLINE PacketXcf pmadd_scalar(const PacketXcf& a, float b, const PacketXcf& c) const { + PacketXf v1 = __riscv_vfmadd_vf_f32m1(a.real, b, c.real, unpacket_traits::size); + PacketXf v3 = __riscv_vfmadd_vf_f32m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcf(v1, v3); + } + + EIGEN_STRONG_INLINE PacketXcd pmadd_scalar(const PacketXcd& a, double b, const PacketXcd& c) const { + PacketXd v1 = __riscv_vfmadd_vf_f64m1(a.real, b, c.real, unpacket_traits::size); + PacketXd v3 = __riscv_vfmadd_vf_f64m1(a.imag, b, c.imag, unpacket_traits::size); + return PacketXcd(v1, v3); + } + + template + EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, + RhsPacketType& tmp, const true_type&) const { + EIGEN_UNUSED_VARIABLE(tmp); + c = pmadd_scalar(a, b, c); + } + + EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, + const false_type&) const { + c += a * b; + } + + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, + const LaneIdType& lane) const { + madd(a, b.get(lane), c, tmp, lane); + } + + template + EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const { + conj_helper cj; + r = cj.pmadd(c, alpha, r); + } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_RVV10_GENERAL_BLOCK_KERNEL_H diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..a77496540a8d974ef97195a550a4916a31eead20 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h @@ -0,0 +1,30 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#ifndef EIGEN_MATH_FUNCTIONS_RVV10_H +#define EIGEN_MATH_FUNCTIONS_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketXf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul2Xf) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PacketMul4Xf) + +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketXd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul2Xd) +EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PacketMul4Xd) + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_MATH_FUNCTIONS_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h new file mode 100644 index 0000000000000000000000000000000000000000..4daa0084d4e3bfee897f4830755727c82d93f3ba --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMath.h @@ -0,0 +1,5177 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_RVV10_H +#define EIGEN_PACKET_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { +#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 +#endif + +#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif + +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 + +template +struct rvv_packet_size_selector { + enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) }; +}; + +template +struct rvv_packet_alignment_selector { + enum { + alignment = + (VectorLength * VectorLMul) >= 1024 + ? Aligned128 + : ((VectorLength * VectorLMul) >= 512 ? Aligned64 + : ((VectorLength * VectorLMul) >= 256 ? 
Aligned32 : Aligned16)) + }; +}; + +typedef vbool64_t PacketMask64; +typedef vbool32_t PacketMask32; +typedef vbool16_t PacketMask16; +typedef vbool8_t PacketMask8; +typedef vbool4_t PacketMask4; + +/********************************* int32 **************************************/ +typedef vint32m1_t PacketXi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint32m1_t PacketXu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +typedef vint32m2_t PacketMul2Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint32m2_t PacketMul2Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +typedef vint32m4_t PacketMul4Xi __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint32m4_t PacketMul4Xu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXi type; + typedef PacketXi half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xi type; + typedef PacketXi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xi type; + typedef PacketMul2Xi half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketXi half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int32_t type; + typedef PacketMul2Xi half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXi 
pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi plset(const numext::int32_t& a) { + PacketXi idx = __riscv_vid_v_i32m1(unpacket_traits::size); + return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pzero(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi padd(const PacketXi& a, const PacketXi& b) { + return __riscv_vadd_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi psub(const PacketXi& a, const PacketXi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmul(const PacketXi& a, const PacketXi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pdiv(const PacketXi& a, const PacketXi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pnmsub(const PacketXi& a, const PacketXi& b, const PacketXi& c) { + return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmin(const PacketXi& a, const PacketXi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pmax(const PacketXi& a, const PacketXi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_le(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_lt(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcmp_eq(const PacketXi& a, const PacketXi& b) { + PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ptrue(const PacketXi& /*a*/) { + return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pand(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi por(const PacketXi& a, const PacketXi& b) { + return __riscv_vor_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pxor(const PacketXi& 
a, const PacketXi& b) { + return __riscv_vxor_vv_i32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pandnot(const PacketXi& a, const PacketXi& b) { + return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) { + return __riscv_vsra_vx_i32m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) { + return __riscv_vreinterpret_i32m1( + __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) { + return __riscv_vsll_vx_i32m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploaddup(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi ploadquad(const numext::int32_t* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketXi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXi pgather(const numext::int32_t* from, Index stride) { + return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketXi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketXi& a) { + return __riscv_vmv_x_s_i32m1_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) { + PacketXi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t 
predux_mul(const PacketXi& a) { + // Multiply the vector by its reverse + PacketXi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits::size); + PacketXi half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketXi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi plset(const numext::int32_t& a) { + PacketMul4Xi idx = __riscv_vid_v_i32m4(unpacket_traits::size); + return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pzero(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi padd(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi psub(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnegate(const PacketMul4Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pconj(const PacketMul4Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmul(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pdiv(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); 
+} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnmadd(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pnmsub(const PacketMul4Xi& a, const PacketMul4Xi& b, const PacketMul4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmin(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pmax(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_le(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_lt(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcmp_eq(const PacketMul4Xi& a, const PacketMul4Xi& b) { + PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ptrue(const PacketMul4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pand(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi por(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pxor(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pandnot(const PacketMul4Xi& a, const PacketMul4Xi& b) { + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi parithmetic_shift_right(PacketMul4Xi a) { + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_right(PacketMul4Xi a) { + return __riscv_vreinterpret_i32m4( + __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xi plogical_shift_left(PacketMul4Xi a) { + return __riscv_vsll_vx_i32m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploaddup(const numext::int32_t* from) { + 
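+  // ploaddup replicates each source element twice, i.e. the result holds
+  // from[0], from[0], from[1], from[1], ... The duplicated pattern is produced by the
+  // ordered indexed gather (vloxei32) built from vid below.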
PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi ploadquad(const numext::int32_t* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul4Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul4Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul4Xi& a) { + return __riscv_vmv_x_s_i32m4_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi preverse(const PacketMul4Xi& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pabs(const PacketMul4Xi& a) { + PacketMul4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul4Xi& a) { + PacketXi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), + unpacket_traits::size); + PacketXi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul4Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), 
kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul2Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi plset(const numext::int32_t& a) { + PacketMul2Xi idx = __riscv_vid_v_i32m2(unpacket_traits::size); + return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pzero(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi padd(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi psub(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnegate(const PacketMul2Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pconj(const PacketMul2Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmul(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pdiv(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmadd(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pnmsub(const PacketMul2Xi& a, const PacketMul2Xi& b, const PacketMul2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmin(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pmax(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_le(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_lt(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcmp_eq(const PacketMul2Xi& a, const PacketMul2Xi& b) { + PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ptrue(const PacketMul2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pand(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi por(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pxor(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pandnot(const PacketMul2Xi& a, const PacketMul2Xi& b) { + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi parithmetic_shift_right(PacketMul2Xi a) { + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_right(PacketMul2Xi a) { + return __riscv_vreinterpret_i32m2( + __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xi plogical_shift_left(PacketMul2Xi a) { + return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploaddup(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
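+  // vloxei32 interprets the index vector as byte offsets from `from`, so the lane
+  // indices above are scaled to 0, 0, 4, 4, 8, 8, ... for 4-byte int32 elements.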
+ return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi ploadquad(const numext::int32_t* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const PacketMul2Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const PacketMul2Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const PacketMul2Xi& a) { + return __riscv_vmv_x_s_i32m2_i32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi preverse(const PacketMul2Xi& a) { + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pabs(const PacketMul2Xi& a) { + PacketMul2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const PacketMul2Xi& a) { + return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const PacketMul2Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xi>::type + predux_half_dowto4(const PacketMul4Xi& a) { + return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), + unpacket_traits::size); +} + 
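+// predux_half_dowto4 folds a packet into one of half the LMUL by adding its low and high
+// halves; reductions can apply it repeatedly to narrow the accumulator.
+// Illustrative sketch with made-up values, assuming an m4 packet of 8 lanes:
+//   PacketMul4Xi v = ploadu<PacketMul4Xi>(data);    // {1, 2, 3, 4, 5, 6, 7, 8}
+//   PacketMul2Xi h = predux_half_dowto4(v);         // {1+5, 2+6, 3+7, 4+8}
+//   numext::int32_t sum = predux<PacketMul2Xi>(h);  // 36
+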
+template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXi>::type + predux_half_dowto4(const PacketMul2Xi& a) { + return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size); +} + +/********************************* float32 ************************************/ + +typedef vfloat32m1_t PacketXf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat32m2_t PacketMul2Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vfloat32m4_t PacketMul4Xf __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xf type; + typedef PacketXf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xf type; + typedef PacketMul2Xf half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = EIGEN_FAST_MATH + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; // Half not yet implemented + typedef PacketXi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketXf half; + typedef PacketMul2Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef float type; + typedef PacketMul2Xf 
half; + typedef PacketMul4Xi integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask8 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXf ptrue(const PacketXf& /*a*/) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pzero(const PacketXf& /*a*/) { + return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) { + return __riscv_vfabs_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf plset(const float& a) { + PacketXf idx = + __riscv_vfcvt_f_x_v_f32m1(__riscv_vid_v_i32m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf padd(const PacketXf& a, const PacketXf& b) { + return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psub(const PacketXf& a, const PacketXf& b) { + return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) { + return __riscv_vfneg_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmul(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pdiv(const PacketXf& a, const PacketXf& b) { + return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pnmsub(const PacketXf& a, const PacketXf& b, const PacketXf& c) { + return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const 
PacketXf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmin(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + PacketXf nans = __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pmax(const PacketXf& a, const PacketXf& b) { + return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_le(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_eq(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan(const PacketXf& a, const PacketXf& b) { + PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXf pand(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf por(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pxor(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pandnot(const PacketXf& a, const PacketXf& b) { + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketXf ploaddup(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf ploadquad(const float* from) { + PacketXu idx = __riscv_vid_v_u32m1(unpacket_traits::size); + idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketXf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketXf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketXf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketXf& a) { + return __riscv_vfmv_f_s_f32m1_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXf psqrt(const PacketXf& a) { + return __riscv_vfsqrt_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf print(const PacketXf& a) { + const PacketXf limit = pset1(static_cast(1 << 23)); + const PacketXf abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits::size); + const PacketXf x = __riscv_vfadd_vv_f32m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits::size); + PacketXf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfloor(const PacketXf& a) { + PacketXf tmp = print(a); + // If greater, subtract one. 
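+  // (print rounds to nearest, so wherever the rounded value ended up above a, the masked
+  // subtract below pulls it back down by one to give the floor.)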
+ PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m1_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) { + PacketXu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf pfrexp(const PacketXf& a, PacketXf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketXf& a) { + // Multiply the vector by its reverse + PacketXf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits::size); + PacketXf half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits::size); + prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. 
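+  // EIGEN_RISCV64_RVV_VL is a compile-time constant, so the guards above fold away and only
+  // the slide-down/multiply steps needed for the configured vector width remain.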
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketXf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketXf pldexp(const PacketXf& a, const PacketXf& exponent) { + return pldexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmor_mm_b32(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) { + return __riscv_vmand_mm_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const PacketXf& a, const PacketXf& b) { + return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXf pselect(const PacketMask32& mask, const PacketXf& a, const PacketXf& b) { + return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits::size); +} + +/********************************* PacketMul4Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ptrue(const PacketMul4Xf& /*a*/) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pzero(const PacketMul4Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pabs(const PacketMul4Xf& a) { + return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf plset(const float& a) { + PacketMul4Xf idx = __riscv_vfcvt_f_x_v_f32m4(__riscv_vid_v_i32m4(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf padd(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psub(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnegate(const PacketMul4Xf& a) { + return __riscv_vfneg_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pconj(const 
PacketMul4Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmul(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pdiv(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnmadd(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pnmsub(const PacketMul4Xf& a, const PacketMul4Xf& b, const PacketMul4Xf& c) { + return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMul4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmin(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMul4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pmax(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_le(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_eq(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcmp_lt_or_nan(const PacketMul4Xf& a, const PacketMul4Xf& b) { + PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pand(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf por(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pxor(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pandnot(const PacketMul4Xf& a, const PacketMul4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploaddup(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf ploadquad(const float* from) { + PacketMul4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul4Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul4Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul4Xf& a) { + return __riscv_vfmv_f_s_f32m4_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf psqrt(const PacketMul4Xf& a) { + return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf print(const PacketMul4Xf& a) 
{ + const PacketMul4Xf limit = pset1(static_cast(1 << 23)); + const PacketMul4Xf abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); + const PacketMul4Xf x = __riscv_vfadd_vv_f32m4_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); + PacketMul4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfloor(const PacketMul4Xf& a) { + PacketMul4Xf tmp = print(a); + // If greater, subtract one. + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m4_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preverse(const PacketMul4Xf& a) { + PacketMul4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pfrexp(const PacketMul4Xf& a, PacketMul4Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul4Xf& a) { + PacketXf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), + unpacket_traits::size); + PacketXf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pldexp(const PacketMul4Xf& a, const PacketMul4Xf& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul2Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ptrue(const PacketMul2Xf& /*a*/) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pzero(const PacketMul2Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul2Xf pabs(const PacketMul2Xf& a) { + return __riscv_vfabs_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf plset(const float& a) { + PacketMul2Xf idx = __riscv_vfcvt_f_x_v_f32m2(__riscv_vid_v_i32m2(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf padd(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psub(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnegate(const PacketMul2Xf& a) { + return __riscv_vfneg_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pconj(const PacketMul2Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmul(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pdiv(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnmadd(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pnmsub(const PacketMul2Xf& a, const PacketMul2Xf& b, const PacketMul2Xf& c) { + return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmin(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMul2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, 
unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pmax(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_le(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_eq(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcmp_lt_or_nan(const PacketMul2Xf& a, const PacketMul2Xf& b) { + PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pand(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf por(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pxor(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pandnot(const PacketMul2Xf& a, const PacketMul2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploaddup(const float* from) { + PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf ploadquad(const float* from) { + 
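+  // ploadquad repeats each source element four times: from[0] x4, from[1] x4, ...
+  // Clearing the two low bits of the lane index gives byte offsets 0, 0, 0, 0, 4, 4, 4, 4, ...
+  // for the ordered indexed gather below (float elements are 4 bytes wide).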
PacketMul2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const PacketMul2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const PacketMul2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const PacketMul2Xf& a) { + return __riscv_vfmv_f_s_f32m2_f32(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf psqrt(const PacketMul2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf print(const PacketMul2Xf& a) { + const PacketMul2Xf limit = pset1(static_cast(1 << 23)); + const PacketMul2Xf abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const PacketMul2Xf x = __riscv_vfadd_vv_f32m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + PacketMul2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfloor(const PacketMul2Xf& a) { + PacketMul2Xf tmp = print(a); + // If greater, subtract one. 
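+  // The masked (_tum) subtract below only touches lanes where a < print(a); all other lanes keep tmp unchanged.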
+ PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tum(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preverse(const PacketMul2Xf& a) { + PacketMul2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pfrexp(const PacketMul2Xf& a, PacketMul2Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const PacketMul2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const PacketMul2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pldexp(const PacketMul2Xf& a, const PacketMul2Xf& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xf>::type + predux_half_dowto4(const PacketMul4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXf>::type + predux_half_dowto4(const PacketMul2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size); +} + +/********************************* int64 **************************************/ + +typedef vint64m1_t PacketXl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint64m1_t PacketXul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +typedef vint64m2_t PacketMul2Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint64m2_t PacketMul2Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +typedef vint64m4_t PacketMul4Xl __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint64m4_t PacketMul4Xul __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXl type; + typedef PacketXl half; // Half not implemented yet + enum { + Vectorizable = 1, + 
AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xl type; + typedef PacketXl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xl type; + typedef PacketMul2Xl half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketXl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int64_t type; + typedef PacketMul2Xl half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl plset(const numext::int64_t& a) { + PacketXl idx = __riscv_vid_v_i64m1(unpacket_traits::size); + return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pzero(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl padd(const PacketXl& a, const PacketXl& b) { + return __riscv_vadd_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl psub(const PacketXl& a, const PacketXl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnegate(const PacketXl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pconj(const PacketXl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmul(const PacketXl& a, const PacketXl& b) 
{ + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pdiv(const PacketXl& a, const PacketXl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmadd(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pnmsub(const PacketXl& a, const PacketXl& b, const PacketXl& c) { + return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmin(const PacketXl& a, const PacketXl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pmax(const PacketXl& a, const PacketXl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_le(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_lt(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcmp_eq(const PacketXl& a, const PacketXl& b) { + PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ptrue(const PacketXl& /*a*/) { + return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pand(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl por(const PacketXl& a, const PacketXl& b) { + return __riscv_vor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pxor(const PacketXl& a, const PacketXl& b) { + return __riscv_vxor_vv_i64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pandnot(const PacketXl& a, const PacketXl& b) { + return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl parithmetic_shift_right(PacketXl a) { + return __riscv_vsra_vx_i64m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_right(PacketXl a) { + return __riscv_vreinterpret_i64m1( + __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXl plogical_shift_left(PacketXl a) { + return __riscv_vsll_vx_i64m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploadu(const 
numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploaddup(const numext::int64_t* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl ploadquad(const numext::int64_t* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + ; + return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketXl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketXl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXl pgather(const numext::int64_t* from, Index stride) { + return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketXl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketXl& a) { + return __riscv_vmv_x_s_i64m1_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXl preverse(const PacketXl& a) { + PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pabs(const PacketXl& a) { + PacketXl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketXl& a) { + // Multiply the vector by its reverse + PacketXl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits::size); + PacketXl half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits::size); + } + + // The reduction is done to the first element. 
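+  // With VLEN-bit registers there are VLEN/64 int64 lanes, so after the reverse multiply
+  // only log2(lanes) - 1 further halving steps are needed before reading lane 0.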
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketXl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl plset(const numext::int64_t& a) { + PacketMul4Xl idx = __riscv_vid_v_i64m4(unpacket_traits::size); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pzero(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl padd(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl psub(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnegate(const PacketMul4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pconj(const PacketMul4Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmul(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pdiv(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmadd(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pnmsub(const PacketMul4Xl& a, const PacketMul4Xl& b, const PacketMul4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmin(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pmax(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
PacketMul4Xl pcmp_le(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_lt(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcmp_eq(const PacketMul4Xl& a, const PacketMul4Xl& b) { + PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ptrue(const PacketMul4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pand(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl por(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pxor(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pandnot(const PacketMul4Xl& a, const PacketMul4Xl& b) { + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl parithmetic_shift_right(PacketMul4Xl a) { + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_right(PacketMul4Xl a) { + return __riscv_vreinterpret_i64m4( + __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xl plogical_shift_left(PacketMul4Xl a) { + return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploaddup(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
+ return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl ploadquad(const numext::int64_t* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul4Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul4Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul4Xl& a) { + return __riscv_vmv_x_s_i64m4_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preverse(const PacketMul4Xl& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pabs(const PacketMul4Xl& a) { + PacketMul4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul4Xl& a) { + PacketXl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), + unpacket_traits::size); + PacketXl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul4Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + 
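+// Note: RVV boolean types are named by the SEW/LMUL ratio, so for 64-bit lanes the
+// comparisons use PacketMask64 at LMUL=1, PacketMask32 at LMUL=2 and PacketMask16 at
+// LMUL=4 (presumably fixed-length aliases of vbool64_t/vbool32_t/vbool16_t defined
+// earlier in this header); apart from the mask width, the m1/m2/m4 blocks mirror each other.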
+/********************************* PacketMul2Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl plset(const numext::int64_t& a) { + PacketMul2Xl idx = __riscv_vid_v_i64m2(unpacket_traits::size); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pzero(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl padd(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl psub(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnegate(const PacketMul2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pconj(const PacketMul2Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmul(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pdiv(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmadd(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pnmsub(const PacketMul2Xl& a, const PacketMul2Xl& b, const PacketMul2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmin(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pmax(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_le(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_lt(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcmp_eq(const PacketMul2Xl& a, const PacketMul2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ptrue(const PacketMul2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pand(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl por(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pxor(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pandnot(const PacketMul2Xl& a, const PacketMul2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl parithmetic_shift_right(PacketMul2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_right(PacketMul2Xl a) { + return __riscv_vreinterpret_i64m2( + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xl plogical_shift_left(PacketMul2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploaddup(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... 
+ return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl ploadquad(const numext::int64_t* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const PacketMul2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const PacketMul2Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const PacketMul2Xl& a) { + return __riscv_vmv_x_s_i64m2_i64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preverse(const PacketMul2Xl& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pabs(const PacketMul2Xl& a) { + PacketMul2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const PacketMul2Xl& a) { + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const PacketMul2Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xl>::type + predux_half_dowto4(const PacketMul4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), 
__riscv_vget_v_i64m4_i64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXl>::type + predux_half_dowto4(const PacketMul2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size); +} + +/********************************* double ************************************/ + +typedef vfloat64m1_t PacketXd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat64m2_t PacketMul2Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vfloat64m4_t PacketMul4Xd __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xd type; + typedef PacketXd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xd type; + typedef PacketMul2Xd half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasLog = 1, + HasExp = 1, + HasSqrt = 1 + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; // Half not yet implemented + typedef PacketXl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask64 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketXd half; + typedef PacketMul2Xl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask32 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef double type; + typedef PacketMul2Xd half; + typedef PacketMul4Xl integer_packet; + typedef numext::uint8_t mask_t; + typedef PacketMask16 packet_mask; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = 
false, + masked_store_available = false + }; +}; + +/********************************* PacketXd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXd ptrue(const PacketXd& /*a*/) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pzero(const PacketXd& /*a*/) { + return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pabs(const PacketXd& a) { + return __riscv_vfabs_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd plset(const double& a) { + PacketXd idx = + __riscv_vfcvt_f_x_v_f64m1(__riscv_vid_v_i64m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd padd(const PacketXd& a, const PacketXd& b) { + return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd psub(const PacketXd& a, const PacketXd& b) { + return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnegate(const PacketXd& a) { + return __riscv_vfneg_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pconj(const PacketXd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmul(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pdiv(const PacketXd& a, const PacketXd& b) { + return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmadd(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pnmsub(const PacketXd& a, const PacketXd& b, const PacketXd& c) { + return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmin(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, 
const PacketXd& b) { + PacketXd nans = __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits::size); + PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pmax(const PacketXd& a, const PacketXd& b) { + return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_le(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_eq(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pcmp_lt_or_nan(const PacketXd& a, const PacketXd& b) { + PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXd pand(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd por(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pxor(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pandnot(const PacketXd& a, const PacketXd& b) { + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploaddup(const double* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return 
__riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd ploadquad(const double* from) { + PacketXul idx = __riscv_vid_v_u64m1(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + ; + return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketXd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketXd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketXd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketXd& a) { + return __riscv_vfmv_f_s_f64m1_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXd psqrt(const PacketXd& a) { + return __riscv_vfsqrt_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd print(const PacketXd& a) { + const PacketXd limit = pset1(static_cast(1ull << 52)); + const PacketXd abs_a = pabs(a); + + PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits::size); + const PacketXd x = __riscv_vfadd_vv_f64m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits::size); + PacketXd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd pfloor(const PacketXd& a) { + PacketXd tmp = print(a); + // If greater, subtract one. 
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits::size);
+  return __riscv_vfsub_vf_f64m1_tum(mask, tmp, tmp, 1.0, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd preverse(const PacketXd& a) {
+  PacketXul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits::size),
+                                         unpacket_traits::size - 1, unpacket_traits::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pfrexp(const PacketXd& a, PacketXd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size), unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const PacketXd& a) {
+  // Multiply the vector by its reverse
+  PacketXd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits::size);
+  PacketXd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const PacketXd& a) {
+  return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size),
+      unpacket_traits::size));
+}
+
+template
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) {
+  double buffer[unpacket_traits::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits::size], unpacket_traits::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXd pldexp(const PacketXd& a, const PacketXd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
+  // a & ~b
+  return __riscv_vmandn_mm_b64(a, b, unpacket_traits::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const PacketXd& a, const PacketXd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const PacketXd& a, const PacketXd& b) {
+  return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size);
+}
+
+EIGEN_STRONG_INLINE PacketXd pselect(const
PacketMask64& mask, const PacketXd& a, const PacketXd& b) { + return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); +} + +/********************************* PacketMul4Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ptrue(const PacketMul4Xd& /*a*/) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pzero(const PacketMul4Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pabs(const PacketMul4Xd& a) { + return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd plset(const double& a) { + PacketMul4Xd idx = __riscv_vfcvt_f_x_v_f64m4(__riscv_vid_v_i64m4(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd padd(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psub(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnegate(const PacketMul4Xd& a) { + return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pconj(const PacketMul4Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmul(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pdiv(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnmadd(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pnmsub(const PacketMul4Xd& a, const PacketMul4Xd& b, const PacketMul4Xd& c) { + return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMul4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const 
PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmin(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMul4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m4_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pmax(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_le(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_eq(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcmp_lt_or_nan(const PacketMul4Xd& a, const PacketMul4Xd& b) { + PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pand(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd por(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pxor(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pandnot(const PacketMul4Xd& a, const PacketMul4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE PacketMul4Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploaddup(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd ploadquad(const double* from) { + PacketMul4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul4Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul4Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul4Xd& a) { + return __riscv_vfmv_f_s_f64m4_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd psqrt(const PacketMul4Xd& a) { + return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd print(const PacketMul4Xd& a) { + const PacketMul4Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul4Xd abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); + const PacketMul4Xd x = __riscv_vfadd_vv_f64m4_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); + PacketMul4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfloor(const PacketMul4Xd& a) { + PacketMul4Xd tmp = print(a); + // If greater, subtract one. 
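+  // print() rounds to the nearest integer, so only the lanes where the
+  // rounded value ended up above the input (a < tmp) need the correction;
+  // the masked vfsub applies it to just those lanes.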
+ PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m4_tum(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preverse(const PacketMul4Xd& a) { + PacketMul4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pfrexp(const PacketMul4Xd& a, PacketMul4Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul4Xd& a) { + PacketXd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), + unpacket_traits::size); + PacketXd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pldexp(const PacketMul4Xd& a, const PacketMul4Xd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* PacketMul2Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ptrue(const PacketMul2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pzero(const PacketMul2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pabs(const PacketMul2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd plset(const double& a) { + PacketMul2Xd idx = __riscv_vfcvt_f_x_v_f64m2(__riscv_vid_v_i64m2(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd padd(const PacketMul2Xd& a, const 
PacketMul2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psub(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnegate(const PacketMul2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pconj(const PacketMul2Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmul(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pdiv(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmadd(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pnmsub(const PacketMul2Xd& a, const PacketMul2Xd& b, const PacketMul2Xd& c) { + return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmin(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMul2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pmax(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_le(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd 
pcmp_lt(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_eq(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcmp_lt_or_nan(const PacketMul2Xd& a, const PacketMul2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pand(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd por(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pxor(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pandnot(const PacketMul2Xd& a, const PacketMul2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploaddup(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd ploadquad(const double* from) { + PacketMul2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const PacketMul2Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), 
unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const PacketMul2Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const PacketMul2Xd& a) { + return __riscv_vfmv_f_s_f64m2_f64(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd psqrt(const PacketMul2Xd& a) { + return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd print(const PacketMul2Xd& a) { + const PacketMul2Xd limit = pset1(static_cast(1ull << 52)); + const PacketMul2Xd abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); + const PacketMul2Xd x = __riscv_vfadd_vv_f64m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); + PacketMul2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfloor(const PacketMul2Xd& a) { + PacketMul2Xd tmp = print(a); + // If greater, subtract one. + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m2_tum(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preverse(const PacketMul2Xd& a) { + PacketMul2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pfrexp(const PacketMul2Xd& a, PacketMul2Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const PacketMul2Xd& a) { + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const PacketMul2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pldexp(const PacketMul2Xd& a, const PacketMul2Xd& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, 
+ PacketMul2Xd>::type + predux_half_dowto4(const PacketMul4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXd>::type + predux_half_dowto4(const PacketMul2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size); +} + +/********************************* short **************************************/ + +typedef vint16m1_t PacketXs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vuint16m1_t PacketXsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); + +typedef vint16m2_t PacketMul2Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); +typedef vuint16m2_t PacketMul2Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +typedef vint16m4_t PacketMul4Xs __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); +typedef vuint16m4_t PacketMul4Xsu __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXs type; + typedef PacketXs half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xs type; + typedef PacketXs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul4Xs type; + typedef PacketMul2Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketXs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef PacketMul2Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> 
+EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* PacketXs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs plset(const numext::int16_t& a) { + PacketXs idx = __riscv_vid_v_i16m1(unpacket_traits::size); + return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pzero(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs padd(const PacketXs& a, const PacketXs& b) { + return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs psub(const PacketXs& a, const PacketXs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnegate(const PacketXs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pconj(const PacketXs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmul(const PacketXs& a, const PacketXs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pdiv(const PacketXs& a, const PacketXs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmadd(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pnmsub(const PacketXs& a, const PacketXs& b, const PacketXs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmin(const PacketXs& a, const PacketXs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pmax(const PacketXs& a, const PacketXs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_le(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_lt(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcmp_eq(const PacketXs& a, const PacketXs& b) { + PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ptrue(const PacketXs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pand(const PacketXs& a, const PacketXs& b) { + 
return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs por(const PacketXs& a, const PacketXs& b) { + return __riscv_vor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pxor(const PacketXs& a, const PacketXs& b) { + return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pandnot(const PacketXs& a, const PacketXs& b) { + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs parithmetic_shift_right(PacketXs a) { + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_right(PacketXs a) { + return __riscv_vreinterpret_i16m1( + __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketXs plogical_shift_left(PacketXs a) { + return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploaddup(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs ploadquad(const numext::int16_t* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketXs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketXs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketXs& a) { + return __riscv_vmv_x_s_i16m1_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXs preverse(const PacketXs& a) { + PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pabs(const PacketXs& a) { + PacketXs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + 
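+// Horizontal reductions for int16: predux/predux_min/predux_max use the
+// single-width RVV reduction instructions (vredsum/vredmin/vredmax), which
+// fold the whole vector into element 0 of an m1 register. RVV 1.0 has no
+// multiply reduction, so predux_mul below builds the product with a
+// log2(vl) slide-down/multiply tree instead.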
+template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketXs& a) { + // Multiply the vector by its reverse + PacketXs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); + PacketXs half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketXs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul4Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs plset(const numext::int16_t& a) { + PacketMul4Xs idx = __riscv_vid_v_i16m4(unpacket_traits::size); + return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pzero(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs padd(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs psub(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnegate(const PacketMul4Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pconj(const PacketMul4Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmul(const PacketMul4Xs& a, 
const PacketMul4Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pdiv(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmadd(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pnmsub(const PacketMul4Xs& a, const PacketMul4Xs& b, const PacketMul4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmin(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pmax(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_le(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_lt(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcmp_eq(const PacketMul4Xs& a, const PacketMul4Xs& b) { + PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ptrue(const PacketMul4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pand(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs por(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pxor(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vxor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pandnot(const PacketMul4Xs& a, const PacketMul4Xs& b) { + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs parithmetic_shift_right(PacketMul4Xs a) { + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_right(PacketMul4Xs a) { + return __riscv_vreinterpret_i16m4( + __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul4Xs plogical_shift_left(PacketMul4Xs a) { + return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs 
pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploaddup(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs ploadquad(const numext::int16_t* from) { + PacketMul4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul4Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul4Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul4Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul4Xs& a) { + return __riscv_vmv_x_s_i16m4_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs preverse(const PacketMul4Xs& a) { + PacketMul4Xsu idx = + __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pabs(const PacketMul4Xs& a) { + PacketMul4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul4Xs& a) { + PacketXs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), + unpacket_traits::size); + PacketXs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul4Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul4Xs& a) { + return 
__riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* PacketMul2Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs plset(const numext::int16_t& a) { + PacketMul2Xs idx = __riscv_vid_v_i16m2(unpacket_traits::size); + return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pzero(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs padd(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vadd_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs psub(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnegate(const PacketMul2Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pconj(const PacketMul2Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmul(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pdiv(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmadd(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pnmsub(const PacketMul2Xs& a, const PacketMul2Xs& b, const PacketMul2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmin(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pmax(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_le(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_lt(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = 
__riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcmp_eq(const PacketMul2Xs& a, const PacketMul2Xs& b) { + PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), 0xffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ptrue(const PacketMul2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0xffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pand(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs por(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pxor(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vxor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pandnot(const PacketMul2Xs& a, const PacketMul2Xs& b) { + return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs parithmetic_shift_right(PacketMul2Xs a) { + return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_right(PacketMul2Xs a) { + return __riscv_vreinterpret_i16m2( + __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE PacketMul2Xs plogical_shift_left(PacketMul2Xs a) { + return __riscv_vsll_vx_i16m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploaddup(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... 
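+  // vloxei16 takes byte offsets; with sizeof(int16_t) == 2, masking the
+  // element id with ~1 already gives the byte offset of from[i / 2], so no
+  // extra shift is needed here (ploadquad below shifts because it loads
+  // from[i / 4]).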
+ return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs ploadquad(const numext::int16_t* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const PacketMul2Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const PacketMul2Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const PacketMul2Xs& a) { + return __riscv_vmv_x_s_i16m2_i16(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preverse(const PacketMul2Xs& a) { + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pabs(const PacketMul2Xs& a) { + PacketMul2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const PacketMul2Xs& a) { + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const PacketMul2Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketMul2Xs>::type + predux_half_dowto4(const PacketMul4Xs& a) { + return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), 
__riscv_vget_v_i16m4_i16m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXs>::type + predux_half_dowto4(const PacketMul2Xs& a) { + return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h new file mode 100644 index 0000000000000000000000000000000000000000..085952fcd7d108284c4dfbc2a3e9a8721b6c285d --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -0,0 +1,917 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_RVV10_H +#define EIGEN_PACKET_MATH_FP16_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +typedef vfloat16m1_t PacketXh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat16m2_t PacketMul2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +template <> +struct packet_traits : default_packet_traits { + typedef PacketXh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct packet_traits : default_packet_traits { + typedef PacketMul2Xh type; + typedef PacketXh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasFloor = 1, + HasRint = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; // Half not yet implemented + typedef PacketXs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef PacketXh half; + typedef PacketMul2Xs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* 
PacketXh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXh ptrue(const PacketXh& /*a*/) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pzero(const PacketXh& /*a*/) { + return __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pabs(const PacketXh& a) { + return __riscv_vfabs_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh plset(const Eigen::half& a) { + PacketXh idx = + __riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh padd(const PacketXh& a, const PacketXh& b) { + return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psub(const PacketXh& a, const PacketXh& b) { + return __riscv_vfsub_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnegate(const PacketXh& a) { + return __riscv_vfneg_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pconj(const PacketXh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmul(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmul_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pdiv(const PacketXh& a, const PacketXh& b) { + return __riscv_vfdiv_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmin_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + 
__riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmax_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_le(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfle_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_eq(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m1(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXh pand(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh por(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pxor(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pandnot(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), + __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploaddup(const Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, 
unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh ploadquad<PacketXh>(const Eigen::half* from) {
+  PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits<PacketXh>::size);
+  idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits<PacketXh>::size), 1,
+                              unpacket_traits<PacketXh>::size);
+  return __riscv_vloxei16_v_f16m1(reinterpret_cast<const _Float16*>(from), idx, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const PacketXh& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from,
+                                                  unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const PacketXh& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from,
+                                                    unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXh pgather<Eigen::half, PacketXh>(const Eigen::half* from, Index stride) {
+  return __riscv_vlse16_v_f16m1(reinterpret_cast<const _Float16*>(from), stride * sizeof(Eigen::half),
+                                unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<Eigen::half, PacketXh>(Eigen::half* to, const PacketXh& from, Index stride) {
+  __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<PacketXh>(const PacketXh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f_s_f16m1_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh psqrt(const PacketXh& a) {
+  return __riscv_vfsqrt_v_f16m1(a, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh print(const PacketXh& a) {
+  // Half-precision values at or above 2^10 are already integral, so rounding only applies below this limit.
+  const PacketXh limit = pset1<PacketXh>(static_cast<Eigen::half>(1 << 10));
+  const PacketXh abs_a = pabs(a);
+
+  PacketMask16 mask = __riscv_vmfne_vv_f16m1_b16(a, a, unpacket_traits<PacketXh>::size);
+  const PacketXh x = __riscv_vfadd_vv_f16m1_tum(mask, a, a, a, unpacket_traits<PacketXh>::size);
+  const PacketXh new_x = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                                   unpacket_traits<PacketXh>::size);
+
+  mask = __riscv_vmflt_vv_f16m1_b16(abs_a, limit, unpacket_traits<PacketXh>::size);
+  PacketXh signed_x = __riscv_vfsgnj_vv_f16m1(new_x, x, unpacket_traits<PacketXh>::size);
+  return __riscv_vmerge_vvm_f16m1(x, signed_x, mask, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh pfloor(const PacketXh& a) {
+  PacketXh tmp = print(a);
+  // If greater, subtract one.
+  PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, tmp, unpacket_traits<PacketXh>::size);
+  return __riscv_vfsub_vf_f16m1_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) {
+  PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXh>::size),
+                                         unpacket_traits<PacketXh>::size - 1, unpacket_traits<PacketXh>::size);
+  return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<PacketXh>(const PacketXh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1(
+      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<PacketXh>::size),
+      unpacket_traits<PacketXh>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<PacketXh>(const PacketXh& a) {
+  // Multiply the vector by its reverse
+  PacketXh prod = __riscv_vfmul_vv_f16m1(preverse(a), a, unpacket_traits<PacketXh>::size);
+  PacketXh half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  // The reduction is done to the first element.
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE PacketMul2Xf half2float(const PacketXh& a) { + return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXh float2half(const PacketMul2Xf& a) { + return __riscv_vfncvt_f_f_w_f16m1(a, unpacket_traits::size); +} + +/********************************* PacketMul2Xh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ptrue(const PacketMul2Xh& /*a*/) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pzero(const PacketMul2Xh& /*a*/) { + return __riscv_vfmv_v_f_f16m2(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pabs(const PacketMul2Xh& a) { + return __riscv_vfabs_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh plset(const Eigen::half& a) { + PacketMul2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh padd(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh psub(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnegate(const PacketMul2Xh& a) { + return __riscv_vfneg_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pconj(const PacketMul2Xh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmul(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pdiv(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmadd_vv_f16m2(a, b, c, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmadd(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pnmsub(const PacketMul2Xh& a, const PacketMul2Xh& b, const PacketMul2Xh& c) { + return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmin(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMul2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pmax(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_le(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_eq(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcmp_lt_or_nan(const PacketMul2Xh& a, const PacketMul2Xh& b) { + PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m2(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pand(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return 
__riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh por(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pxor(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pandnot(const PacketMul2Xh& a, const PacketMul2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploaddup(const Eigen::half* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh ploadquad(const Eigen::half* from) { + PacketMul2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketMul2Xh& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketMul2Xh& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketMul2Xh pgather(const Eigen::half* from, Index stride) { + return __riscv_vlse16_v_f16m2(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketMul2Xh& from, + Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f_s_f16m2_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh psqrt(const PacketMul2Xh& a) { + return __riscv_vfsqrt_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh print(const PacketMul2Xh& a) { + const PacketMul2Xh limit = pset1(static_cast(1 << 10)); + const PacketMul2Xh abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits::size); + const PacketMul2Xh x = 
__riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits::size); + const PacketMul2Xh new_x = __riscv_vfcvt_f_x_v_f16m2( + __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits::size); + PacketMul2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pfloor(const PacketMul2Xh& a) { + PacketMul2Xh tmp = print(a); + // If greater, subtract one. + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast(1.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh preverse(const PacketMul2Xh& a) { + PacketMul2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const PacketMul2Xh& a) { + return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketMul2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE PacketMul4Xf half2float(const PacketMul2Xh& a) { + return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketMul2Xh float2half(const PacketMul4Xf& a) { + return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE + typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXh>::type + predux_half_dowto4(const PacketMul2Xh& a) { + return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size); +} + +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pcos) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexp) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, pexpm1) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog1p) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, plog2) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, preciprocal) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, prsqrt) 
+F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, psin) +F16_PACKET_FUNCTION(PacketMul2Xf, PacketXh, ptanh) + +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pcos) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexp) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, pexpm1) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog1p) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, plog2) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, preciprocal) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, prsqrt) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, psin) +F16_PACKET_FUNCTION(PacketMul4Xf, PacketMul2Xh, ptanh) + +/********************************* casting ************************************/ + +template <> +struct type_casting_traits<_Float16, numext::int16_t> { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXh pcast(const PacketXs& a) { + return __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXs pcast(const PacketXh& a) { + return __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh preinterpret(const PacketXs& a) { + return __riscv_vreinterpret_v_i16m1_f16m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXs preinterpret(const PacketXh& a) { + return __riscv_vreinterpret_v_f16m1_i16m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketMul2Xs& a) { + return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketMul2Xh& a) { + return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh preinterpret(const PacketMul2Xs& a) { + return __riscv_vreinterpret_v_i16m2_f16m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs preinterpret(const PacketMul2Xh& a) { + return __riscv_vreinterpret_v_f16m2_i16m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXh& a, const PacketXh& b, const PacketXh& c, + const PacketXh& d) { + return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXs& a, const PacketXs& b) { + return __riscv_vcreate_v_f16m1_f16m2(__riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xh pcast(const PacketXh& a, const PacketXh& b) { + return __riscv_vcreate_v_f16m1_f16m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXh& a, const PacketXh& b) { + return __riscv_vcreate_v_i16m1_i16m2(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits::size)); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_FP16_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h new file mode 100644 index 0000000000000000000000000000000000000000..67bc99d0b820de70b04c1b1b45365752bc3282be --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h @@ -0,0 +1,284 
@@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_RVV10_H +#define EIGEN_TYPE_CASTING_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* 32 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXf pcast(const PacketXi& a) { + return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXi pcast(const PacketXf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXf preinterpret(const PacketXi& a) { + return __riscv_vreinterpret_v_i32m1_f32m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXi preinterpret(const PacketXf& a) { + return __riscv_vreinterpret_v_f32m1_i32m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketMul4Xi& a) { + return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketMul4Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf preinterpret(const PacketMul4Xi& a) { + return __riscv_vreinterpret_v_i32m4_f32m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi preinterpret(const PacketMul4Xf& a) { + return __riscv_vreinterpret_v_f32m4_i32m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketMul2Xi& a) { + return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketMul2Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf preinterpret(const PacketMul2Xi& a) { + return __riscv_vreinterpret_v_i32m2_f32m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi preinterpret(const PacketMul2Xf& a) { + return __riscv_vreinterpret_v_f32m2_i32m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { + return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXi& a, const PacketXi& b, const PacketXi& c, + const PacketXi& d) { + return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xf pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { + return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xi pcast(const PacketXf& a, const PacketXf& b, const PacketXf& c, + const PacketXf& d) { + return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + 
__riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXi& a, const PacketXi& b) { + return __riscv_vcreate_v_i32m1_i32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXi& a, const PacketXi& b) { + return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xf pcast(const PacketXf& a, const PacketXf& b) { + return __riscv_vcreate_v_f32m1_f32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xi pcast(const PacketXf& a, const PacketXf& b) { + return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); +} + +/********************************* 64 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE PacketXd pcast(const PacketXl& a) { + return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXl pcast(const PacketXd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXd preinterpret(const PacketXl& a) { + return __riscv_vreinterpret_v_i64m1_f64m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketXl preinterpret(const PacketXd& a) { + return __riscv_vreinterpret_v_f64m1_i64m1(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketMul4Xl& a) { + return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketMul4Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd preinterpret(const PacketMul4Xl& a) { + return __riscv_vreinterpret_v_i64m4_f64m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl preinterpret(const PacketMul4Xd& a) { + return __riscv_vreinterpret_v_f64m4_i64m4(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketMul2Xl& a) { + return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketMul2Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd preinterpret(const PacketMul2Xl& a) { + return __riscv_vreinterpret_v_i64m2_f64m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl preinterpret(const PacketMul2Xd& a) { + return __riscv_vreinterpret_v_f64m2_i64m2(a); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { + return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d); + ; +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXl& a, const PacketXl& b, const PacketXl& c, + const PacketXl& d) { + return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); +} 
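+
+// Note: these n-ary pcast overloads assemble a wider (higher-LMUL) packet from LMUL=1 inputs:
+// each input is converted lane-wise with __riscv_vfcvt_* where the value type changes, and the
+// pieces are then concatenated with __riscv_vcreate_*; no cross-lane shuffling is involved.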
+ +template <> +EIGEN_STRONG_INLINE PacketMul4Xd pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { + return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xl pcast(const PacketXd& a, const PacketXd& b, const PacketXd& c, + const PacketXd& d) { + return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXl& a, const PacketXl& b) { + return __riscv_vcreate_v_i64m1_i64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXl& a, const PacketXl& b) { + return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xd pcast(const PacketXd& a, const PacketXd& b) { + return __riscv_vcreate_v_f64m1_f64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul2Xl pcast(const PacketXd& a, const PacketXd& b) { + return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); +} + +/********************************* 16 bits ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketMul2Xs pcast(const PacketXs& a, const PacketXs& b) { + return __riscv_vcreate_v_i16m1_i16m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketMul4Xs pcast(const PacketXs& a, const PacketXs& b, const PacketXs& c, + const PacketXs& d) { + return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_RVV10_H diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index ba7d97a038dcfe6684358740fea4d252d7193090..326c6eab3c035170e155332bb7abbe0682b4bafb 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -111,7 +111,11 @@ struct squared_norm_functor { } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { +#if defined EIGEN_VECTORIZE_RVV10 + return Packet(pmul(a.real, a.real), pmul(a.imag, a.imag)); +#else return Packet(pmul(a.v, a.v)); +#endif } }; template diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index ba72a8a4fbe2fdd87c01ba89b676790379b3cdf4..a691d092ec8aa4c64bb474f7f6434fb380707dbe 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -38,10 +38,21 @@ template ::ReturnType ResScalar; +#ifdef EIGEN_RISCV64_USE_RVV10 +#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ + typedef typename std::conditional_t< \ + NumTraits::IsComplex || NumTraits::IsComplex, \ + typename packet_traits::type, \ + typename gemv_packet_cond::type, \ + typename packet_traits::half, \ + typename unpacket_traits::half>::half>::type> \ + name##Packet##postfix +#else #define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size) \ typedef typename gemv_packet_cond< \ packet_size, typename packet_traits::type, typename packet_traits::half, \ typename unpacket_traits::half>::half>::type name##Packet##postfix +#endif PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_); PACKET_DECL_COND_POSTFIX(_, Rhs, 
PacketSize_);
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 49f307c734e937f013e659e931286a17ef6756f9..5be3e8028baa36124759371284fcbc40988106a3 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -68,6 +68,8 @@
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
 #elif defined __HVX__ && (__HVX_LENGTH__ == 128)
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
+#elif defined(EIGEN_RISCV64_USE_RVV10)
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
 #else
 #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
 #endif
@@ -104,7 +106,7 @@
 // Only static alignment is really problematic (relies on nonstandard compiler extensions),
 // try to keep heap alignment even when we have to disable static alignment.
 #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
-                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64 || EIGEN_ARCH_RISCV)
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
 #else
 #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
@@ -406,14 +408,48 @@ extern "C" {
 #define EIGEN_VECTORIZE_SVE
 #include <arm_sve.h>
 
-// Since we depend on knowing SVE vector lengths at compile-time, we need
-// to ensure a fixed lengths is set
+// Since we depend on knowing the SVE vector length at compile-time, we need
+// to ensure a fixed length is set
 #if defined __ARM_FEATURE_SVE_BITS
 #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
 #else
 #error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
 #endif
 
+#elif EIGEN_ARCH_RISCV
+
+#if defined(__riscv_zfh)
+#define EIGEN_HAS_BUILTIN_FLOAT16
+#endif
+
+// We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV10 and
+// will not select the backend automatically
+#if (defined EIGEN_RISCV64_USE_RVV10)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_RVV10
+#include <riscv_vector.h>
+
+// Since we depend on knowing the RVV vector length at compile-time, we need
+// to ensure a fixed length is set
+#if defined(__riscv_v_fixed_vlen)
+#define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#if __riscv_v_fixed_vlen >= 256
+#undef EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#endif
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#endif
+
+#if defined(__riscv_zvfh) && defined(__riscv_zfh)
+#define EIGEN_VECTORIZE_RVV10FP16
+#elif defined(__riscv_zvfh)
+#error "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#endif + +#endif // defined(EIGEN_ARCH_RISCV) + #elif (defined __s390x__ && defined __VEC__) #define EIGEN_VECTORIZE diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index fcc2db82266e54776d6efe1403fa84ae44011225..8aba62b75565ff787b1399b3dde62fb6601a1025 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -475,6 +475,7 @@ enum Type { SVE = 0x6, HVX = 0x7, LSX = 0x8, + RVV10 = 0x9, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -491,6 +492,8 @@ enum Type { Target = HVX #elif defined EIGEN_VECTORIZE_LSX Target = LSX +#elif defined EIGEN_VECTORIZE_RVV10 + Target = RVV10 #else Target = Generic #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 00d55577d9f90fdd0d279d906b98bf170ebe7386..5f29a9c7204320c7cd5ea6077a2d1d91e70bd831 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -404,6 +404,13 @@ #define EIGEN_ARCH_PPC 0 #endif +/// \internal EIGEN_ARCH_RISCV set to 1 if the architecture is RISC-V. +#if defined(__riscv) +#define EIGEN_ARCH_RISCV 1 +#else +#define EIGEN_ARCH_RISCV 0 +#endif + //------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* //------------------------------------------------------------------------------------------ @@ -976,7 +983,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) -#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) +#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_RISCV) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index a0e160eba4fe95b35ca48c109f389842014c4bd3..e91a14e9dd71981a7ab04b3f6b04c58888da3c05 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -264,7 +264,7 @@ struct functor_cost { static constexpr Index Cost = plain_enum_max(nested_functor_cost::Cost, 1); }; -template +template struct packet_traits; template @@ -285,9 +285,12 @@ struct find_best_packet_helper { typedef typename find_best_packet_helper::half>::type type; }; -template +template +struct find_best_packet; + +template struct find_best_packet { - typedef typename find_best_packet_helper::type>::type type; + typedef typename find_best_packet_helper::type>::type type; }; template () * diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 2686a5237af521aa03bc21c613c2f988df974f23..9bc9b1099f9f6701a5665906eb94744ef0b18d05 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -301,12 +301,25 @@ template { static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) { +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef + typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, typename packet_traits::type> + Packet; + typedef typename std::conditional_t::IsComplex || NumTraits::IsComplex, + typename packet_traits::type, + typename packet_traits::type> + OtherPacket; + + constexpr Index PacketSize = unpacket_traits::size; +#else typedef typename packet_traits::type Packet; typedef typename packet_traits::type OtherPacket; - constexpr int RequiredAlignment = - 
(std::max)(unpacket_traits::alignment, unpacket_traits::alignment); constexpr Index PacketSize = packet_traits::size; +#endif + constexpr int RequiredAlignment = + (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); /*** dynamic-size vectorized paths ***/ if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) { diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 76475923f8cc371fa9b6ea90099429aa21e73de7..4259b61b1db15720ba9ea96ace67aea01c09b7e1 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -1816,6 +1816,19 @@ EIGEN_DECLARE_TEST(packetmath) { CALL_SUBTEST_14((packetmath::type>())); CALL_SUBTEST_14((packetmath_scatter_gather::type>())); CALL_SUBTEST_15(test::runner::run()); +#ifdef EIGEN_RISCV64_USE_RVV10 + CALL_SUBTEST_16((test::runner::type>::run())); + CALL_SUBTEST_17((test::runner::type>::run())); + CALL_SUBTEST_18((test::runner::type>::run())); + CALL_SUBTEST_19((test::runner::type>::run())); + CALL_SUBTEST_20((test::runner::type>::run())); + CALL_SUBTEST_21((test::runner::type>::run())); + CALL_SUBTEST_22((test::runner::type>::run())); + CALL_SUBTEST_23((test::runner::type>::run())); + CALL_SUBTEST_24((test::runner::type>::run())); + CALL_SUBTEST_25((test::runner::type>::run())); + CALL_SUBTEST_26((test::runner::type>::run())); +#endif g_first_pass = false; } } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 724fa40ba1f0a37a359c9f2af5df887d9a184fae..02d46cef8d1966a63c19995ef39ede40e50c4246 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -107,7 +107,11 @@ template ::Vector struct vectorization_logic { typedef internal::packet_traits PacketTraits; +#ifdef EIGEN_RISCV64_USE_RVV10 + typedef typename internal::packet_traits::type PacketType; +#else typedef typename internal::packet_traits::type PacketType; +#endif typedef typename internal::unpacket_traits::half HalfPacketType; enum { PacketSize = internal::unpacket_traits::size,