// wide/f32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { avx: m256 }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f32x8 { a : f32x4, b : f32x4 }
  }
}

macro_rules! const_f32_as_f32x8 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f32x8 = f32x8::new([$f; 8]);
  };
}

impl f32x8 {
  const_f32_as_f32x8!(ONE, 1.0);
  const_f32_as_f32x8!(HALF, 0.5);
  const_f32_as_f32x8!(ZERO, 0.0);
  const_f32_as_f32x8!(E, core::f32::consts::E);
  const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
  const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
  const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
  const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
  const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
  const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
  const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
  const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
  const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
  const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2);
  const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10);
  const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E);
  const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E);
  const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2);
  const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10);
  const_f32_as_f32x8!(PI, core::f32::consts::PI);
  const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2);
  const_f32_as_f32x8!(TAU, core::f32::consts::TAU);
}

unsafe impl Zeroable for f32x8 {}
unsafe impl Pod for f32x8 {}

impl AlignTo for f32x8 {
  type Elem = f32;
}

impl Add for f32x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f32x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f32x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f32x8 {
  type Output = Self;
  #[inline]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Add<f32> for f32x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: f32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f32> for f32x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: f32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f32> for f32x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: f32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f32> for f32x8 {
  type Output = Self;
  #[inline]
  fn div(self, rhs: f32) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  fn add(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).add(rhs)
  }
}

impl Sub<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  fn sub(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).sub(rhs)
  }
}

impl Mul<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  fn mul(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).mul(rhs)
  }
}

impl Div<f32x8> for f32 {
  type Output = f32x8;
  #[inline]
  fn div(self, rhs: f32x8) -> Self::Output {
    f32x8::splat(self).div(rhs)
  }
}

impl BitAnd for f32x8 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f32x8 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f32x8 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_eq(rhs.a),
          b : self.b.simd_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ge(rhs.a),
          b : self.b.simd_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_gt(rhs.a),
          b : self.b.simd_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ne(rhs.a),
          b : self.b.simd_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_le(rhs.a),
          b : self.b.simd_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f32x8 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_lt(rhs.a),
          b : self.b.simd_lt(rhs.b),
        }
      }
    }
  }
}

impl f32x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [f32; 8]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: floor_m256(self.avx) }
      } else {
        Self {
          a : self.a.floor(),
          b : self.b.floor(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: ceil_m256(self.avx) }
      } else {
        Self {
          a : self.a.ceil(),
          b : self.b.ceil(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
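  ///
  /// # Examples
  /// With no NaNs involved, each lane is simply the larger input (values
  /// chosen arbitrarily for illustration):
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, 5.0, 3.0, 8.0, 2.0, 9.0, 4.0, 7.0]);
  /// let b = f32x8::from([4.0, 2.0, 6.0, 1.0, 5.0, 3.0, 8.0, 2.0]);
  /// assert_eq!(a.fast_max(b), f32x8::from([4.0, 5.0, 6.0, 8.0, 5.0, 9.0, 8.0, 7.0]));
  /// ```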
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This doesn't match
  /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`.
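  ///
  /// # Examples
  /// Per the definition above, a NaN in `rhs` selects the `self` lane
  /// (illustrative values):
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, 5.0, 3.0, 8.0, 2.0, 9.0, 4.0, 7.0]);
  /// let b = f32x8::from([4.0, 2.0, 6.0, f32::NAN, 5.0, 3.0, 8.0, 2.0]);
  /// assert_eq!(a.max(b), f32x8::from([4.0, 5.0, 6.0, 8.0, 5.0, 9.0, 8.0, 7.0]));
  /// ```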
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
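  ///
  /// # Examples
  /// With no NaNs involved, each lane is simply the smaller input
  /// (illustrative values):
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, 5.0, 3.0, 8.0, 2.0, 9.0, 4.0, 7.0]);
  /// let b = f32x8::from([4.0, 2.0, 6.0, 1.0, 5.0, 3.0, 8.0, 2.0]);
  /// assert_eq!(a.fast_min(b), f32x8::from([1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 2.0]));
  /// ```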
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
  /// the other lane gets chosen. Use `fast_min` for a faster implementation
  /// that doesn't handle NaNs.
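  ///
  /// # Examples
  /// A NaN in either input selects the other input's lane (illustrative
  /// values):
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, f32::NAN, 3.0, 8.0, 2.0, 9.0, 4.0, 7.0]);
  /// let b = f32x8::from([4.0, 2.0, f32::NAN, 1.0, 5.0, 3.0, 8.0, 2.0]);
  /// assert_eq!(a.min(b), f32x8::from([1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 2.0]));
  /// ```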
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    // Shifting left by one drops the sign bit and leaves the exponent in the
    // top 8 bits; a lane is non-finite exactly when those bits are all ones.
    let shifted_exp_mask = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
    cast(out)
  }
  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    // Same trick as `is_finite`: after dropping the sign bit, infinity is the
    // unique pattern with an all-ones exponent and a zero mantissa.
    let shifted_inf = u32x8::from(0xFF000000);
    let u: u32x8 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).simd_eq(shifted_inf);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      // NOTE: Is there an SSE2 version of this? The f32x4 version probably
      // translates, but I've not had time to figure it out.
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  /// Rounds each lane into an integer. This is a faster implementation than
  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
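  ///
  /// # Examples
  /// With everything in range, this is plain round-to-nearest (illustrative
  /// values):
  /// ```
  /// # use wide::{f32x8, i32x8};
  /// let v = f32x8::from([1.1, 2.7, -3.4, -0.2, 5.9, -6.6, 7.2, -8.8]);
  /// assert_eq!(v.fast_round_int(), i32x8::from([1, 3, -3, 0, 6, -7, 7, -9]));
  /// ```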
  #[inline]
  #[must_use]
  pub fn fast_round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_round_int(),
          self.b.fast_round_int(),
        ])
      }
    }
  }

  /// Rounds each lane into an integer. This saturates out of range values and
  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
  /// doesn't handle out of range values or NaNs.
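  ///
  /// # Examples
  /// NaN becomes 0 and out of range values saturate (illustrative values):
  /// ```
  /// # use wide::{f32x8, i32x8};
  /// let v = f32x8::from([f32::NAN, 3.0e9, -3.0e9, 1.1, -2.7, 100.6, -0.4, 8.0]);
  /// let expected = i32x8::from([0, i32::MAX, i32::MIN, 1, -3, 101, 0, 8]);
  /// assert_eq!(v.round_int(), expected);
  /// ```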
  #[inline]
  #[must_use]
  pub fn round_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        // The conversion returns i32::MIN for NaN and out-of-range inputs, so
        // NaNs are zeroed first, and lanes >= 2^31 are XORed with all ones to
        // turn that i32::MIN into i32::MAX.
        let non_nan_mask = self.simd_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.simd_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.round_int(),
          self.b.round_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This is a faster implementation than
  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation defined behavior.
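  ///
  /// # Examples
  /// With everything in range, this just drops the fractional part
  /// (illustrative values):
  /// ```
  /// # use wide::{f32x8, i32x8};
  /// let v = f32x8::from([1.9, -1.9, 2.5, -2.5, 0.7, -0.7, 8.1, -8.1]);
  /// assert_eq!(v.fast_trunc_int(), i32x8::from([1, -1, 2, -2, 0, 0, 8, -8]));
  /// ```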
  #[inline]
  #[must_use]
  pub fn fast_trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        cast(convert_truncate_to_i32_m256i_from_m256(self.avx))
      } else {
        cast([
          self.a.fast_trunc_int(),
          self.b.fast_trunc_int(),
        ])
      }
    }
  }

  /// Truncates each lane into an integer. This saturates out of range values
  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
  /// that doesn't handle out of range values or NaNs.
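  ///
  /// # Examples
  /// NaN becomes 0 and out of range values saturate (illustrative values):
  /// ```
  /// # use wide::{f32x8, i32x8};
  /// let v = f32x8::from([f32::NAN, 3.0e9, -3.0e9, 1.9, -1.9, 0.7, -0.7, 8.5]);
  /// let expected = i32x8::from([0, i32::MAX, i32::MIN, 1, -1, 0, 0, 8]);
  /// assert_eq!(v.trunc_int(), expected);
  /// ```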
  #[inline]
  #[must_use]
  pub fn trunc_int(self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
        let non_nan_mask = self.simd_eq(self);
        let non_nan = self & non_nan_mask;
        let flip_to_max: i32x8 = cast(self.simd_ge(Self::splat(2147483648.0)));
        let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx));
        flip_to_max ^ cast
      } else {
        cast([
          self.a.trunc_int(),
          self.b.trunc_int(),
        ])
      }
    }
  }

  /// Performs a multiply-add operation: `self * m + a`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and add operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfmadd` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) + a` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([10.0; 8]);
  ///
  /// let result = a.mul_add(b, c);
  ///
  /// let expected = f32x8::from([12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a multiply-subtract operation: `self * m - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and subtract operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfmsub` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([10.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([5.0; 8]);
  ///
  /// let result = a.mul_sub(b, c);
  ///
  /// let expected = f32x8::from([15.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - s
      } else {
        Self {
          a : self.a.mul_sub(m.a, s.a),
          b : self.b.mul_sub(m.b, s.b),
        }
      }
    }
  }

  /// Performs a negative multiply-add operation: `a - (self * m)`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfnmadd` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `a - (self * m)` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([3.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([10.0; 8]);
  ///
  /// let result = a.mul_neg_add(b, c);
  ///
  /// let expected = f32x8::from([4.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses `vfnmsub` (single rounding, best
  ///   accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `-(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f32x4`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x8;
  /// let a = f32x8::from([3.0; 8]);
  /// let b = f32x8::from([2.0; 8]);
  /// let c = f32x8::from([1.0; 8]);
  ///
  /// let result = a.mul_neg_sub(b, c);
  ///
  /// let expected = f32x8::from([-7.0; 8]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - s
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, s.a),
          b : self.b.mul_neg_sub(m.b, s.b),
        }
      }
    }
  }

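  /// Flips the sign of each lane of `self` wherever the corresponding lane of
  /// `signs` has its sign bit set (i.e. the result's sign is the XOR of both
  /// signs).
  ///
  /// An illustrative example:
  /// ```
  /// # use wide::f32x8;
  /// let v = f32x8::from([1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0]);
  /// let signs = f32x8::from([-1.0, -1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1.0]);
  /// assert_eq!(v.flip_signs(signs), f32x8::from([-1.0, 2.0, 3.0, -4.0, -5.0, -6.0, 7.0, 8.0]));
  /// ```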
  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

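  /// Returns a vector with the magnitude of `self` and the sign of `sign`,
  /// lanewise (the vector analog of `f32::copysign`).
  ///
  /// An illustrative example:
  /// ```
  /// # use wide::f32x8;
  /// let v = f32x8::from([1.5, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0]);
  /// let sign = f32x8::from([-1.0, 1.0, -0.0, 0.0, -2.0, 2.0, -1.0, 1.0]);
  /// assert_eq!(v.copysign(sign), f32x8::from([-1.5, 2.0, -3.0, 4.0, -5.0, 6.0, -7.0, 8.0]));
  /// ```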
  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

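  /// Computes both `asin(x)` and `acos(x)` lanewise in a single call, sharing
  /// the polynomial evaluation between them.
  ///
  /// A loose accuracy check (tolerance chosen arbitrarily for illustration):
  /// ```
  /// # use wide::f32x8;
  /// let (asin, acos) = f32x8::splat(0.5).asin_acos();
  /// let e1 = (asin - f32x8::splat(core::f32::consts::FRAC_PI_6)).abs().to_array();
  /// let e2 = (acos - f32x8::splat(core::f32::consts::FRAC_PI_3)).abs().to_array();
  /// assert!(e1.iter().chain(e2.iter()).all(|&e| e < 1e-4));
  /// ```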
  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.simd_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    (asin, acos)
  }

  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // asin
    let z3 = f32x8::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  #[must_use]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P4asinf, 4.2163199048E-2);
    const_f32_as_f32x8!(P3asinf, 2.4181311049E-2);
    const_f32_as_f32x8!(P2asinf, 4.5470025998E-2);
    const_f32_as_f32x8!(P1asinf, 7.4953002686E-2);
    const_f32_as_f32x8!(P0asinf, 1.6666752422E-1);

    let xa = self.abs();
    let big = xa.simd_ge(f32x8::splat(0.5));

    let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa);
    let x2 = xa * xa;
    let x3 = big.blend(x1, x2);

    let xb = x1.sqrt();

    let x4 = big.blend(xb, xa);

    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
    let z = z.mul_add(x3 * x4, x4);

    let z1 = z + z;

    // acos
    let z3 = self.simd_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1);
    let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }

  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let t = self.abs();

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
    let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;

    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

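  /// Four-quadrant arctangent of `self` (as `y`) over `x`, lanewise.
  ///
  /// A loose accuracy check (tolerance chosen arbitrarily for illustration):
  /// ```
  /// # use wide::f32x8;
  /// let y = f32x8::splat(1.0);
  /// let x = f32x8::splat(1.0);
  /// let err = (y.atan2(x) - f32x8::FRAC_PI_4).abs().to_array();
  /// assert!(err.iter().all(|&e| e < 1e-4));
  /// ```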
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f32_as_f32x8!(P3atanf, 8.05374449538E-2);
    const_f32_as_f32x8!(P2atanf, -1.38776856032E-1);
    const_f32_as_f32x8!(P1atanf, 1.99777106478E-1);
    const_f32_as_f32x8!(P0atanf, -3.33329491539E-1);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.simd_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 will produce NAN. No problem, fixed below
    let t = y2 / x2;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);

    let a = notsmal.blend(t - Self::ONE, t);
    let b = notsmal.blend(t + Self::ONE, Self::ONE);
    let s = notsmal & Self::FRAC_PI_4;
    let z = a / b;

    let zz = z * z;

    // Taylor expansion
    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
    re = re.mul_add(zz * z, z) + s;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0);
    const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
    const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0);

    const_f32_as_f32x8!(P0sinf, -1.6666654611E-1);
    const_f32_as_f32x8!(P1sinf, 8.3321608736E-3);
    const_f32_as_f32x8!(P2sinf, -1.9515295891E-4);

    const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2);
    const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3);
    const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5);

    const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);

    let xa = self.abs();

    // Find quadrant
    let y = (xa * TWO_OVER_PI).round();
    let q: i32x8 = y.round_int();

    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));

    let x2 = x * x;
    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
      + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0));

    let swap = !(q & i32x8::from(1)).simd_eq(i32x8::from(0));

    let mut overflow: f32x8 = cast(q.simd_gt(i32x8::from(0x2000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f32x8::from(0.0), s);
    c = overflow.blend(f32x8::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f32x8>(swap).blend(c, s);
    let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f32x8>(swap).blend(s, c);
    let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30;
    cos1 ^= cast::<_, f32x8>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
    self * DEG_TO_RAD_RATIO
  }
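  /// Lanewise approximation of `1.0 / self`.
  ///
  /// On AVX this maps to the hardware reciprocal-approximation instruction,
  /// which only carries roughly 12 bits of precision, so don't expect exact
  /// answers.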
  #[inline]
  #[must_use]
  pub fn recip(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip(),
          b : self.b.recip(),
        }
      }
    }
  }
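  /// Lanewise approximation of `1.0 / self.sqrt()`.
  ///
  /// Like `recip`, the AVX path uses a hardware approximation instruction
  /// with roughly 12 bits of precision.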
  #[inline]
  #[must_use]
  pub fn recip_sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: reciprocal_sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.recip_sqrt(),
          b : self.b.recip_sqrt(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
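  /// Packs the sign bit of each lane into the low 8 bits of a `u32`, with
  /// lane `i` landing in bit `i`.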
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) as u32
      } else {
        (self.b.to_bitmask() << 4) | self.a.to_bitmask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256(self.avx) == 0b11111111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  #[inline]
  fn vm_pow2n(self) -> Self {
    // Computes 2^n for integer-valued `self`: adding bias + 2^23 places
    // n + bias in the low mantissa bits, and shifting left by 23 moves it
    // into the exponent field.
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = self + (bias + pow2_23);
    let c = cast::<_, i32x8>(a) << 23;
    cast::<_, f32x8>(c)
  }

  /// Calculates the exponential function `e^x` of each lane of a packed
  /// `f32x8`.
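  ///
  /// A quick sanity check (the `1e-5` tolerance here is illustrative):
  /// ```
  /// # use wide::f32x8;
  /// assert_eq!(f32x8::splat(0.0).exp(), f32x8::splat(1.0));
  /// let err = (f32x8::splat(1.0).exp() - f32x8::E).abs().to_array();
  /// assert!(err.iter().all(|&e| e < 1e-5));
  /// ```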
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f32_as_f32x8!(P0, 1.0 / 2.0);
    const_f32_as_f32x8!(P1, 1.0 / 6.0);
    const_f32_as_f32x8!(P2, 1.0 / 24.0);
    const_f32_as_f32x8!(P3, 1.0 / 120.0);
    const_f32_as_f32x8!(P4, 1.0 / 720.0);
    const_f32_as_f32x8!(P5, 1.0 / 5040.0);
    const_f32_as_f32x8!(LN2D_HI, 0.693359375);
    const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4);
    let max_x = f32x8::from(87.3);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let z = z.mul_add(x2, x);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().simd_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  #[inline]
  fn exponent(self) -> f32x8 {
    // Extracts the unbiased exponent of a non-negative input as a float,
    // using the mantissa-packing trick from `vm_pow2n` in reverse.
    const_f32_as_f32x8!(pow2_23, 8388608.0);
    const_f32_as_f32x8!(bias, 127.0);
    let a = cast::<_, u32x8>(self);
    let b = a >> 23;
    let c = b | cast::<_, u32x8>(pow2_23);
    let d = cast::<_, f32x8>(c);
    let e = d - (pow2_23 + bias);
    e
  }

  #[inline]
  fn fraction_2(self) -> Self {
    // Keeps the mantissa and forces the exponent to that of 0.5, mapping the
    // input into [0.5, 1.0).
    let t1 = cast::<_, u32x8>(self);
    let t2 = cast::<_, u32x8>(
      (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000),
    );
    cast::<_, f32x8>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    // A lane is zero or subnormal exactly when its exponent field is zero.
    let t = cast::<_, i32x8>(self);
    let t = t & i32x8::splat(0x7F800000);
    i32x8::round_float(t.simd_eq(i32x8::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f32x8>(i32x8::splat(0x7F800000))
  }
  #[inline]
  fn nan_log() -> Self {
    // A quiet NaN with a distinguishing payload.
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }
  #[inline]
  fn nan_pow() -> Self {
    // A quiet NaN with a distinguishing payload.
    cast::<_, f32x8>(i32x8::splat(0x7FC00000 | (0x101 & 0x003FFFFF)))
  }
  /// Returns a mask of which lanes have their sign bit set.
  #[inline]
  pub fn sign_bit(self) -> Self {
    // The arithmetic shift smears the sign bit across each lane: all ones
    // for negative lanes, all zeros otherwise.
    let t1 = cast::<_, i32x8>(self);
    let t2 = t1 >> 31;
    !cast::<_, f32x8>(t2).simd_eq(f32x8::ZERO)
  }

  /// Horizontal add of all the elements of the vector.
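  ///
  /// ```
  /// # use wide::f32x8;
  /// let v = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
  /// assert_eq!(v.reduce_add(), 36.0);
  /// ```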
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> f32 {
    pick! {
      // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally
      if #[cfg(target_feature="avx")]{
        let hi_quad = extract_m128_from_m256::<1>(self.avx);
        let lo_quad = cast_to_m128_from_m256(self.avx);
        let sum_quad = add_m128(lo_quad, hi_quad);
        let lo_dual = sum_quad;
        let hi_dual = move_high_low_m128(sum_quad, sum_quad);
        let sum_dual = add_m128(lo_dual, hi_dual);
        let lo = sum_dual;
        let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual);
        let sum = add_m128_s(lo, hi);
        get_f32_from_m128_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (`ln(x)`) of each lane.
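  ///
  /// A quick sanity check (the `1e-5` tolerance here is illustrative):
  /// ```
  /// # use wide::f32x8;
  /// assert_eq!(f32x8::splat(1.0).ln(), f32x8::splat(0.0));
  /// let err = (f32x8::E.ln() - f32x8::splat(1.0)).abs().to_array();
  /// assert!(err.iter().all(|&e| e < 1e-5));
  /// ```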
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f32_as_f32x8!(HALF, 0.5);
    const_f32_as_f32x8!(P0, 3.3333331174E-1);
    const_f32_as_f32x8!(P1, -2.4999993993E-1);
    const_f32_as_f32x8!(P2, 2.0000714765E-1);
    const_f32_as_f32x8!(P3, -1.6668057665E-1);
    const_f32_as_f32x8!(P4, 1.4249322787E-1);
    const_f32_as_f32x8!(P5, -1.2420140846E-1);
    const_f32_as_f32x8!(P6, 1.1676998740E-1);
    const_f32_as_f32x8!(P7, -1.1514610310E-1);
    const_f32_as_f32x8!(P8, 7.0376836292E-2);
    const_f32_as_f32x8!(LN2F_HI, 0.693359375);
    const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4);
    const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.simd_gt(Self::SQRT_2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
    let x2 = x * x;
    let res = x2 * x * res;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

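  /// Raises each lane of `self` to the power given by the corresponding lane
  /// of `y`, including the usual edge-case handling for negative bases,
  /// zeros, NaNs, and infinities.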
  #[inline]
  #[must_use]
  pub fn pow_f32x8(self, y: Self) -> Self {
    const_f32_as_f32x8!(ln2f_hi, 0.693359375);
    const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x8!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x8!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x8!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x8!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x8!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x8!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x8!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x8!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x8!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x8!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x8!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x8!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x8!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x8!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.simd_gt(f32x8::SQRT_2 * f32x8::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x8::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x8::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2);
    let lg_err = f32x8::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x8::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2);

    let x = v;
    let e3 = (x * f32x8::LOG2_E).round();
    let x = e3.mul_neg_add(f32x8::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x8::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x8>(ee.round_int());
    let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23));

    let overflow = cast::<_, f32x8>(ej.simd_gt(i32x8::splat(0x0FF)))
      | (ee.simd_gt(f32x8::splat(300.0)));
    let underflow = cast::<_, f32x8>(ej.simd_lt(i32x8::splat(0x000)))
      | (ee.simd_lt(f32x8::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23));
    // Check for overflow/underflow
    let z = underflow.blend(f32x8::ZERO, z);
    let z = overflow.blend(Self::infinity(), z);

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.simd_lt(f32x8::ZERO).blend(
        Self::infinity(),
        y.simd_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.simd_eq(y.round());

      // Is y odd?
      let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));

      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
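  /// Raises each lane of `self` to the scalar power `y`; this is just
  /// `pow_f32x8` with `y` splatted across all lanes.
  ///
  /// A loose accuracy check (tolerance chosen arbitrarily for illustration):
  /// ```
  /// # use wide::f32x8;
  /// let err = (f32x8::splat(2.0).powf(2.0) - f32x8::splat(4.0)).abs().to_array();
  /// assert!(err.iter().all(|&e| e < 1e-4));
  /// ```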
  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x8(self, f32x8::splat(y))
  }

  /// Transposes an 8x8 matrix of `f32`s. Currently only accelerated on AVX.
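  ///
  /// A quick illustration: a matrix whose row `r` is `splat(r)` transposes to
  /// one where every row counts `0.0..=7.0`:
  /// ```
  /// # use wide::f32x8;
  /// let rows: [f32x8; 8] = core::array::from_fn(|r| f32x8::splat(r as f32));
  /// let t = f32x8::transpose(rows);
  /// let counting = f32x8::from([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
  /// assert!(t.iter().all(|&row| row == counting));
  /// ```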
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx")] {
        let a0 = unpack_lo_m256(data[0].avx, data[1].avx);
        let a1 = unpack_hi_m256(data[0].avx, data[1].avx);
        let a2 = unpack_lo_m256(data[2].avx, data[3].avx);
        let a3 = unpack_hi_m256(data[2].avx, data[3].avx);
        let a4 = unpack_lo_m256(data[4].avx, data[5].avx);
        let a5 = unpack_hi_m256(data[4].avx, data[5].avx);
        let a6 = unpack_lo_m256(data[6].avx, data[7].avx);
        let a7 = unpack_hi_m256(data[6].avx, data[7].avx);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO : i32 = mm_shuffle(1,0,1,0);
        const SHUFF_HI : i32 = mm_shuffle(3,2,3,2);

        // possible todo: the Intel performance manual suggests an alternative
        // using blend to avoid port 5 pressure (since blend runs on a
        // different port than shuffle)
        let b0 = shuffle_m256::<SHUFF_LO>(a0,a2);
        let b1 = shuffle_m256::<SHUFF_HI>(a0,a2);
        let b2 = shuffle_m256::<SHUFF_LO>(a1,a3);
        let b3 = shuffle_m256::<SHUFF_HI>(a1,a3);
        let b4 = shuffle_m256::<SHUFF_LO>(a4,a6);
        let b5 = shuffle_m256::<SHUFF_HI>(a4,a6);
        let b6 = shuffle_m256::<SHUFF_LO>(a5,a7);
        let b7 = shuffle_m256::<SHUFF_HI>(a5,a7);

        [
          f32x8 { avx: permute2z_m256::<0x20>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x20>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x20>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x20>(b3, b7) },
          f32x8 { avx: permute2z_m256::<0x31>(b0, b4) },
          f32x8 { avx: permute2z_m256::<0x31>(b1, b5) },
          f32x8 { avx: permute2z_m256::<0x31>(b2, b6) },
          f32x8 { avx: permute2z_m256::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 {
          f32x8::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
            data[4].as_array()[index],
            data[5].as_array()[index],
            data[6].as_array()[index],
            data[7].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [f32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[f32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [f32; 8] {
    cast_mut(self)
  }

  #[inline]
  pub fn from_i32x8(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx: convert_to_m256_from_i32_m256i(v.avx2) }
      } else {
        Self::new([
          v.as_array()[0] as f32,
          v.as_array()[1] as f32,
          v.as_array()[2] as f32,
          v.as_array()[3] as f32,
          v.as_array()[4] as f32,
          v.as_array()[5] as f32,
          v.as_array()[6] as f32,
          v.as_array()[7] as f32,
        ])
      }
    }
  }
}

impl Not for f32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}