wide/f32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(16))]
7    pub struct f32x4 { pub(crate) sse: m128 }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct f32x4 { pub(crate) simd: v128 }
14
15    impl Default for f32x4 {
16      fn default() -> Self {
17        Self::splat(0.0)
18      }
19    }
20
21    impl PartialEq for f32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(f32x4_eq(self.simd, other.simd))
24      }
25    }
26  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27    use core::arch::aarch64::*;
28    #[repr(C)]
29    #[derive(Copy, Clone)]
30    pub struct f32x4 { pub(crate) neon : float32x4_t }
31
32    impl Default for f32x4 {
33      #[inline]
34      fn default() -> Self {
35        unsafe { Self { neon: vdupq_n_f32(0.0)} }
36      }
37    }
38
39    impl PartialEq for f32x4 {
40      #[inline]
41      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
43      }
44
45    }
  } else {
47    #[derive(Default, Clone, Copy, PartialEq)]
48    #[repr(C, align(16))]
49    pub struct f32x4 { pub(crate) arr: [f32;4] }
50  }
51}
52
53macro_rules! const_f32_as_f32x4 {
54  ($i:ident, $f:expr) => {
55    #[allow(non_upper_case_globals)]
56    pub const $i: f32x4 = f32x4::new([$f; 4]);
57  };
58}
59
60impl f32x4 {
61  const_f32_as_f32x4!(ONE, 1.0);
62  const_f32_as_f32x4!(ZERO, 0.0);
63  const_f32_as_f32x4!(HALF, 0.5);
64  const_f32_as_f32x4!(E, core::f32::consts::E);
65  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
66  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
67  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
68  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
69  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
70  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
71  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
72  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
73  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
74  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
75  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
76  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
77  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
78  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
79  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
80  const_f32_as_f32x4!(PI, core::f32::consts::PI);
81  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
82  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
83}
84
85unsafe impl Zeroable for f32x4 {}
86unsafe impl Pod for f32x4 {}
87
88impl AlignTo for f32x4 {
89  type Elem = f32;
90}
91
92impl Add for f32x4 {
93  type Output = Self;
94  #[inline]
95  fn add(self, rhs: Self) -> Self::Output {
96    pick! {
97      if #[cfg(target_feature="sse")] {
98        Self { sse: add_m128(self.sse, rhs.sse) }
99      } else if #[cfg(target_feature="simd128")] {
100        Self { simd: f32x4_add(self.simd, rhs.simd) }
101      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
103      } else {
104        Self { arr: [
105          self.arr[0] + rhs.arr[0],
106          self.arr[1] + rhs.arr[1],
107          self.arr[2] + rhs.arr[2],
108          self.arr[3] + rhs.arr[3],
109        ]}
110      }
111    }
112  }
113}
114
115impl Sub for f32x4 {
116  type Output = Self;
117  #[inline]
118  fn sub(self, rhs: Self) -> Self::Output {
119    pick! {
120      if #[cfg(target_feature="sse")] {
121        Self { sse: sub_m128(self.sse, rhs.sse) }
122      } else if #[cfg(target_feature="simd128")] {
123        Self { simd: f32x4_sub(self.simd, rhs.simd) }
124      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126      } else {
127        Self { arr: [
128          self.arr[0] - rhs.arr[0],
129          self.arr[1] - rhs.arr[1],
130          self.arr[2] - rhs.arr[2],
131          self.arr[3] - rhs.arr[3],
132        ]}
133      }
134    }
135  }
136}
137
138impl Mul for f32x4 {
139  type Output = Self;
140  #[inline]
141  fn mul(self, rhs: Self) -> Self::Output {
142    pick! {
143      if #[cfg(target_feature="sse")] {
144        Self { sse: mul_m128(self.sse, rhs.sse) }
145      } else if #[cfg(target_feature="simd128")] {
146        Self { simd: f32x4_mul(self.simd, rhs.simd) }
147      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
148        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
149      } else {
150        Self { arr: [
151          self.arr[0] * rhs.arr[0],
152          self.arr[1] * rhs.arr[1],
153          self.arr[2] * rhs.arr[2],
154          self.arr[3] * rhs.arr[3],
155        ]}
156      }
157    }
158  }
159}
160
161impl Div for f32x4 {
162  type Output = Self;
163  #[inline]
164  fn div(self, rhs: Self) -> Self::Output {
165    pick! {
166      if #[cfg(target_feature="sse")] {
167        Self { sse: div_m128(self.sse, rhs.sse) }
168      } else if #[cfg(target_feature="simd128")] {
169        Self { simd: f32x4_div(self.simd, rhs.simd) }
170      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
171        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
172      } else {
173        Self { arr: [
174          self.arr[0] / rhs.arr[0],
175          self.arr[1] / rhs.arr[1],
176          self.arr[2] / rhs.arr[2],
177          self.arr[3] / rhs.arr[3],
178        ]}
179      }
180    }
181  }
182}
183
184impl Add<f32> for f32x4 {
185  type Output = Self;
186  #[inline]
187  fn add(self, rhs: f32) -> Self::Output {
188    self.add(Self::splat(rhs))
189  }
190}
191
192impl Sub<f32> for f32x4 {
193  type Output = Self;
194  #[inline]
195  fn sub(self, rhs: f32) -> Self::Output {
196    self.sub(Self::splat(rhs))
197  }
198}
199
200impl Mul<f32> for f32x4 {
201  type Output = Self;
202  #[inline]
203  fn mul(self, rhs: f32) -> Self::Output {
204    self.mul(Self::splat(rhs))
205  }
206}
207
208impl Div<f32> for f32x4 {
209  type Output = Self;
210  #[inline]
211  fn div(self, rhs: f32) -> Self::Output {
212    self.div(Self::splat(rhs))
213  }
214}
215
216impl Add<f32x4> for f32 {
217  type Output = f32x4;
218  #[inline]
219  fn add(self, rhs: f32x4) -> Self::Output {
220    f32x4::splat(self).add(rhs)
221  }
222}
223
224impl Sub<f32x4> for f32 {
225  type Output = f32x4;
226  #[inline]
227  fn sub(self, rhs: f32x4) -> Self::Output {
228    f32x4::splat(self).sub(rhs)
229  }
230}
231
232impl Mul<f32x4> for f32 {
233  type Output = f32x4;
234  #[inline]
235  fn mul(self, rhs: f32x4) -> Self::Output {
236    f32x4::splat(self).mul(rhs)
237  }
238}
239
240impl Div<f32x4> for f32 {
241  type Output = f32x4;
242  #[inline]
243  fn div(self, rhs: f32x4) -> Self::Output {
244    f32x4::splat(self).div(rhs)
245  }
246}
247
248impl BitAnd for f32x4 {
249  type Output = Self;
250  #[inline]
251  fn bitand(self, rhs: Self) -> Self::Output {
252    pick! {
253      if #[cfg(target_feature="sse")] {
254        Self { sse: bitand_m128(self.sse, rhs.sse) }
255      } else if #[cfg(target_feature="simd128")] {
256        Self { simd: v128_and(self.simd, rhs.simd) }
257      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
258        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
259      } else {
260        Self { arr: [
261          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
262          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
263          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
264          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
265        ]}
266      }
267    }
268  }
269}
270
271impl BitOr for f32x4 {
272  type Output = Self;
273  #[inline]
274  fn bitor(self, rhs: Self) -> Self::Output {
275    pick! {
276      if #[cfg(target_feature="sse")] {
277        Self { sse: bitor_m128(self.sse, rhs.sse) }
278      } else if #[cfg(target_feature="simd128")] {
279        Self { simd: v128_or(self.simd, rhs.simd) }
280      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
281        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
282      } else {
283        Self { arr: [
284          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
285          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
286          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
287          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
288        ]}
289      }
290    }
291  }
292}
293
294impl BitXor for f32x4 {
295  type Output = Self;
296  #[inline]
297  fn bitxor(self, rhs: Self) -> Self::Output {
298    pick! {
299      if #[cfg(target_feature="sse")] {
300        Self { sse: bitxor_m128(self.sse, rhs.sse) }
301      } else if #[cfg(target_feature="simd128")] {
302        Self { simd: v128_xor(self.simd, rhs.simd) }
303      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
304        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
305      } else {
306        Self { arr: [
307          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
308          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
309          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
310          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
311        ]}
312      }
313    }
314  }
315}
316
317impl CmpEq for f32x4 {
318  type Output = Self;
319  #[inline]
320  fn simd_eq(self, rhs: Self) -> Self::Output {
321    pick! {
322      if #[cfg(target_feature="sse")] {
323        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
324      } else if #[cfg(target_feature="simd128")] {
325        Self { simd: f32x4_eq(self.simd, rhs.simd) }
326      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
327        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
328      } else {
329        Self { arr: [
330          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
331          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
332          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
333          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
334        ]}
335      }
336    }
337  }
338}
339
340impl CmpGe for f32x4 {
341  type Output = Self;
342  #[inline]
343  fn simd_ge(self, rhs: Self) -> Self::Output {
344    pick! {
345      if #[cfg(target_feature="sse")] {
346        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
347      } else if #[cfg(target_feature="simd128")] {
348        Self { simd: f32x4_ge(self.simd, rhs.simd) }
349      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
350        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
351      } else {
352        Self { arr: [
353          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
354          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
355          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
356          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
357        ]}
358      }
359    }
360  }
361}
362
363impl CmpGt for f32x4 {
364  type Output = Self;
365  #[inline]
366  fn simd_gt(self, rhs: Self) -> Self::Output {
367    pick! {
368      if #[cfg(target_feature="sse")] {
369        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
370      } else if #[cfg(target_feature="simd128")] {
371        Self { simd: f32x4_gt(self.simd, rhs.simd) }
372      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
373        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
374      } else {
375        Self { arr: [
376          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
377          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
378          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
379          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
380        ]}
381      }
382    }
383  }
384}
385
386impl CmpNe for f32x4 {
387  type Output = Self;
388  #[inline]
389  fn simd_ne(self, rhs: Self) -> Self::Output {
390    pick! {
391      if #[cfg(target_feature="sse")] {
392        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
393      } else if #[cfg(target_feature="simd128")] {
394        Self { simd: f32x4_ne(self.simd, rhs.simd) }
395      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
396        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
397      } else {
398        Self { arr: [
399          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
400          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
401          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
402          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
403        ]}
404      }
405    }
406  }
407}
408
409impl CmpLe for f32x4 {
410  type Output = Self;
411  #[inline]
412  fn simd_le(self, rhs: Self) -> Self::Output {
413    pick! {
414      if #[cfg(target_feature="sse")] {
415        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
416      } else if #[cfg(target_feature="simd128")] {
417        Self { simd: f32x4_le(self.simd, rhs.simd) }
418      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
419        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
420      } else {
421        Self { arr: [
422          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
423          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
424          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
425          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
426        ]}
427      }
428    }
429  }
430}
431
432impl CmpLt for f32x4 {
433  type Output = Self;
434  #[inline]
435  fn simd_lt(self, rhs: Self) -> Self::Output {
436    pick! {
437      if #[cfg(target_feature="sse")] {
438        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
439      } else if #[cfg(target_feature="simd128")] {
440        Self { simd: f32x4_lt(self.simd, rhs.simd) }
441      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
442        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
443      } else {
444        Self { arr: [
445          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
446          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
447          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
448          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
449        ]}
450      }
451    }
452  }
453}
454
455impl f32x4 {
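  /// Wraps a `[f32; 4]` array in an `f32x4`. This is a `const fn`, so it can
  /// be used to build constants; at runtime it is equivalent to the
  /// `From<[f32; 4]>` conversion used elsewhere in these docs.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// const V: f32x4 = f32x4::new([1.0, 2.0, 3.0, 4.0]);
  /// assert_eq!(V, f32x4::from([1.0, 2.0, 3.0, 4.0]));
  /// ```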
456  #[inline]
457  #[must_use]
458  pub const fn new(array: [f32; 4]) -> Self {
459    #[allow(non_upper_case_globals)]
460    unsafe {
461      core::mem::transmute(array)
462    }
463  }
464
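  /// Mask select: lanes where `self` is all ones take their value from `t`,
  /// lanes where `self` is all zeros take their value from `f`. `self` is
  /// expected to be a lane mask such as the output of `is_nan` or the
  /// comparison methods; other mask values give backend-dependent results.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.0, f32::NAN, 3.0, f32::NAN]);
  /// // replace the NaN lanes with 0.0
  /// let cleaned = x.is_nan().blend(f32x4::from([0.0; 4]), x);
  /// assert_eq!(cleaned, f32x4::from([1.0, 0.0, 3.0, 0.0]));
  /// ```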
465  #[inline]
466  #[must_use]
467  pub fn blend(self, t: Self, f: Self) -> Self {
468    pick! {
469      if #[cfg(target_feature="sse4.1")] {
470        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
471      } else if #[cfg(target_feature="simd128")] {
472        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
473      } else {
474        generic_bit_blend(self, t, f)
475      }
476    }
477  }
478  #[inline]
479  #[must_use]
480  pub fn abs(self) -> Self {
481    pick! {
482      if #[cfg(target_feature="simd128")] {
483        Self { simd: f32x4_abs(self.simd) }
484      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
485        unsafe {Self { neon: vabsq_f32(self.neon) }}
486      } else {
487        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
488        self & non_sign_bits
489      }
490    }
491  }
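  /// Lanewise round toward negative infinity.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.9, -1.1, 2.0, -0.5]);
  /// assert_eq!(x.floor(), f32x4::from([1.0, -2.0, 2.0, -1.0]));
  /// ```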
492  #[inline]
493  #[must_use]
494  pub fn floor(self) -> Self {
495    pick! {
496      if #[cfg(target_feature="simd128")] {
497        Self { simd: f32x4_floor(self.simd) }
498      } else if #[cfg(target_feature="sse4.1")] {
499        Self { sse: floor_m128(self.sse) }
500      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
501        unsafe {Self { neon: vrndmq_f32(self.neon) }}
502      } else if #[cfg(feature="std")] {
503        let base: [f32; 4] = cast(self);
504        cast(base.map(|val| val.floor()))
505      } else {
506        let base: [f32; 4] = cast(self);
507        let rounded: [f32; 4] = cast(self.round());
508        cast([
509          if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
510          if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
511          if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
512          if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
513        ])
514      }
515    }
516  }
517  #[inline]
518  #[must_use]
519  pub fn ceil(self) -> Self {
520    pick! {
521      if #[cfg(target_feature="simd128")] {
522        Self { simd: f32x4_ceil(self.simd) }
523      } else if #[cfg(target_feature="sse4.1")] {
524        Self { sse: ceil_m128(self.sse) }
525      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
526        unsafe {Self { neon: vrndpq_f32(self.neon) }}
527      } else if #[cfg(feature="std")] {
528        let base: [f32; 4] = cast(self);
529        cast(base.map(|val| val.ceil()))
530      } else {
531        let base: [f32; 4] = cast(self);
532        let rounded: [f32; 4] = cast(self.round());
533        cast([
534          if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
535          if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
536          if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
537          if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
538        ])
539      }
540    }
541  }
542
  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but the behavior is unspecified if any lane
  /// is NaN.
546  #[inline]
547  #[must_use]
548  pub fn fast_max(self, rhs: Self) -> Self {
549    pick! {
550      if #[cfg(target_feature="sse")] {
551        Self { sse: max_m128(self.sse, rhs.sse) }
552      } else if #[cfg(target_feature="simd128")] {
553        Self {
554          simd: f32x4_pmax(self.simd, rhs.simd),
555        }
556      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
557        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
558      } else {
559        Self { arr: [
560          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
561          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
562          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
563          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
564        ]}
565      }
566    }
567  }
568
569  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
570  /// the other lane gets chosen. Use `fast_max` for a faster implementation
571  /// that doesn't handle NaNs.
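  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let a = f32x4::from([1.0, f32::NAN, 3.0, -0.5]);
  /// let b = f32x4::from([2.0, 2.0, f32::NAN, -1.0]);
  /// // NaN lanes defer to the other input
  /// assert_eq!(a.max(b), f32x4::from([2.0, 2.0, 3.0, -0.5]));
  /// ```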
572  #[inline]
573  #[must_use]
574  pub fn max(self, rhs: Self) -> Self {
575    pick! {
576      if #[cfg(target_feature="sse")] {
577        // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
578        // involved, it chooses rhs, so we need to specifically check rhs for
579        // NaN.
580        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
581      } else if #[cfg(target_feature="simd128")] {
582        // WASM has two max intrinsics:
583        // - max: This propagates NaN, that's the opposite of what we need.
584        // - pmax: This is defined as self < rhs ? rhs : self, which basically
585        //   chooses self if either is NaN.
586        //
587        // pmax is what we want, but we need to specifically check self for NaN.
588        Self {
589          simd: v128_bitselect(
590            rhs.simd,
591            f32x4_pmax(self.simd, rhs.simd),
592            f32x4_ne(self.simd, self.simd), // NaN check
593          )
594        }
595      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
596        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
597      } else {
598        Self { arr: [
599          self.arr[0].max(rhs.arr[0]),
600          self.arr[1].max(rhs.arr[1]),
601          self.arr[2].max(rhs.arr[2]),
602          self.arr[3].max(rhs.arr[3]),
603        ]}
604      }
605    }
606  }
607
  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but the behavior is unspecified if any lane
  /// is NaN.
611  #[inline]
612  #[must_use]
613  pub fn fast_min(self, rhs: Self) -> Self {
614    pick! {
615      if #[cfg(target_feature="sse")] {
616        Self { sse: min_m128(self.sse, rhs.sse) }
617      } else if #[cfg(target_feature="simd128")] {
618        Self {
619          simd: f32x4_pmin(self.simd, rhs.simd),
620        }
621      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
622        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
623      } else {
624        Self { arr: [
625          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
626          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
627          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
628          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
629        ]}
630      }
631    }
632  }
633
634  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
635  /// the other lane gets chosen. Use `fast_min` for a faster implementation
636  /// that doesn't handle NaNs.
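  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let a = f32x4::from([1.0, f32::NAN, 3.0, -0.5]);
  /// let b = f32x4::from([2.0, 2.0, f32::NAN, -1.0]);
  /// // NaN lanes defer to the other input
  /// assert_eq!(a.min(b), f32x4::from([1.0, 2.0, 3.0, -1.0]));
  /// ```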
637  #[inline]
638  #[must_use]
639  pub fn min(self, rhs: Self) -> Self {
640    pick! {
641      if #[cfg(target_feature="sse")] {
642        // min_m128 seems to do self < rhs ? self : rhs. So if there's any NaN
643        // involved, it chooses rhs, so we need to specifically check rhs for
644        // NaN.
645        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
646      } else if #[cfg(target_feature="simd128")] {
647        // WASM has two min intrinsics:
648        // - min: This propagates NaN, that's the opposite of what we need.
649        // - pmin: This is defined as rhs < self ? rhs : self, which basically
650        //   chooses self if either is NaN.
651        //
652        // pmin is what we want, but we need to specifically check self for NaN.
653        Self {
654          simd: v128_bitselect(
655            rhs.simd,
656            f32x4_pmin(self.simd, rhs.simd),
657            f32x4_ne(self.simd, self.simd), // NaN check
658          )
659        }
660      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
661        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
662      } else {
663        Self { arr: [
664          self.arr[0].min(rhs.arr[0]),
665          self.arr[1].min(rhs.arr[1]),
666          self.arr[2].min(rhs.arr[2]),
667          self.arr[3].min(rhs.arr[3]),
668        ]}
669      }
670    }
671  }
672  #[inline]
673  #[must_use]
674  pub fn is_nan(self) -> Self {
675    pick! {
676      if #[cfg(target_feature="sse")] {
677        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
678      } else if #[cfg(target_feature="simd128")] {
679        Self { simd: f32x4_ne(self.simd, self.simd) }
680      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
681        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
682      } else {
683        Self { arr: [
684          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
685          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
686          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
687          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
688        ]}
689      }
690    }
691  }
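  /// Lanewise finiteness test: a lane is all ones when its value is neither
  /// infinite nor NaN, and all zeros otherwise.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.0, f32::INFINITY, f32::NAN, -2.0]);
  /// assert_eq!(x.is_finite().to_bitmask(), 0b1001);
  /// ```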
692  #[inline]
693  #[must_use]
694  pub fn is_finite(self) -> Self {
695    let shifted_exp_mask = u32x4::from(0xFF000000);
696    let u: u32x4 = cast(self);
697    let shift_u = u << 1_u64;
698    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
699    cast(out)
700  }
701  #[inline]
702  #[must_use]
703  pub fn is_inf(self) -> Self {
704    let shifted_inf = u32x4::from(0xFF000000);
705    let u: u32x4 = cast(self);
706    let shift_u = u << 1_u64;
707    let out = (shift_u).simd_eq(shifted_inf);
708    cast(out)
709  }
710
711  #[inline]
712  #[must_use]
713  pub fn round(self) -> Self {
714    pick! {
715      if #[cfg(target_feature="sse4.1")] {
716        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
717      } else if #[cfg(target_feature="sse2")] {
718        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
719        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
720        let i: i32x4 = cast(mi);
721        let mask: f32x4 = cast(i.simd_eq(i32x4::from(0x80000000_u32 as i32)));
722        mask.blend(self, f)
723      } else if #[cfg(target_feature="simd128")] {
724        Self { simd: f32x4_nearest(self.simd) }
725      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
726        unsafe {Self { neon: vrndnq_f32(self.neon) }}
727      } else {
728        // Note(Lokathor): This software fallback is probably very slow compared
729        // to having a hardware option available, even just the sse2 version is
730        // better than this. Oh well.
731        let to_int = f32x4::from(1.0 / f32::EPSILON);
732        let u: u32x4 = cast(self);
733        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
734        let mut y: f32x4;
735
736        let no_op_magic = i32x4::from(0x7f + 23);
737        let no_op_mask: f32x4 = cast(e.simd_gt(no_op_magic) | e.simd_eq(no_op_magic));
738        let no_op_val: f32x4 = self;
739
740        let zero_magic = i32x4::from(0x7f - 1);
741        let zero_mask: f32x4 = cast(e.simd_lt(zero_magic));
742        let zero_val: f32x4 = self * f32x4::from(0.0);
743
744        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).simd_lt(i32x4::default()));
745        let x: f32x4 = neg_bit.blend(-self, self);
746        y = x + to_int - to_int - x;
747        y = y.simd_gt(f32x4::from(0.5)).blend(
748          y + x - f32x4::from(-1.0),
749          y.simd_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
750        );
751        y = neg_bit.blend(-y, y);
752
753        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
754      }
755    }
756  }
757
758  /// Rounds each lane into an integer. This is a faster implementation than
759  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation-defined behavior.
761  #[inline]
762  #[must_use]
763  pub fn fast_round_int(self) -> i32x4 {
764    pick! {
765      if #[cfg(target_feature="sse2")] {
766        cast(convert_to_i32_m128i_from_m128(self.sse))
767      } else {
768        self.round_int()
769      }
770    }
771  }
772
773  /// Rounds each lane into an integer. This saturates out of range values and
774  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
775  /// doesn't handle out of range values or NaNs.
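  ///
  /// # Examples
  /// ```
  /// # use wide::{f32x4, i32x4};
  /// let x = f32x4::from([1.4, 2.6, -3.5, f32::NAN]);
  /// // rounding is to nearest with ties to even; NaN becomes 0
  /// assert_eq!(x.round_int(), i32x4::from([1, 3, -4, 0]));
  /// ```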
776  #[inline]
777  #[must_use]
778  pub fn round_int(self) -> i32x4 {
779    pick! {
780      if #[cfg(target_feature="sse2")] {
781        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
782        let non_nan_mask = self.simd_eq(self);
783        let non_nan = self & non_nan_mask;
784        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
785        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
786        flip_to_max ^ cast
787      } else if #[cfg(target_feature="simd128")] {
788        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
789      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
790        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
791      } else {
792        let rounded: [f32; 4] = cast(self.round());
793        cast([
794          rounded[0] as i32,
795          rounded[1] as i32,
796          rounded[2] as i32,
797          rounded[3] as i32,
798        ])
799      }
800    }
801  }
802
803  /// Truncates each lane into an integer. This is a faster implementation than
804  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
  /// values you get implementation-defined behavior.
806  #[inline]
807  #[must_use]
808  pub fn fast_trunc_int(self) -> i32x4 {
809    pick! {
810      if #[cfg(target_feature="sse2")] {
811        cast(truncate_m128_to_m128i(self.sse))
812      } else {
813        self.trunc_int()
814      }
815    }
816  }
817
818  /// Truncates each lane into an integer. This saturates out of range values
819  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
820  /// that doesn't handle out of range values or NaNs.
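  ///
  /// # Examples
  /// ```
  /// # use wide::{f32x4, i32x4};
  /// let x = f32x4::from([1.9, -1.9, 2.0, -0.2]);
  /// assert_eq!(x.trunc_int(), i32x4::from([1, -1, 2, 0]));
  /// ```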
821  #[inline]
822  #[must_use]
823  pub fn trunc_int(self) -> i32x4 {
824    pick! {
825      if #[cfg(target_feature="sse2")] {
826        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
827        let non_nan_mask = self.simd_eq(self);
828        let non_nan = self & non_nan_mask;
829        let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
830        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
831        flip_to_max ^ cast
832      } else if #[cfg(target_feature="simd128")] {
833        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
834      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
835        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
836      } else {
837        let n: [f32;4] = cast(self);
838        cast([
839          n[0] as i32,
840          n[1] as i32,
841          n[2] as i32,
842          n[3] as i32,
843        ])
844      }
845    }
846  }
847  /// Performs a multiply-add operation: `self * m + a`
848  ///
849  /// When hardware FMA support is available, this computes the result with a
850  /// single rounding operation. Without FMA support, it falls back to separate
851  /// multiply and add operations with two roundings.
852  ///
853  /// # Platform-specific behavior
854  /// - On `x86`/`x86_64` with FMA: Uses `vfmadd` (single rounding, best
855  ///   accuracy)
856  /// - On ARM64 with NEON: Uses `vfmaq_f32` (single rounding, best accuracy)
857  /// - Without FMA support: Uses `(self * m) + a` (two roundings)
858  ///
859  /// # Examples
860  /// ```
861  /// # use wide::f32x4;
862  /// let a = f32x4::from([1.0, 2.0, 3.0, 4.0]);
863  /// let b = f32x4::from([5.0, 6.0, 7.0, 8.0]);
864  /// let c = f32x4::from([9.0, 10.0, 11.0, 12.0]);
865  ///
866  /// let result = a.mul_add(b, c);
867  ///
868  /// let expected = f32x4::from([14.0, 22.0, 32.0, 44.0]);
869  /// assert_eq!(result, expected);
870  /// ```
871  #[inline]
872  #[must_use]
873  pub fn mul_add(self, m: Self, a: Self) -> Self {
874    pick! {
875      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
876        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
877      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
878        unsafe { Self { neon: vfmaq_f32(a.neon, self.neon, m.neon) } }
879      } else {
880        (self * m) + a
881      }
882    }
883  }
884
885  /// Performs a multiply-subtract operation: `self * m - s`
886  ///
887  /// When hardware FMA support is available, this computes the result with a
888  /// single rounding operation. Without FMA support, it falls back to separate
889  /// multiply and subtract operations with two roundings.
890  ///
891  /// # Platform-specific behavior
892  /// - On `x86`/`x86_64` with FMA: Uses `vfmsub` (single rounding, best
893  ///   accuracy)
894  /// - On ARM64 with NEON: Uses `vfmaq_f32(-s, self, m)` (single rounding, best
895  ///   accuracy)
896  /// - Without FMA support: Uses `(self * m) - s` (two roundings)
897  ///
898  /// # Examples
899  /// ```
900  /// # use wide::f32x4;
901  /// let a = f32x4::from([10.0, 20.0, 30.0, 40.0]);
902  /// let b = f32x4::from([2.0, 3.0, 4.0, 5.0]);
903  /// let c = f32x4::from([5.0, 10.0, 15.0, 20.0]);
904  ///
905  /// let result = a.mul_sub(b, c);
906  ///
907  /// let expected = f32x4::from([15.0, 50.0, 105.0, 180.0]);
908  /// assert_eq!(result, expected);
909  /// ```
910  #[inline]
911  #[must_use]
912  pub fn mul_sub(self, m: Self, s: Self) -> Self {
913    pick! {
914      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
915        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
916      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
917        unsafe { Self { neon: vfmaq_f32(vnegq_f32(s.neon), self.neon, m.neon) } }
918      } else {
919        (self * m) - s
920      }
921    }
922  }
923
924  /// Performs a negative multiply-add operation: `a - (self * m)`
925  ///
926  /// When hardware FMA support is available, this computes the result with a
927  /// single rounding operation. Without FMA support, it falls back to separate
928  /// operations with two roundings.
929  ///
930  /// # Platform-specific behavior
931  /// - On `x86`/`x86_64` with FMA: Uses `vfnmadd` (single rounding, best
932  ///   accuracy)
933  /// - On ARM64 with NEON: Uses `vfmsq_f32` (single rounding, best accuracy)
934  /// - Without FMA support: Uses `a - (self * m)` (two roundings)
935  ///
936  /// # Examples
937  /// ```
938  /// # use wide::f32x4;
939  /// let a = f32x4::from([3.0, 4.0, 5.0, 6.0]);
940  /// let b = f32x4::from([2.0, 2.0, 2.0, 2.0]);
941  /// let c = f32x4::from([10.0, 20.0, 30.0, 40.0]);
942  ///
943  /// let result = a.mul_neg_add(b, c);
944  ///
945  /// let expected = f32x4::from([4.0, 12.0, 20.0, 28.0]);
946  /// assert_eq!(result, expected);
947  /// ```
948  #[inline]
949  #[must_use]
950  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
951    pick! {
952      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
953        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
954      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
955        unsafe { Self { neon: vfmsq_f32(a.neon, self.neon, m.neon) } }
956      } else {
957        a - (self * m)
958      }
959    }
960  }
961
962  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
963  ///
964  /// When hardware FMA support is available, this computes the result with a
965  /// single rounding operation. Without FMA support, it falls back to separate
966  /// operations with two roundings.
967  ///
968  /// # Platform-specific behavior
969  /// - On `x86`/`x86_64` with FMA: Uses `vfnmsub` (single rounding, best
970  ///   accuracy)
971  /// - On ARM64 with NEON: Uses `-(vfmaq_f32(s, self, m))` (single rounding,
972  ///   best accuracy)
973  /// - Without FMA support: Uses `-(self * m) - s` (two roundings)
974  ///
975  /// # Examples
976  /// ```
977  /// # use wide::f32x4;
978  /// let a = f32x4::from([3.0, 4.0, 5.0, 6.0]);
979  /// let b = f32x4::from([2.0, 2.0, 2.0, 2.0]);
980  /// let c = f32x4::from([1.0, 2.0, 3.0, 4.0]);
981  ///
982  /// let result = a.mul_neg_sub(b, c);
983  ///
984  /// let expected = f32x4::from([-7.0, -10.0, -13.0, -16.0]);
985  /// assert_eq!(result, expected);
986  /// ```
987  #[inline]
988  #[must_use]
989  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
990    pick! {
991      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
992        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, s.sse) }
993      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
994        unsafe { Self { neon: vnegq_f32(vfmaq_f32(s.neon, self.neon, m.neon)) } }
995      } else {
996        -(self * m) - s
997      }
998    }
999  }
1000
1001  #[inline]
1002  #[must_use]
1003  pub fn flip_signs(self, signs: Self) -> Self {
1004    self ^ (signs & Self::from(-0.0))
1005  }
1006
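  /// Lanewise `copysign`: the magnitude of `self` combined with the sign bit
  /// of `sign`.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let m = f32x4::from([1.0, -2.0, 3.0, -4.0]);
  /// let s = f32x4::from([-1.0, 1.0, -0.0, 10.0]);
  /// assert_eq!(m.copysign(s), f32x4::from([-1.0, 2.0, -3.0, 4.0]));
  /// ```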
1007  #[inline]
1008  #[must_use]
1009  pub fn copysign(self, sign: Self) -> Self {
1010    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
1011    (self & magnitude_mask) | (sign & Self::from(-0.0))
1012  }
1013
1014  #[inline]
1015  pub fn asin_acos(self) -> (Self, Self) {
1016    // Based on the Agner Fog "vector class library":
1017    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1018    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1019    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1020    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1021    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1022    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1023
1024    let xa = self.abs();
1025    let big = xa.simd_ge(f32x4::splat(0.5));
1026
1027    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1028    let x2 = xa * xa;
1029    let x3 = big.blend(x1, x2);
1030
1031    let xb = x1.sqrt();
1032
1033    let x4 = big.blend(xb, xa);
1034
1035    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1036    let z = z.mul_add(x3 * x4, x4);
1037
1038    let z1 = z + z;
1039
1040    // acos
1041    let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1042    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1043    let acos = big.blend(z3, z4);
1044
1045    // asin
1046    let z3 = f32x4::FRAC_PI_2 - z1;
1047    let asin = big.blend(z3, z);
1048    let asin = asin.flip_signs(self);
1049
1050    (asin, acos)
1051  }
1052
1053  #[inline]
1054  pub fn asin(self) -> Self {
1055    // Based on the Agner Fog "vector class library":
1056    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1057    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1058    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1059    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1060    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1061    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1062
1063    let xa = self.abs();
1064    let big = xa.simd_ge(f32x4::splat(0.5));
1065
1066    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1067    let x2 = xa * xa;
1068    let x3 = big.blend(x1, x2);
1069
1070    let xb = x1.sqrt();
1071
1072    let x4 = big.blend(xb, xa);
1073
1074    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1075    let z = z.mul_add(x3 * x4, x4);
1076
1077    let z1 = z + z;
1078
1079    // asin
1080    let z3 = f32x4::FRAC_PI_2 - z1;
1081    let asin = big.blend(z3, z);
1082    let asin = asin.flip_signs(self);
1083
1084    asin
1085  }
1086
1087  #[inline]
1088  #[must_use]
1089  pub fn acos(self) -> Self {
1090    // Based on the Agner Fog "vector class library":
1091    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1092    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1093    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1094    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1095    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1096    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1097
1098    let xa = self.abs();
1099    let big = xa.simd_ge(f32x4::splat(0.5));
1100
1101    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1102    let x2 = xa * xa;
1103    let x3 = big.blend(x1, x2);
1104
1105    let xb = x1.sqrt();
1106
1107    let x4 = big.blend(xb, xa);
1108
1109    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1110    let z = z.mul_add(x3 * x4, x4);
1111
1112    let z1 = z + z;
1113
1114    // acos
1115    let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1116    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1117    let acos = big.blend(z3, z4);
1118
1119    acos
1120  }
1121
1122  #[inline]
1123  pub fn atan(self) -> Self {
1124    // Based on the Agner Fog "vector class library":
1125    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1126    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1127    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1128    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1129    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1130
1131    let t = self.abs();
1132
1133    // small:  z = t / 1.0;
1134    // medium: z = (t-1.0) / (t+1.0);
1135    // big:    z = -1.0 / t;
1136    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1137    let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
1138
1139    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1140    s = notsmal & s;
1141
1142    let mut a = notbig & t;
1143    a = notsmal.blend(a - Self::ONE, a);
1144    let mut b = notbig & Self::ONE;
1145    b = notsmal.blend(b + t, b);
1146    let z = a / b;
1147
1148    let zz = z * z;
1149
1150    // Taylor expansion
1151    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1152    re = re.mul_add(zz * z, z) + s;
1153
1154    // get sign bit
1155    re = (self.sign_bit()).blend(-re, re);
1156
1157    re
1158  }
1159
1160  #[inline]
1161  pub fn atan2(self, x: Self) -> Self {
1162    // Based on the Agner Fog "vector class library":
1163    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1164    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1165    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1166    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1167    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1168
1169    let y = self;
1170
1171    // move in first octant
1172    let x1 = x.abs();
1173    let y1 = y.abs();
1174    let swapxy = y1.simd_gt(x1);
1175    // swap x and y if y1 > x1
1176    let mut x2 = swapxy.blend(y1, x1);
1177    let mut y2 = swapxy.blend(x1, y1);
1178
1179    // check for special case: x and y are both +/- INF
1180    let both_infinite = x.is_inf() & y.is_inf();
1181    if both_infinite.any() {
1182      let minus_one = -Self::ONE;
1183      x2 = both_infinite.blend(x2 & minus_one, x2);
1184      y2 = both_infinite.blend(y2 & minus_one, y2);
1185    }
1186
1187    // x = y = 0 will produce NAN. No problem, fixed below
1188    let t = y2 / x2;
1189
1190    // small:  z = t / 1.0;
1191    // medium: z = (t-1.0) / (t+1.0);
1192    let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1193
1194    let a = notsmal.blend(t - Self::ONE, t);
1195    let b = notsmal.blend(t + Self::ONE, Self::ONE);
1196    let s = notsmal & Self::FRAC_PI_4;
1197    let z = a / b;
1198
1199    let zz = z * z;
1200
1201    // Taylor expansion
1202    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1203    re = re.mul_add(zz * z, z) + s;
1204
1205    // move back in place
1206    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1207    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1208    re = (x.sign_bit()).blend(Self::PI - re, re);
1209
1210    // get sign bit
1211    re = (y.sign_bit()).blend(-re, re);
1212
1213    re
1214  }
1215
1216  #[inline]
1217  #[must_use]
1218  pub fn sin_cos(self) -> (Self, Self) {
1219    // Based on the Agner Fog "vector class library":
1220    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1221
1222    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
1223    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1224    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1225
1226    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
1227    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
1228    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);
1229
1230    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
1231    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
1232    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);
1233
1234    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1235
1236    let xa = self.abs();
1237
1238    // Find quadrant
1239    let y = (xa * TWO_OVER_PI).round();
1240    let q: i32x4 = y.round_int();
1241
1242    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1243
1244    let x2 = x * x;
1245    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1246    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1247      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));
1248
1249    let swap = !(q & i32x4::from(1)).simd_eq(i32x4::from(0));
1250
1251    let mut overflow: f32x4 = cast(q.simd_gt(i32x4::from(0x2000000)));
1252    overflow &= xa.is_finite();
1253    s = overflow.blend(f32x4::from(0.0), s);
1254    c = overflow.blend(f32x4::from(1.0), c);
1255
1256    // calc sin
1257    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
1258    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
1259    sin1 = sin1.flip_signs(cast(sign_sin));
1260
1261    // calc cos
1262    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
1263    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
1264    cos1 ^= cast::<_, f32x4>(sign_cos);
1265
1266    (sin1, cos1)
1267  }
1268
1269  #[inline]
1270  #[must_use]
1271  pub fn sin(self) -> Self {
1272    let (s, _) = self.sin_cos();
1273    s
1274  }
1275  #[inline]
1276  #[must_use]
1277  pub fn cos(self) -> Self {
1278    let (_, c) = self.sin_cos();
1279    c
1280  }
1281  #[inline]
1282  #[must_use]
1283  pub fn tan(self) -> Self {
1284    let (s, c) = self.sin_cos();
1285    s / c
1286  }
1287  #[inline]
1288  #[must_use]
1289  pub fn to_degrees(self) -> Self {
1290    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1291    self * RAD_TO_DEG_RATIO
1292  }
1293  #[inline]
1294  #[must_use]
1295  pub fn to_radians(self) -> Self {
1296    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1297    self * DEG_TO_RAD_RATIO
1298  }
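  /// Lanewise approximate reciprocal (`1.0 / self`).
  ///
  /// Note: the SSE path uses the hardware reciprocal approximation
  /// (`reciprocal_m128`), which only gives roughly 12 bits of precision; the
  /// other backends perform a full division, so results can differ slightly
  /// between platforms.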
1299  #[inline]
1300  #[must_use]
1301  pub fn recip(self) -> Self {
1302    pick! {
1303      if #[cfg(target_feature="sse")] {
1304        Self { sse: reciprocal_m128(self.sse) }
1305      } else if #[cfg(target_feature="simd128")] {
1306        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1307      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1308        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1309      } else {
1310        Self { arr: [
1311          1.0 / self.arr[0],
1312          1.0 / self.arr[1],
1313          1.0 / self.arr[2],
1314          1.0 / self.arr[3],
1315        ]}
1316      }
1317    }
1318  }
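  /// Lanewise approximate reciprocal square root (`1.0 / self.sqrt()`).
  ///
  /// Note: as with `recip`, the SSE path uses a hardware approximation
  /// (`reciprocal_sqrt_m128`) with roughly 12 bits of precision, while the
  /// other backends compute a full `1.0 / sqrt(x)`.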
1319  #[inline]
1320  #[must_use]
1321  pub fn recip_sqrt(self) -> Self {
1322    pick! {
1323      if #[cfg(target_feature="sse")] {
1324        Self { sse: reciprocal_sqrt_m128(self.sse) }
1325      } else if #[cfg(target_feature="simd128")] {
1326        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1327      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1328        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1329      } else if #[cfg(feature="std")] {
1330        Self { arr: [
1331          1.0 / self.arr[0].sqrt(),
1332          1.0 / self.arr[1].sqrt(),
1333          1.0 / self.arr[2].sqrt(),
1334          1.0 / self.arr[3].sqrt(),
1335        ]}
1336      } else {
1337        Self { arr: [
1338          1.0 / software_sqrt(self.arr[0] as f64) as f32,
1339          1.0 / software_sqrt(self.arr[1] as f64) as f32,
1340          1.0 / software_sqrt(self.arr[2] as f64) as f32,
1341          1.0 / software_sqrt(self.arr[3] as f64) as f32,
1342        ]}
1343      }
1344    }
1345  }
1346  #[inline]
1347  #[must_use]
1348  pub fn sqrt(self) -> Self {
1349    pick! {
1350      if #[cfg(target_feature="sse")] {
1351        Self { sse: sqrt_m128(self.sse) }
1352      } else if #[cfg(target_feature="simd128")] {
1353        Self { simd: f32x4_sqrt(self.simd) }
1354      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1355        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1356      } else if #[cfg(feature="std")] {
1357        Self { arr: [
1358          self.arr[0].sqrt(),
1359          self.arr[1].sqrt(),
1360          self.arr[2].sqrt(),
1361          self.arr[3].sqrt(),
1362        ]}
1363      } else {
1364        Self { arr: [
1365          software_sqrt(self.arr[0] as f64) as f32,
1366          software_sqrt(self.arr[1] as f64) as f32,
1367          software_sqrt(self.arr[2] as f64) as f32,
1368          software_sqrt(self.arr[3] as f64) as f32,
1369        ]}
1370      }
1371    }
1372  }
1373
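  /// Packs the sign bit (bit 31) of each lane into the low four bits of a
  /// `u32`, with lane 0 in bit 0. The comparison and `is_*` methods set every
  /// bit of a matching lane, so on their output this yields one bit per true
  /// lane.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([-1.0, 2.0, -3.0, 4.0]);
  /// assert_eq!(x.to_bitmask(), 0b0101);
  /// ```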
1374  #[inline]
1375  #[must_use]
1376  pub fn to_bitmask(self) -> u32 {
1377    pick! {
1378      if #[cfg(target_feature="sse")] {
1379        move_mask_m128(self.sse) as u32
1380      } else if #[cfg(target_feature="simd128")] {
1381        u32x4_bitmask(self.simd) as u32
1382      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1383        unsafe
1384        {
1385          // set all to 1 if top bit is set, else 0
1386          let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1387
1388          // select the right bit out of each lane
1389          let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
1390          let r = vandq_u32(masked, selectbit);
1391
          // horizontally add the 32-bit lanes
1393          vaddvq_u32(r) as u32
1394        }
1395      } else {
1396        (((self.arr[0].to_bits() as i32) < 0) as u32) << 0 |
1397        (((self.arr[1].to_bits() as i32) < 0) as u32) << 1 |
1398        (((self.arr[2].to_bits() as i32) < 0) as u32) << 2 |
1399        (((self.arr[3].to_bits() as i32) < 0) as u32) << 3
1400      }
1401    }
1402  }
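  /// Returns `true` if any lane of the mask is set. Intended for the
  /// all-ones/all-zeros lane masks produced by the comparison and `is_*`
  /// methods (see also `all` and `none`); for arbitrary values the backends
  /// may disagree.
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.0, f32::NAN, 3.0, 4.0]);
  /// assert!(x.is_nan().any());
  /// assert!(!x.is_nan().all());
  /// ```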
1403  #[inline]
1404  #[must_use]
1405  pub fn any(self) -> bool {
1406    pick! {
1407      if #[cfg(target_feature="simd128")] {
1408        v128_any_true(self.simd)
1409      } else {
1410        self.to_bitmask() != 0
1411      }
1412    }
1413  }
1414  #[inline]
1415  #[must_use]
1416  pub fn all(self) -> bool {
1417    pick! {
1418      if #[cfg(target_feature="simd128")] {
1419        u32x4_all_true(self.simd)
1420      } else {
1421        // four lanes
1422        self.to_bitmask() == 0b1111
1423      }
1424    }
1425  }
1426  #[inline]
1427  #[must_use]
1428  pub fn none(self) -> bool {
1429    !self.any()
1430  }
1431
1432  #[inline]
1433  fn vm_pow2n(self) -> Self {
1434    const_f32_as_f32x4!(pow2_23, 8388608.0);
1435    const_f32_as_f32x4!(bias, 127.0);
1436    let a = self + (bias + pow2_23);
1437    let c = cast::<_, i32x4>(a) << 23;
1438    cast::<_, f32x4>(c)
1439  }
1440
  /// Calculates the exponential function `e^x` for each lane of the vector.
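  ///
  /// The result is a polynomial approximation with roughly single-precision
  /// accuracy, so compare against a tolerance rather than for exact equality:
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let e = core::f32::consts::E;
  /// let x = f32x4::from([0.0, 1.0, -1.0, 2.0]);
  /// let expected = f32x4::from([1.0, e, 1.0 / e, e * e]);
  /// assert!((x.exp() - expected).abs().reduce_add() < 1e-4);
  /// ```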
1442  #[inline]
1443  #[must_use]
1444  pub fn exp(self) -> Self {
1445    const_f32_as_f32x4!(P0, 1.0 / 2.0);
1446    const_f32_as_f32x4!(P1, 1.0 / 6.0);
1447    const_f32_as_f32x4!(P2, 1. / 24.);
1448    const_f32_as_f32x4!(P3, 1. / 120.);
1449    const_f32_as_f32x4!(P4, 1. / 720.);
1450    const_f32_as_f32x4!(P5, 1. / 5040.);
1451    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1452    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1453    let max_x = f32x4::from(87.3);
1454    let r = (self * Self::LOG2_E).round();
1455    let x = r.mul_neg_add(LN2D_HI, self);
1456    let x = r.mul_neg_add(LN2D_LO, x);
1457    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1458    let x2 = x * x;
1459    let z = z.mul_add(x2, x);
1460    let n2 = Self::vm_pow2n(r);
1461    let z = (z + Self::ONE) * n2;
1462    // check for overflow
1463    let in_range = self.abs().simd_lt(max_x);
1464    let in_range = in_range & self.is_finite();
1465    in_range.blend(z, Self::ZERO)
1466  }
1467
1468  #[inline]
1469  fn exponent(self) -> f32x4 {
1470    const_f32_as_f32x4!(pow2_23, 8388608.0);
1471    const_f32_as_f32x4!(bias, 127.0);
1472    let a = cast::<_, u32x4>(self);
1473    let b = a >> 23;
1474    let c = b | cast::<_, u32x4>(pow2_23);
1475    let d = cast::<_, f32x4>(c);
1476    let e = d - (pow2_23 + bias);
1477    e
1478  }
1479
1480  #[inline]
1481  fn fraction_2(self) -> Self {
1482    let t1 = cast::<_, u32x4>(self);
1483    let t2 = cast::<_, u32x4>(
1484      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1485    );
1486    cast::<_, f32x4>(t2)
1487  }
1488  #[inline]
1489  fn is_zero_or_subnormal(self) -> Self {
1490    let t = cast::<_, i32x4>(self);
1491    let t = t & i32x4::splat(0x7F800000);
1492    i32x4::round_float(t.simd_eq(i32x4::splat(0)))
1493  }
1494  #[inline]
1495  fn infinity() -> Self {
1496    cast::<_, f32x4>(i32x4::splat(0x7F800000))
1497  }
1498  #[inline]
1499  fn nan_log() -> Self {
1500    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1501  }
1502  #[inline]
1503  fn nan_pow() -> Self {
1504    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1505  }
1506  #[inline]
1507  pub fn sign_bit(self) -> Self {
1508    let t1 = cast::<_, i32x4>(self);
1509    let t2 = t1 >> 31;
1510    !cast::<_, f32x4>(t2).simd_eq(f32x4::ZERO)
1511  }
1512
  /// Horizontal add: sums all four lanes of the vector into a single `f32`.
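  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.0, 2.0, 3.0, 4.0]);
  /// assert_eq!(x.reduce_add(), 10.0);
  /// ```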
1514  #[inline]
1515  #[must_use]
1516  pub fn reduce_add(self) -> f32 {
1517    let arr: [f32; 4] = cast(self);
1518    arr.iter().sum()
1519  }
1520
1521  /// Natural log (ln(x))
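  ///
  /// The result is a polynomial approximation with roughly single-precision
  /// accuracy, so compare against a tolerance rather than for exact equality:
  ///
  /// # Examples
  /// ```
  /// # use wide::f32x4;
  /// let x = f32x4::from([1.0, core::f32::consts::E, 10.0, 0.5]);
  /// let expected =
  ///   f32x4::from([0.0, 1.0, core::f32::consts::LN_10, -core::f32::consts::LN_2]);
  /// assert!((x.ln() - expected).abs().reduce_add() < 1e-4);
  /// ```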
1522  #[inline]
1523  #[must_use]
1524  pub fn ln(self) -> Self {
1525    const_f32_as_f32x4!(HALF, 0.5);
1526    const_f32_as_f32x4!(P0, 3.3333331174E-1);
1527    const_f32_as_f32x4!(P1, -2.4999993993E-1);
1528    const_f32_as_f32x4!(P2, 2.0000714765E-1);
1529    const_f32_as_f32x4!(P3, -1.6668057665E-1);
1530    const_f32_as_f32x4!(P4, 1.4249322787E-1);
1531    const_f32_as_f32x4!(P5, -1.2420140846E-1);
1532    const_f32_as_f32x4!(P6, 1.1676998740E-1);
1533    const_f32_as_f32x4!(P7, -1.1514610310E-1);
1534    const_f32_as_f32x4!(P8, 7.0376836292E-2);
1535    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1536    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1537    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1538
1539    let x1 = self;
1540    let x = Self::fraction_2(x1);
1541    let e = Self::exponent(x1);
1542    let mask = x.simd_gt(Self::SQRT_2 * HALF);
1543    let x = (!mask).blend(x + x, x);
1544    let fe = mask.blend(e + Self::ONE, e);
1545    let x = x - Self::ONE;
1546    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1547    let x2 = x * x;
1548    let res = x2 * x * res;
1549    let res = fe.mul_add(LN2F_LO, res);
1550    let res = res + x2.mul_neg_add(HALF, x);
1551    let res = fe.mul_add(LN2F_HI, res);
1552    let overflow = !self.is_finite();
1553    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1554    let mask = overflow | underflow;
1555    if !mask.any() {
1556      res
1557    } else {
1558      let is_zero = self.is_zero_or_subnormal();
1559      let res = underflow.blend(Self::nan_log(), res);
1560      let res = is_zero.blend(Self::infinity(), res);
1561      let res = overflow.blend(self, res);
1562      res
1563    }
1564  }
1565
1566  #[inline]
1567  #[must_use]
1568  pub fn log2(self) -> Self {
1569    Self::ln(self) * Self::LOG2_E
1570  }
1571  #[inline]
1572  #[must_use]
1573  pub fn log10(self) -> Self {
1574    Self::ln(self) * Self::LOG10_E
1575  }
1576
  #[inline]
  #[must_use]
  pub fn pow_f32x4(self, y: f32x4) -> Self {
    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);

    const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);

    let x1 = self.abs();
    let x = x1.fraction_2();

    let mask = x.simd_gt(f32x4::SQRT_2 * f32x4::HALF);
    let x = (!mask).blend(x + x, x);

    let x = x - f32x4::ONE;
    let x2 = x * x;
    let lg1 = polynomial_8!(
      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
    );
    let lg1 = lg1 * x2 * x;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f32x4::ONE, ef);

    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f32x4::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2f_hi);
    let v = e2.mul_neg_add(ln2f_lo, v);
    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);

    let x = v;
    let e3 = (x * f32x4::LOG2_E).round();
    let x = e3.mul_neg_add(f32x4::LN_2, x);
    let x2 = x * x;
    let z = x2.mul_add(
      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
      x + f32x4::ONE,
    );

    let ee = e1 + e2 + e3;
    let ei = cast::<_, i32x4>(ee.round_int());
    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));

    let overflow = cast::<_, f32x4>(ej.simd_gt(i32x4::splat(0x0FF)))
      | (ee.simd_gt(f32x4::splat(300.0)));
    let underflow = cast::<_, f32x4>(ej.simd_lt(i32x4::splat(0x000)))
      | (ee.simd_lt(f32x4::splat(-300.0)));

    // Add exponent by integer addition
    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));

    // Check for overflow/underflow
    let z = if (overflow | underflow).any() {
      let z = underflow.blend(f32x4::ZERO, z);
      overflow.blend(Self::infinity(), z)
    } else {
      z
    };

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.simd_lt(f32x4::ZERO).blend(
        Self::infinity(),
        y.simd_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();
    let z = if x_sign.any() {
      // Is y an integer?
      let yi = y.simd_eq(y.round());
      // Is y odd?
      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();

      let z1 =
        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();
    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }

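  /// Lanewise `self` raised to the scalar power `y`; shorthand for
  /// `pow_f32x4` with `y` splatted across all lanes.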
  #[inline]
  pub fn powf(self, y: f32) -> Self {
    Self::pow_f32x4(self, f32x4::splat(y))
  }

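  /// Interleaves the low halves of `self` and `b`:
  /// `[self[0], b[0], self[1], b[1]]`.
  ///
  /// A minimal usage sketch:
  ///
  /// ```
  /// use wide::f32x4;
  /// let a = f32x4::new([1.0, 2.0, 3.0, 4.0]);
  /// let b = f32x4::new([5.0, 6.0, 7.0, 8.0]);
  /// assert_eq!(a.unpack_lo(b).to_array(), [1.0, 5.0, 2.0, 6.0]);
  /// ```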
  #[must_use]
  #[inline]
  pub fn unpack_lo(self, b: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: unpack_low_m128(self.sse, b.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: u32x4_shuffle::<0, 4, 1, 5>(self.simd, b.simd)
        }
      } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))]{
        unsafe {Self { neon: vzip1q_f32(self.neon, b.neon) }}
      } else {
        Self { arr: [
          self.arr[0],
          b.arr[0],
          self.arr[1],
          b.arr[1],
        ]}
      }
    }
  }

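  /// Interleaves the high halves of `self` and `b`:
  /// `[self[2], b[2], self[3], b[3]]`.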
  #[must_use]
  #[inline]
  pub fn unpack_hi(self, b: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse")] {
        Self { sse: unpack_high_m128(self.sse, b.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self {
          simd: u32x4_shuffle::<2, 6, 3, 7>(self.simd, b.simd)
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vzip2q_f32(self.neon, b.neon) }}
      } else {
        Self { arr: [
          self.arr[2],
          b.arr[2],
          self.arr[3],
          b.arr[3],
        ]}
      }
    }
  }

  /// Transposes a 4x4 matrix of `f32`s given as four `f32x4` rows. SSE uses a
  /// dedicated shuffle sequence; NEON and simd128 use the unpack helpers
  /// above; other targets fall back to scalar indexing.
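  ///
  /// A minimal usage sketch:
  ///
  /// ```
  /// use wide::f32x4;
  /// let rows = [
  ///   f32x4::new([1.0, 2.0, 3.0, 4.0]),
  ///   f32x4::new([5.0, 6.0, 7.0, 8.0]),
  ///   f32x4::new([9.0, 10.0, 11.0, 12.0]),
  ///   f32x4::new([13.0, 14.0, 15.0, 16.0]),
  /// ];
  /// let cols = f32x4::transpose(rows);
  /// assert_eq!(cols[0].to_array(), [1.0, 5.0, 9.0, 13.0]);
  /// assert_eq!(cols[3].to_array(), [4.0, 8.0, 12.0, 16.0]);
  /// ```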
  #[must_use]
  #[inline]
  pub fn transpose(data: [f32x4; 4]) -> [f32x4; 4] {
    pick! {
      if #[cfg(target_feature="sse")] {
        let mut e0 = data[0];
        let mut e1 = data[1];
        let mut e2 = data[2];
        let mut e3 = data[3];

        transpose_four_m128(&mut e0.sse, &mut e1.sse, &mut e2.sse, &mut e3.sse);

        [e0, e1, e2, e3]
      } else if #[cfg(any(all(target_feature="neon",target_arch="aarch64"), target_feature="simd128"))] {
        let a = data[0].unpack_lo(data[2]);
        let b = data[1].unpack_lo(data[3]);
        let c = data[0].unpack_hi(data[2]);
        let d = data[1].unpack_hi(data[3]);

        [
          a.unpack_lo(b),
          a.unpack_hi(b),
          c.unpack_lo(d),
          c.unpack_hi(d),
        ]
      } else {
        #[inline(always)]
        fn transpose_column(data: &[f32x4; 4], index: usize) -> f32x4 {
          f32x4::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [f32; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[f32; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [f32; 4] {
    cast_mut(self)
  }

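  /// Converts each `i32` lane to `f32` (an `as` cast per lane, rounding to
  /// the nearest representable value).
  ///
  /// A minimal usage sketch (using the `i32x4::splat` constructor seen
  /// elsewhere in this file):
  ///
  /// ```
  /// use wide::{f32x4, i32x4};
  /// let v = f32x4::from_i32x4(i32x4::splat(7));
  /// assert_eq!(v.to_array(), [7.0, 7.0, 7.0, 7.0]);
  /// ```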
  #[inline]
  pub fn from_i32x4(v: i32x4) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: f32x4_convert_i32x4(v.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
      } else {
        Self { arr: [
            v.as_array()[0] as f32,
            v.as_array()[1] as f32,
            v.as_array()[2] as f32,
            v.as_array()[3] as f32,
          ] }
      }
    }
  }
}