
wide/f64x4_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f64x4 { pub(crate) avx: m256d }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(32))]
    pub struct f64x4 { pub(crate) a: f64x2, pub(crate) b: f64x2 }
  }
}

macro_rules! const_f64_as_f64x4 {
  ($i:ident, $f:expr) => {
    #[allow(non_upper_case_globals)]
    pub const $i: f64x4 = f64x4::new([$f; 4]);
  };
}

impl f64x4 {
  const_f64_as_f64x4!(ONE, 1.0);
  const_f64_as_f64x4!(ZERO, 0.0);
  const_f64_as_f64x4!(HALF, 0.5);
  const_f64_as_f64x4!(E, core::f64::consts::E);
  const_f64_as_f64x4!(FRAC_1_PI, core::f64::consts::FRAC_1_PI);
  const_f64_as_f64x4!(FRAC_2_PI, core::f64::consts::FRAC_2_PI);
  const_f64_as_f64x4!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI);
  const_f64_as_f64x4!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2);
  const_f64_as_f64x4!(FRAC_PI_2, core::f64::consts::FRAC_PI_2);
  const_f64_as_f64x4!(FRAC_PI_3, core::f64::consts::FRAC_PI_3);
  const_f64_as_f64x4!(FRAC_PI_4, core::f64::consts::FRAC_PI_4);
  const_f64_as_f64x4!(FRAC_PI_6, core::f64::consts::FRAC_PI_6);
  const_f64_as_f64x4!(FRAC_PI_8, core::f64::consts::FRAC_PI_8);
  const_f64_as_f64x4!(LN_2, core::f64::consts::LN_2);
  const_f64_as_f64x4!(LN_10, core::f64::consts::LN_10);
  const_f64_as_f64x4!(LOG2_E, core::f64::consts::LOG2_E);
  const_f64_as_f64x4!(LOG10_E, core::f64::consts::LOG10_E);
  const_f64_as_f64x4!(LOG10_2, core::f64::consts::LOG10_2);
  const_f64_as_f64x4!(LOG2_10, core::f64::consts::LOG2_10);
  const_f64_as_f64x4!(PI, core::f64::consts::PI);
  const_f64_as_f64x4!(SQRT_2, core::f64::consts::SQRT_2);
  const_f64_as_f64x4!(TAU, core::f64::consts::TAU);
}

unsafe impl Zeroable for f64x4 {}
unsafe impl Pod for f64x4 {}

impl AlignTo for f64x4 {
  type Elem = f64;
}

impl Add for f64x4 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: add_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for f64x4 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sub_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for f64x4 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: mul_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Div for f64x4 {
  type Output = Self;
  #[inline]
  fn div(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: div_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.div(rhs.a),
          b : self.b.div(rhs.b),
        }
      }
    }
  }
}

impl Add<f64> for f64x4 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: f64) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<f64> for f64x4 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: f64) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<f64> for f64x4 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: f64) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Div<f64> for f64x4 {
  type Output = Self;
  #[inline]
  fn div(self, rhs: f64) -> Self::Output {
    self.div(Self::splat(rhs))
  }
}

impl Add<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  fn add(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).add(rhs)
  }
}

impl Sub<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  fn sub(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).sub(rhs)
  }
}

impl Mul<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  fn mul(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).mul(rhs)
  }
}

impl Div<f64x4> for f64 {
  type Output = f64x4;
  #[inline]
  fn div(self, rhs: f64x4) -> Self::Output {
    f64x4::splat(self).div(rhs)
  }
}

impl BitAnd for f64x4 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitand_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for f64x4 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitor_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for f64x4 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: bitxor_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

impl CmpEq for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_eq(rhs.a),
          b : self.b.simd_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGe for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_ge(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ge(rhs.a),
          b : self.b.simd_ge(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_gt(rhs.a),
          b : self.b.simd_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpNe for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_ne(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_ne(rhs.a),
          b : self.b.simd_ne(rhs.b),
        }
      }
    }
  }
}

impl CmpLe for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_le(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_le(rhs.a),
          b : self.b.simd_le(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for f64x4 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx")]{
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.simd_lt(rhs.a),
          b : self.b.simd_lt(rhs.b),
        }
      }
    }
  }
}

impl f64x4 {
  #[inline]
  #[must_use]
  pub const fn new(array: [f64; 4]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
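  /// Lanewise select: treats `self` as a mask (as produced by the comparison
  /// methods or `is_nan`) and picks each lane from `t` where the mask is set,
  /// otherwise from `f`.
  ///
  /// # Examples
  /// A minimal sketch, assuming a mask built with `is_nan`:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([1.0, f64::NAN, 3.0, f64::NAN]);
  /// let cleaned = x.is_nan().blend(f64x4::from(0.0), x);
  /// assert_eq!(cleaned, f64x4::from([1.0, 0.0, 3.0, 0.0]));
  /// ```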
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: blend_varying_m256d(f.avx, t.avx, self.avx) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }

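  /// Lanewise absolute value.
  ///
  /// # Examples
  /// A minimal sketch with values that are exact in `f64`:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([-1.5, 2.0, -0.25, 4.0]);
  /// assert_eq!(x.abs(), f64x4::from([1.5, 2.0, 0.25, 4.0]));
  /// ```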
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        let non_sign_bits = f64x4::from(f64::from_bits(i64::MAX as u64));
        self & non_sign_bits
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn floor(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: floor_m256d(self.avx) }
      } else {
        Self {
          a : self.a.floor(),
          b : self.b.floor(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn ceil(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: ceil_m256d(self.avx) }
      } else {
        Self {
          a : self.a.ceil(),
          b : self.b.ceil(),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. This is a faster
  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: max_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_max(rhs.a),
          b : self.b.fast_max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise maximum of both vectors. If one of the two lanes
  /// is NaN, the other lane's value is chosen. Use `fast_max` for a faster
  /// implementation that doesn't handle NaNs.
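  ///
  /// # Examples
  /// An illustrative sketch of the NaN handling described above:
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([1.0, f64::NAN, 5.0, -2.0]);
  /// let b = f64x4::from([3.0; 4]);
  /// assert_eq!(a.max(b), f64x4::from([3.0, 3.0, 5.0, 3.0]));
  /// ```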
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // max_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: max_m256d(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. This is a faster
  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
  /// involved.
  #[inline]
  #[must_use]
  pub fn fast_min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: min_m256d(self.avx, rhs.avx) }
      } else {
        Self {
          a : self.a.fast_min(rhs.a),
          b : self.b.fast_min(rhs.b),
        }
      }
    }
  }

  /// Calculates the lanewise minimum of both vectors. If one of the two lanes
  /// is NaN, the other lane's value is chosen. Use `fast_min` for a faster
  /// implementation that doesn't handle NaNs.
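  ///
  /// # Examples
  /// An illustrative sketch of the NaN handling described above:
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([1.0, f64::NAN, 5.0, -2.0]);
  /// let b = f64x4::from([3.0; 4]);
  /// assert_eq!(a.min(b), f64x4::from([1.0, 3.0, 3.0, -2.0]));
  /// ```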
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        // min_m256d seems to do rhs < self ? self : rhs. So if there's any NaN
        // involved, it chooses rhs, so we need to specifically check rhs for
        // NaN.
        rhs.is_nan().blend(self, Self { avx: min_m256d(self.avx, rhs.avx) })
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn is_nan(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: cmp_op_mask_m256d::<{cmp_op!(Unordered)}>(self.avx, self.avx) }
      } else {
        Self {
          a : self.a.is_nan(),
          b : self.b.is_nan(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn is_finite(self) -> Self {
    let shifted_exp_mask = u64x4::from(0xFFE0000000000000);
    let u: u64x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
    cast(out)
  }

  #[inline]
  #[must_use]
  pub fn is_inf(self) -> Self {
    let shifted_inf = u64x4::from(0xFFE0000000000000);
    let u: u64x4 = cast(self);
    let shift_u = u << 1_u64;
    let out = (shift_u).simd_eq(shifted_inf);
    cast(out)
  }

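  /// Rounds each lane to the nearest integer value.
  ///
  /// # Examples
  /// A minimal sketch; exact `.5` ties are avoided here because their
  /// rounding direction can differ by platform:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([1.2, 2.7, -3.4, 4.0]);
  /// assert_eq!(x.round(), f64x4::from([1.0, 3.0, -3.0, 4.0]));
  /// ```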
  #[inline]
  #[must_use]
  pub fn round(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: round_m256d::<{round_op!(Nearest)}>(self.avx) }
      } else {
        Self {
          a : self.a.round(),
          b : self.b.round(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn round_int(self) -> i64x4 {
    // NOTE: No optimization for this is currently available, so delegate to LLVM.
    let rounded: [f64; 4] = cast(self.round());
    cast([
      rounded[0] as i64,
      rounded[1] as i64,
      rounded[2] as i64,
      rounded[3] as i64,
    ])
  }

  /// Performs a multiply-add operation: `self * m + a`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and add operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfmadd` (single rounding,
  ///   best accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) + a` (two roundings)
  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([1.0, 2.0, 3.0, 4.0]);
  /// let b = f64x4::from([2.0; 4]);
  /// let c = f64x4::from([10.0; 4]);
  ///
  /// let result = a.mul_add(b, c);
  ///
  /// let expected = f64x4::from([12.0, 14.0, 16.0, 18.0]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_add_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) + a
      } else {
        Self {
          a : self.a.mul_add(m.a, a.a),
          b : self.b.mul_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a multiply-subtract operation: `self * m - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// multiply and subtract operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfmsub` (single rounding,
  ///   best accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([10.0; 4]);
  /// let b = f64x4::from([2.0; 4]);
  /// let c = f64x4::from([5.0; 4]);
  ///
  /// let result = a.mul_sub(b, c);
  ///
  /// let expected = f64x4::from([15.0; 4]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_sub_m256d(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        (self * m) - s
      } else {
        Self {
          a : self.a.mul_sub(m.a, s.a),
          b : self.b.mul_sub(m.b, s.b),
        }
      }
    }
  }

  /// Performs a negative multiply-add operation: `a - (self * m)`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfnmadd` (single rounding,
  ///   best accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `a - (self * m)` (two roundings)
  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([3.0; 4]);
  /// let b = f64x4::from([2.0; 4]);
  /// let c = f64x4::from([10.0; 4]);
  ///
  /// let result = a.mul_neg_add(b, c);
  ///
  /// let expected = f64x4::from([4.0; 4]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_add_m256d(self.avx, m.avx, a.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        a - (self * m)
      } else {
        Self {
          a : self.a.mul_neg_add(m.a, a.a),
          b : self.b.mul_neg_add(m.b, a.b),
        }
      }
    }
  }

  /// Performs a negative multiply-subtract operation: `-(self * m) - s`
  ///
  /// When hardware FMA support is available, this computes the result with a
  /// single rounding operation. Without FMA support, it falls back to separate
  /// operations with two roundings.
  ///
  /// # Platform-specific behavior
  /// - On `x86`/`x86_64` with AVX+FMA: Uses 256-bit `vfnmsub` (single rounding,
  ///   best accuracy)
  /// - On `x86`/`x86_64` with AVX only: Uses `-(self * m) - s` (two roundings)
  /// - Other platforms: Delegates to [`f64x2`] (may use NEON FMA or fallback)
  ///
  /// # Examples
  /// ```
  /// # use wide::f64x4;
  /// let a = f64x4::from([3.0; 4]);
  /// let b = f64x4::from([2.0; 4]);
  /// let c = f64x4::from([1.0; 4]);
  ///
  /// let result = a.mul_neg_sub(b, c);
  ///
  /// let expected = f64x4::from([-7.0; 4]);
  /// assert_eq!(result, expected);
  /// ```
  #[inline]
  #[must_use]
  pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
    pick! {
      if #[cfg(all(target_feature="avx",target_feature="fma"))] {
        Self { avx: fused_mul_neg_sub_m256d(self.avx, m.avx, s.avx) }
      } else if #[cfg(target_feature="avx")] {
        // still want to use 256 bit ops
        -(self * m) - s
      } else {
        Self {
          a : self.a.mul_neg_sub(m.a, s.a),
          b : self.b.mul_neg_sub(m.b, s.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn flip_signs(self, signs: Self) -> Self {
    self ^ (signs & Self::from(-0.0))
  }

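  /// Returns a vector with the magnitude of `self` and the sign of `sign`,
  /// lanewise.
  ///
  /// # Examples
  /// A minimal sketch:
  /// ```
  /// # use wide::f64x4;
  /// let mag = f64x4::from([1.0, 2.0, 3.0, 4.0]);
  /// let sign = f64x4::from([-1.0, 1.0, -2.0, 2.0]);
  /// assert_eq!(mag.copysign(sign), f64x4::from([-1.0, 2.0, -3.0, 4.0]));
  /// ```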
  #[inline]
  #[must_use]
  pub fn copysign(self, sign: Self) -> Self {
    let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1));
    (self & magnitude_mask) | (sign & Self::from(-0.0))
  }

  #[inline]
  pub fn asin_acos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.simd_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }

    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // asin
    let z3 = f64x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z2);
    let asin = asin.flip_signs(self);

    // acos
    let z3 = self.simd_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
    let acos = big.blend(z3, z4);

    (asin, acos)
  }

  #[inline]
  pub fn acos(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.simd_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }
    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // acos
    let z3 = self.simd_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1);
    let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self);
    let acos = big.blend(z3, z4);

    acos
  }
  #[inline]
  #[must_use]
  pub fn asin(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3);
    const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1);
    const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0);
    const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1);
    const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1);

    const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1);
    const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2);
    const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2);
    const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2);

    const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3);
    const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1);
    const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0);
    const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1);
    const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1);
    const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0);

    const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1);
    const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1);
    const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2);
    const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2);
    const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1);

    let xa = self.abs();

    let big = xa.simd_ge(f64x4::splat(0.625));

    let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa);

    let x2 = x1 * x1;
    let x3 = x2 * x1;
    let x4 = x2 * x2;
    let x5 = x4 * x1;

    let do_big = big.any();
    let do_small = !big.all();

    let mut rx = f64x4::default();
    let mut sx = f64x4::default();
    let mut px = f64x4::default();
    let mut qx = f64x4::default();

    if do_big {
      rx = x3.mul_add(R3asin, x2 * R2asin)
        + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin));
      sx =
        x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin));
    }
    if do_small {
      px = x3.mul_add(P3asin, P0asin)
        + x4.mul_add(P4asin, x1 * P1asin)
        + x5.mul_add(P5asin, x2 * P2asin);
      qx = x4.mul_add(Q4asin, x5)
        + x3.mul_add(Q3asin, x1 * Q1asin)
        + x2.mul_add(Q2asin, Q0asin);
    };

    let vx = big.blend(rx, px);
    let wx = big.blend(sx, qx);

    let y1 = vx / wx * x1;

    let mut z1 = f64x4::default();
    let mut z2 = f64x4::default();
    if do_big {
      let xb = (x1 + x1).sqrt();
      z1 = xb.mul_add(y1, xb);
    }

    if do_small {
      z2 = xa.mul_add(y1, xa);
    }

    // asin
    let z3 = f64x4::FRAC_PI_2 - z1;
    let asin = big.blend(z3, z2);
    let asin = asin.flip_signs(self);

    asin
  }

  #[inline]
  pub fn atan(self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);

    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);

    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);

    let t = self.abs();

    // small:  t < 0.66
    // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
    // big:    t > 2.4142
    let notbig = t.simd_le(T3PO8);
    let notsmal = t.simd_ge(Self::splat(0.66));

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;
    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
    fac = notsmal & fac;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);

    let mut re = (px / qx).mul_add(z * zz, z);
    re += s + fac;

    // get sign bit
    re = (self.sign_bit()).blend(-re, re);

    re
  }

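  /// Computes the four-quadrant arctangent of `self` (`y`) and `x`, lanewise.
  ///
  /// # Examples
  /// A minimal sketch; results are approximations, so compare against the
  /// scalar result with a tolerance rather than exact equality:
  /// ```
  /// # use wide::f64x4;
  /// let y = f64x4::from([1.0, -1.0, 0.5, 2.0]);
  /// let x = f64x4::from([1.0, 1.0, -0.5, 2.0]);
  /// let r = y.atan2(x).to_array();
  /// for i in 0..4 {
  ///   assert!((r[i] - y.to_array()[i].atan2(x.to_array()[i])).abs() < 1e-12);
  /// }
  /// ```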
  #[inline]
  pub fn atan2(self, x: Self) -> Self {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
    const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17);
    const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5);
    const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0);

    const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1);
    const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1);
    const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1);
    const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2);
    const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1);

    const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1);
    const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2);
    const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2);
    const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2);
    const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2);

    let y = self;

    // move in first octant
    let x1 = x.abs();
    let y1 = y.abs();
    let swapxy = y1.simd_gt(x1);
    // swap x and y if y1 > x1
    let mut x2 = swapxy.blend(y1, x1);
    let mut y2 = swapxy.blend(x1, y1);

    // check for special case: x and y are both +/- INF
    let both_infinite = x.is_inf() & y.is_inf();
    if both_infinite.any() {
      let minus_one = -Self::ONE;
      x2 = both_infinite.blend(x2 & minus_one, x2);
      y2 = both_infinite.blend(y2 & minus_one, y2);
    }

    // x = y = 0 gives NAN here
    let t = y2 / x2;

    // small:  t < 0.66
    // medium: 0.66 <= t <= 2.4142 (1+sqrt(2))
    // big:    t > 2.4142
    let notbig = t.simd_le(T3PO8);
    let notsmal = t.simd_ge(Self::splat(0.66));

    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
    s = notsmal & s;
    let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS);
    fac = notsmal & fac;

    // small:  z = t / 1.0;
    // medium: z = (t-1.0) / (t+1.0);
    // big:    z = -1.0 / t;
    let mut a = notbig & t;
    a = notsmal.blend(a - Self::ONE, a);
    let mut b = notbig & Self::ONE;
    b = notsmal.blend(b + t, b);
    let z = a / b;

    let zz = z * z;

    let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan);
    let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan);

    let mut re = (px / qx).mul_add(z * zz, z);
    re += s + fac;

    // move back in place
    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
    re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
    re = (x.sign_bit()).blend(Self::PI - re, re);

    // get sign bit
    re = (y.sign_bit()).blend(-re, re);

    re
  }

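  /// Computes both the sine and cosine of each lane, returned as
  /// `(sin, cos)`.
  ///
  /// # Examples
  /// A minimal sketch; results are approximations, so compare against the
  /// scalar results with a tolerance:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([0.0, 0.5, 1.0, 2.0]);
  /// let (s, c) = x.sin_cos();
  /// for i in 0..4 {
  ///   assert!((s.to_array()[i] - x.to_array()[i].sin()).abs() < 1e-12);
  ///   assert!((c.to_array()[i] - x.to_array()[i].cos()).abs() < 1e-12);
  /// }
  /// ```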
  #[inline]
  #[must_use]
  pub fn sin_cos(self) -> (Self, Self) {
    // Based on the Agner Fog "vector class library":
    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h

    const_f64_as_f64x4!(P0sin, -1.66666666666666307295E-1);
    const_f64_as_f64x4!(P1sin, 8.33333333332211858878E-3);
    const_f64_as_f64x4!(P2sin, -1.98412698295895385996E-4);
    const_f64_as_f64x4!(P3sin, 2.75573136213857245213E-6);
    const_f64_as_f64x4!(P4sin, -2.50507477628578072866E-8);
    const_f64_as_f64x4!(P5sin, 1.58962301576546568060E-10);

    const_f64_as_f64x4!(P0cos, 4.16666666666665929218E-2);
    const_f64_as_f64x4!(P1cos, -1.38888888888730564116E-3);
    const_f64_as_f64x4!(P2cos, 2.48015872888517045348E-5);
    const_f64_as_f64x4!(P3cos, -2.75573141792967388112E-7);
    const_f64_as_f64x4!(P4cos, 2.08757008419747316778E-9);
    const_f64_as_f64x4!(P5cos, -1.13585365213876817300E-11);

    const_f64_as_f64x4!(DP1, 7.853981554508209228515625E-1 * 2.);
    const_f64_as_f64x4!(DP2, 7.94662735614792836714E-9 * 2.);
    const_f64_as_f64x4!(DP3, 3.06161699786838294307E-17 * 2.);

    const_f64_as_f64x4!(TWO_OVER_PI, 2.0 / core::f64::consts::PI);

    let xa = self.abs();

    let y = (xa * TWO_OVER_PI).round();
    let q = y.round_int();

    let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa)));

    let x2 = x * x;
    let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin);
    let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos);
    s = (x * x2).mul_add(s, x);
    c =
      (x2 * x2).mul_add(c, x2.mul_neg_add(f64x4::from(0.5), f64x4::from(1.0)));

    let swap = !((q & i64x4::from(1)).simd_eq(i64x4::from(0)));

    let mut overflow: f64x4 = cast(q.simd_gt(i64x4::from(0x80000000000000)));
    overflow &= xa.is_finite();
    s = overflow.blend(f64x4::from(0.0), s);
    c = overflow.blend(f64x4::from(1.0), c);

    // calc sin
    let mut sin1 = cast::<_, f64x4>(swap).blend(c, s);
    let sign_sin: i64x4 = (q << 62) ^ cast::<_, i64x4>(self);
    sin1 = sin1.flip_signs(cast(sign_sin));

    // calc cos
    let mut cos1 = cast::<_, f64x4>(swap).blend(s, c);
    let sign_cos: i64x4 = ((q + i64x4::from(1)) & i64x4::from(2)) << 62;
    cos1 ^= cast::<_, f64x4>(sign_cos);

    (sin1, cos1)
  }
  #[inline]
  #[must_use]
  pub fn sin(self) -> Self {
    let (s, _) = self.sin_cos();
    s
  }
  #[inline]
  #[must_use]
  pub fn cos(self) -> Self {
    let (_, c) = self.sin_cos();
    c
  }
  #[inline]
  #[must_use]
  pub fn tan(self) -> Self {
    let (s, c) = self.sin_cos();
    s / c
  }
  #[inline]
  #[must_use]
  pub fn to_degrees(self) -> Self {
    const_f64_as_f64x4!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI);
    self * RAD_TO_DEG_RATIO
  }
  #[inline]
  #[must_use]
  pub fn to_radians(self) -> Self {
    const_f64_as_f64x4!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64);
    self * DEG_TO_RAD_RATIO
  }
  #[inline]
  #[must_use]
  pub fn sqrt(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: sqrt_m256d(self.avx) }
      } else {
        Self {
          a : self.a.sqrt(),
          b : self.b.sqrt(),
        }
      }
    }
  }
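  /// Packs one bit per lane (taken from the lane's high/sign bit) into the
  /// low 4 bits of a `u32`, with lane 0 in bit 0.
  ///
  /// # Examples
  /// A minimal sketch using a comparison-style mask:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([f64::NAN, 1.0, f64::NAN, 2.0]);
  /// assert_eq!(x.is_nan().to_bitmask(), 0b0101);
  /// ```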
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx) as u32
      } else {
        (self.b.to_bitmask() << 2) | self.a.to_bitmask()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx) != 0
      } else {
        self.a.any() || self.b.any()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx")] {
        move_mask_m256d(self.avx) == 0b1111
      } else {
        self.a.all() && self.b.all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  #[inline]
  fn vm_pow2n(self) -> Self {
    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
    const_f64_as_f64x4!(bias, 1023.0);
    let a = self + (bias + pow2_52);
    let c = cast::<_, i64x4>(a) << 52;
    cast::<_, f64x4>(c)
  }

  /// Calculates the exponential function `e^x` for each lane of a packed `f64x4`.
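  ///
  /// # Examples
  /// A minimal sketch; results are approximations, so compare against the
  /// scalar result with a relative tolerance:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([0.0, 1.0, -2.0, 10.0]);
  /// let e = x.exp().to_array();
  /// for i in 0..4 {
  ///   let expected = x.to_array()[i].exp();
  ///   assert!((e[i] - expected).abs() <= expected * 1e-12);
  /// }
  /// ```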
  #[inline]
  #[must_use]
  pub fn exp(self) -> Self {
    const_f64_as_f64x4!(P2, 1.0 / 2.0);
    const_f64_as_f64x4!(P3, 1.0 / 6.0);
    const_f64_as_f64x4!(P4, 1. / 24.);
    const_f64_as_f64x4!(P5, 1. / 120.);
    const_f64_as_f64x4!(P6, 1. / 720.);
    const_f64_as_f64x4!(P7, 1. / 5040.);
    const_f64_as_f64x4!(P8, 1. / 40320.);
    const_f64_as_f64x4!(P9, 1. / 362880.);
    const_f64_as_f64x4!(P10, 1. / 3628800.);
    const_f64_as_f64x4!(P11, 1. / 39916800.);
    const_f64_as_f64x4!(P12, 1. / 479001600.);
    const_f64_as_f64x4!(P13, 1. / 6227020800.);
    const_f64_as_f64x4!(LN2D_HI, 0.693145751953125);
    const_f64_as_f64x4!(LN2D_LO, 1.42860682030941723212E-6);
    let max_x = f64x4::from(708.39);
    let r = (self * Self::LOG2_E).round();
    let x = r.mul_neg_add(LN2D_HI, self);
    let x = r.mul_neg_add(LN2D_LO, x);
    let z =
      polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13);
    let n2 = Self::vm_pow2n(r);
    let z = (z + Self::ONE) * n2;
    // check for overflow
    let in_range = self.abs().simd_lt(max_x);
    let in_range = in_range & self.is_finite();
    in_range.blend(z, Self::ZERO)
  }

  #[inline]
  fn exponent(self) -> f64x4 {
    const_f64_as_f64x4!(pow2_52, 4503599627370496.0);
    const_f64_as_f64x4!(bias, 1023.0);
    let a = cast::<_, u64x4>(self);
    let b = a >> 52;
    let c = b | cast::<_, u64x4>(pow2_52);
    let d = cast::<_, f64x4>(c);
    let e = d - (pow2_52 + bias);
    e
  }

  #[inline]
  fn fraction_2(self) -> Self {
    let t1 = cast::<_, u64x4>(self);
    let t2 = cast::<_, u64x4>(
      (t1 & u64x4::from(0x000FFFFFFFFFFFFF)) | u64x4::from(0x3FE0000000000000),
    );
    cast::<_, f64x4>(t2)
  }
  #[inline]
  fn is_zero_or_subnormal(self) -> Self {
    let t = cast::<_, i64x4>(self);
    let t = t & i64x4::splat(0x7FF0000000000000);
    i64x4::round_float(t.simd_eq(i64x4::splat(0)))
  }
  #[inline]
  fn infinity() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF0000000000000))
  }
  #[inline]
  fn nan_log() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
  }
  #[inline]
  fn nan_pow() -> Self {
    cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29))
  }
  #[inline]
  fn sign_bit(self) -> Self {
    let t1 = cast::<_, i64x4>(self);
    let t2 = t1 >> 63;
    !cast::<_, f64x4>(t2).simd_eq(f64x4::ZERO)
  }

  /// Horizontal add of all the elements of the vector.
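  ///
  /// # Examples
  /// A minimal sketch with an exactly representable sum:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([1.0, 2.0, 3.0, 4.0]);
  /// assert_eq!(x.reduce_add(), 10.0);
  /// ```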
  #[inline]
  pub fn reduce_add(self) -> f64 {
    pick! {
      if #[cfg(target_feature="avx")] {
        // From https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx
        let lo = cast_to_m128d_from_m256d(self.avx);
        let hi = extract_m128d_from_m256d::<1>(self.avx);
        let lo = add_m128d(lo, hi);
        let hi64 = unpack_high_m128d(lo, lo);
        let sum = add_m128d_s(lo, hi64);
        get_f64_from_m128d_s(sum)
      } else {
        self.a.reduce_add() + self.b.reduce_add()
      }
    }
  }

  /// Natural log (ln(x))
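  ///
  /// # Examples
  /// A minimal sketch; results are approximations, so compare against the
  /// scalar result with a tolerance:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([0.5, 1.0, core::f64::consts::E, 100.0]);
  /// let l = x.ln().to_array();
  /// for i in 0..4 {
  ///   assert!((l[i] - x.to_array()[i].ln()).abs() < 1e-12);
  /// }
  /// ```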
  #[inline]
  #[must_use]
  pub fn ln(self) -> Self {
    const_f64_as_f64x4!(HALF, 0.5);
    const_f64_as_f64x4!(P0, 7.70838733755885391666E0);
    const_f64_as_f64x4!(P1, 1.79368678507819816313E1);
    const_f64_as_f64x4!(P2, 1.44989225341610930846E1);
    const_f64_as_f64x4!(P3, 4.70579119878881725854E0);
    const_f64_as_f64x4!(P4, 4.97494994976747001425E-1);
    const_f64_as_f64x4!(P5, 1.01875663804580931796E-4);

    const_f64_as_f64x4!(Q0, 2.31251620126765340583E1);
    const_f64_as_f64x4!(Q1, 7.11544750618563894466E1);
    const_f64_as_f64x4!(Q2, 8.29875266912776603211E1);
    const_f64_as_f64x4!(Q3, 4.52279145837532221105E1);
    const_f64_as_f64x4!(Q4, 1.12873587189167450590E1);
    const_f64_as_f64x4!(LN2F_HI, 0.693359375);
    const_f64_as_f64x4!(LN2F_LO, -2.12194440e-4);
    const_f64_as_f64x4!(VM_SQRT2, 1.414213562373095048801);
    const_f64_as_f64x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);

    let x1 = self;
    let x = Self::fraction_2(x1);
    let e = Self::exponent(x1);
    let mask = x.simd_gt(VM_SQRT2 * HALF);
    let x = (!mask).blend(x + x, x);
    let fe = mask.blend(e + Self::ONE, e);
    let x = x - Self::ONE;
    let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
    let x2 = x * x;
    let px = x2 * x * px;
    let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4);
    let res = px / qx;
    let res = fe.mul_add(LN2F_LO, res);
    let res = res + x2.mul_neg_add(HALF, x);
    let res = fe.mul_add(LN2F_HI, res);
    let overflow = !self.is_finite();
    let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
    let mask = overflow | underflow;
    if !mask.any() {
      res
    } else {
      let is_zero = self.is_zero_or_subnormal();
      let res = underflow.blend(Self::nan_log(), res);
      let res = is_zero.blend(Self::infinity(), res);
      let res = overflow.blend(self, res);
      res
    }
  }

  #[inline]
  #[must_use]
  pub fn log2(self) -> Self {
    Self::ln(self) * Self::LOG2_E
  }
  #[inline]
  #[must_use]
  pub fn log10(self) -> Self {
    Self::ln(self) * Self::LOG10_E
  }

  #[inline]
  #[must_use]
  pub fn pow_f64x4(self, y: Self) -> Self {
    const_f64_as_f64x4!(ln2d_hi, 0.693145751953125);
    const_f64_as_f64x4!(ln2d_lo, 1.42860682030941723212E-6);
    const_f64_as_f64x4!(P0log, 2.0039553499201281259648E1);
    const_f64_as_f64x4!(P1log, 5.7112963590585538103336E1);
    const_f64_as_f64x4!(P2log, 6.0949667980987787057556E1);
    const_f64_as_f64x4!(P3log, 2.9911919328553073277375E1);
    const_f64_as_f64x4!(P4log, 6.5787325942061044846969E0);
    const_f64_as_f64x4!(P5log, 4.9854102823193375972212E-1);
    const_f64_as_f64x4!(P6log, 4.5270000862445199635215E-5);
    const_f64_as_f64x4!(Q0log, 6.0118660497603843919306E1);
    const_f64_as_f64x4!(Q1log, 2.1642788614495947685003E2);
    const_f64_as_f64x4!(Q2log, 3.0909872225312059774938E2);
    const_f64_as_f64x4!(Q3log, 2.2176239823732856465394E2);
    const_f64_as_f64x4!(Q4log, 8.3047565967967209469434E1);
    const_f64_as_f64x4!(Q5log, 1.5062909083469192043167E1);

    // Taylor expansion constants
    const_f64_as_f64x4!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp
    const_f64_as_f64x4!(p3, 1.0 / 6.0);
    const_f64_as_f64x4!(p4, 1.0 / 24.0);
    const_f64_as_f64x4!(p5, 1.0 / 120.0);
    const_f64_as_f64x4!(p6, 1.0 / 720.0);
    const_f64_as_f64x4!(p7, 1.0 / 5040.0);
    const_f64_as_f64x4!(p8, 1.0 / 40320.0);
    const_f64_as_f64x4!(p9, 1.0 / 362880.0);
    const_f64_as_f64x4!(p10, 1.0 / 3628800.0);
    const_f64_as_f64x4!(p11, 1.0 / 39916800.0);
    const_f64_as_f64x4!(p12, 1.0 / 479001600.0);
    const_f64_as_f64x4!(p13, 1.0 / 6227020800.0);

    let x1 = self.abs();
    let x = x1.fraction_2();
    let mask = x.simd_gt(f64x4::SQRT_2 * f64x4::HALF);
    let x = (!mask).blend(x + x, x);
    let x = x - f64x4::ONE;
    let x2 = x * x;
    let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log);
    let px = px * x * x2;
    let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log);
    let lg1 = px / qx;

    let ef = x1.exponent();
    let ef = mask.blend(ef + f64x4::ONE, ef);
    let e1 = (ef * y).round();
    let yr = ef.mul_sub(y, e1);

    let lg = f64x4::HALF.mul_neg_add(x2, x) + lg1;
    let x2err = (f64x4::HALF * x).mul_sub(x, f64x4::HALF * x2);
    let lg_err = f64x4::HALF.mul_add(x2, lg - x) - lg1;

    let e2 = (lg * y * f64x4::LOG2_E).round();
    let v = lg.mul_sub(y, e2 * ln2d_hi);
    let v = e2.mul_neg_add(ln2d_lo, v);
    let v = v - (lg_err + x2err).mul_sub(y, yr * f64x4::LN_2);

    let x = v;
    let e3 = (x * f64x4::LOG2_E).round();
    let x = e3.mul_neg_add(f64x4::LN_2, x);
    let z =
      polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)
        + f64x4::ONE;
    let ee = e1 + e2 + e3;
    let ei = cast::<_, i64x4>(ee.round_int());
    let ej = cast::<_, i64x4>(ei + (cast::<_, i64x4>(z) >> 52));

    let overflow = cast::<_, f64x4>(!ej.simd_lt(i64x4::splat(0x07FF)))
      | ee.simd_gt(f64x4::splat(3000.0));
    let underflow = cast::<_, f64x4>(!ej.simd_gt(i64x4::splat(0x000)))
      | ee.simd_lt(f64x4::splat(-3000.0));

    // Add exponent by integer addition
    let z = cast::<_, f64x4>(cast::<_, i64x4>(z) + (ei << 52));

    // Check for overflow/underflow
    let z = if (overflow | underflow).any() {
      let z = underflow.blend(f64x4::ZERO, z);
      overflow.blend(Self::infinity(), z)
    } else {
      z
    };

    // Check for self == 0
    let x_zero = self.is_zero_or_subnormal();
    let z = x_zero.blend(
      y.simd_lt(f64x4::ZERO).blend(
        Self::infinity(),
        y.simd_eq(f64x4::ZERO).blend(f64x4::ONE, f64x4::ZERO),
      ),
      z,
    );

    let x_sign = self.sign_bit();

    let z = if x_sign.any() {
      // Y into an integer
      let yi = y.simd_eq(y.round());
      // Is y odd?
      let y_odd = cast::<_, i64x4>(y.round_int() << 63).round_float();
      let z1 =
        yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
      x_sign.blend(z1, z)
    } else {
      z
    };

    let x_finite = self.is_finite();
    let y_finite = y.is_finite();
    let e_finite = ee.is_finite();

    if (x_finite & y_finite & (e_finite | x_zero)).all() {
      return z;
    }

    (self.is_nan() | y.is_nan()).blend(self + y, z)
  }
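  /// Raises each lane of `self` to the power `y`, lanewise.
  ///
  /// # Examples
  /// A minimal sketch; results are approximations, so compare against the
  /// scalar result with a relative tolerance:
  /// ```
  /// # use wide::f64x4;
  /// let x = f64x4::from([2.0, 3.0, 4.0, 0.5]);
  /// let p = x.powf(2.5).to_array();
  /// for i in 0..4 {
  ///   let expected = x.to_array()[i].powf(2.5);
  ///   assert!((p[i] - expected).abs() <= expected * 1e-12);
  /// }
  /// ```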
  #[inline]
  pub fn powf(self, y: f64) -> Self {
    Self::pow_f64x4(self, f64x4::splat(y))
  }

  #[inline]
  pub fn to_array(self) -> [f64; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[f64; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [f64; 4] {
    cast_mut(self)
  }

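  /// Converts an `i32x4` into an `f64x4`, lanewise.
  ///
  /// # Examples
  /// A minimal sketch (every `i32` is exactly representable as an `f64`):
  /// ```
  /// # use wide::{f64x4, i32x4};
  /// let v = i32x4::from([1, -2, 3, -4]);
  /// assert_eq!(f64x4::from_i32x4(v), f64x4::from([1.0, -2.0, 3.0, -4.0]));
  /// ```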
  #[inline]
  pub fn from_i32x4(v: i32x4) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: convert_to_m256d_from_i32_m128i(v.sse) }
      } else {
        Self::new([
          v.as_array()[0] as f64,
          v.as_array()[1] as f64,
          v.as_array()[2] as f64,
          v.as_array()[3] as f64,
        ])
      }
    }
  }
}

impl From<i32x4> for f64x4 {
  #[inline]
  fn from(v: i32x4) -> Self {
    Self::from_i32x4(v)
  }
}

impl Not for f64x4 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx")] {
        Self { avx: self.avx.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}