1use super::*;
2
3pick! {
4 if #[cfg(target_feature="sse")] {
5 #[derive(Default, Clone, Copy, PartialEq)]
6 #[repr(C, align(16))]
7 pub struct f32x4 { pub(crate) sse: m128 }
8 } else if #[cfg(target_feature="simd128")] {
9 use core::arch::wasm32::*;
10
11 #[derive(Clone, Copy)]
12 #[repr(transparent)]
13 pub struct f32x4 { pub(crate) simd: v128 }
14
15 impl Default for f32x4 {
16 fn default() -> Self {
17 Self::splat(0.0)
18 }
19 }
20
21 impl PartialEq for f32x4 {
22 fn eq(&self, other: &Self) -> bool {
23 u32x4_all_true(f32x4_eq(self.simd, other.simd))
24 }
25 }
26 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27 use core::arch::aarch64::*;
28 #[repr(C)]
29 #[derive(Copy, Clone)]
30 pub struct f32x4 { pub(crate) neon : float32x4_t }
31
32 impl Default for f32x4 {
33 #[inline]
34 fn default() -> Self {
35 unsafe { Self { neon: vdupq_n_f32(0.0)} }
36 }
37 }
38
39 impl PartialEq for f32x4 {
40 #[inline]
41 fn eq(&self, other: &Self) -> bool {
42 unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon))==u32::MAX }
43 }
44
45 }
46 } else {
47 #[derive(Default, Clone, Copy, PartialEq)]
48 #[repr(C, align(16))]
49 pub struct f32x4 { pub(crate) arr: [f32;4] }
50 }
51}
52
53macro_rules! const_f32_as_f32x4 {
54 ($i:ident, $f:expr) => {
55 #[allow(non_upper_case_globals)]
56 pub const $i: f32x4 = f32x4::new([$f; 4]);
57 };
58}
59
60impl f32x4 {
61 const_f32_as_f32x4!(ONE, 1.0);
62 const_f32_as_f32x4!(ZERO, 0.0);
63 const_f32_as_f32x4!(HALF, 0.5);
64 const_f32_as_f32x4!(E, core::f32::consts::E);
65 const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
66 const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
67 const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
68 const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
69 const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
70 const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
71 const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
72 const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
73 const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
74 const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
75 const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
76 const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
77 const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
78 const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
79 const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
80 const_f32_as_f32x4!(PI, core::f32::consts::PI);
81 const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
82 const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
83}
84
85unsafe impl Zeroable for f32x4 {}
86unsafe impl Pod for f32x4 {}
87
88impl AlignTo for f32x4 {
89 type Elem = f32;
90}
91
92impl Add for f32x4 {
93 type Output = Self;
94 #[inline]
95 fn add(self, rhs: Self) -> Self::Output {
96 pick! {
97 if #[cfg(target_feature="sse")] {
98 Self { sse: add_m128(self.sse, rhs.sse) }
99 } else if #[cfg(target_feature="simd128")] {
100 Self { simd: f32x4_add(self.simd, rhs.simd) }
101 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
102 unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
103 } else {
104 Self { arr: [
105 self.arr[0] + rhs.arr[0],
106 self.arr[1] + rhs.arr[1],
107 self.arr[2] + rhs.arr[2],
108 self.arr[3] + rhs.arr[3],
109 ]}
110 }
111 }
112 }
113}
114
115impl Sub for f32x4 {
116 type Output = Self;
117 #[inline]
118 fn sub(self, rhs: Self) -> Self::Output {
119 pick! {
120 if #[cfg(target_feature="sse")] {
121 Self { sse: sub_m128(self.sse, rhs.sse) }
122 } else if #[cfg(target_feature="simd128")] {
123 Self { simd: f32x4_sub(self.simd, rhs.simd) }
124 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125 unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126 } else {
127 Self { arr: [
128 self.arr[0] - rhs.arr[0],
129 self.arr[1] - rhs.arr[1],
130 self.arr[2] - rhs.arr[2],
131 self.arr[3] - rhs.arr[3],
132 ]}
133 }
134 }
135 }
136}
137
138impl Mul for f32x4 {
139 type Output = Self;
140 #[inline]
141 fn mul(self, rhs: Self) -> Self::Output {
142 pick! {
143 if #[cfg(target_feature="sse")] {
144 Self { sse: mul_m128(self.sse, rhs.sse) }
145 } else if #[cfg(target_feature="simd128")] {
146 Self { simd: f32x4_mul(self.simd, rhs.simd) }
147 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
148 unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
149 } else {
150 Self { arr: [
151 self.arr[0] * rhs.arr[0],
152 self.arr[1] * rhs.arr[1],
153 self.arr[2] * rhs.arr[2],
154 self.arr[3] * rhs.arr[3],
155 ]}
156 }
157 }
158 }
159}
160
161impl Div for f32x4 {
162 type Output = Self;
163 #[inline]
164 fn div(self, rhs: Self) -> Self::Output {
165 pick! {
166 if #[cfg(target_feature="sse")] {
167 Self { sse: div_m128(self.sse, rhs.sse) }
168 } else if #[cfg(target_feature="simd128")] {
169 Self { simd: f32x4_div(self.simd, rhs.simd) }
170 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
171 unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
172 } else {
173 Self { arr: [
174 self.arr[0] / rhs.arr[0],
175 self.arr[1] / rhs.arr[1],
176 self.arr[2] / rhs.arr[2],
177 self.arr[3] / rhs.arr[3],
178 ]}
179 }
180 }
181 }
182}
183
184impl Add<f32> for f32x4 {
185 type Output = Self;
186 #[inline]
187 fn add(self, rhs: f32) -> Self::Output {
188 self.add(Self::splat(rhs))
189 }
190}
191
192impl Sub<f32> for f32x4 {
193 type Output = Self;
194 #[inline]
195 fn sub(self, rhs: f32) -> Self::Output {
196 self.sub(Self::splat(rhs))
197 }
198}
199
200impl Mul<f32> for f32x4 {
201 type Output = Self;
202 #[inline]
203 fn mul(self, rhs: f32) -> Self::Output {
204 self.mul(Self::splat(rhs))
205 }
206}
207
208impl Div<f32> for f32x4 {
209 type Output = Self;
210 #[inline]
211 fn div(self, rhs: f32) -> Self::Output {
212 self.div(Self::splat(rhs))
213 }
214}
215
216impl Add<f32x4> for f32 {
217 type Output = f32x4;
218 #[inline]
219 fn add(self, rhs: f32x4) -> Self::Output {
220 f32x4::splat(self).add(rhs)
221 }
222}
223
224impl Sub<f32x4> for f32 {
225 type Output = f32x4;
226 #[inline]
227 fn sub(self, rhs: f32x4) -> Self::Output {
228 f32x4::splat(self).sub(rhs)
229 }
230}
231
232impl Mul<f32x4> for f32 {
233 type Output = f32x4;
234 #[inline]
235 fn mul(self, rhs: f32x4) -> Self::Output {
236 f32x4::splat(self).mul(rhs)
237 }
238}
239
240impl Div<f32x4> for f32 {
241 type Output = f32x4;
242 #[inline]
243 fn div(self, rhs: f32x4) -> Self::Output {
244 f32x4::splat(self).div(rhs)
245 }
246}
247
248impl BitAnd for f32x4 {
249 type Output = Self;
250 #[inline]
251 fn bitand(self, rhs: Self) -> Self::Output {
252 pick! {
253 if #[cfg(target_feature="sse")] {
254 Self { sse: bitand_m128(self.sse, rhs.sse) }
255 } else if #[cfg(target_feature="simd128")] {
256 Self { simd: v128_and(self.simd, rhs.simd) }
257 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
258 unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
259 } else {
260 Self { arr: [
261 f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
262 f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
263 f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
264 f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
265 ]}
266 }
267 }
268 }
269}
270
271impl BitOr for f32x4 {
272 type Output = Self;
273 #[inline]
274 fn bitor(self, rhs: Self) -> Self::Output {
275 pick! {
276 if #[cfg(target_feature="sse")] {
277 Self { sse: bitor_m128(self.sse, rhs.sse) }
278 } else if #[cfg(target_feature="simd128")] {
279 Self { simd: v128_or(self.simd, rhs.simd) }
280 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
281 unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
282 } else {
283 Self { arr: [
284 f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
285 f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
286 f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
287 f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
288 ]}
289 }
290 }
291 }
292}
293
294impl BitXor for f32x4 {
295 type Output = Self;
296 #[inline]
297 fn bitxor(self, rhs: Self) -> Self::Output {
298 pick! {
299 if #[cfg(target_feature="sse")] {
300 Self { sse: bitxor_m128(self.sse, rhs.sse) }
301 } else if #[cfg(target_feature="simd128")] {
302 Self { simd: v128_xor(self.simd, rhs.simd) }
303 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
304 unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
305 } else {
306 Self { arr: [
307 f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
308 f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
309 f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
310 f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
311 ]}
312 }
313 }
314 }
315}
316
317impl CmpEq for f32x4 {
318 type Output = Self;
319 #[inline]
320 fn simd_eq(self, rhs: Self) -> Self::Output {
321 pick! {
322 if #[cfg(target_feature="sse")] {
323 Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
324 } else if #[cfg(target_feature="simd128")] {
325 Self { simd: f32x4_eq(self.simd, rhs.simd) }
326 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
327 unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
328 } else {
329 Self { arr: [
330 if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
331 if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
332 if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
333 if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
334 ]}
335 }
336 }
337 }
338}
339
340impl CmpGe for f32x4 {
341 type Output = Self;
342 #[inline]
343 fn simd_ge(self, rhs: Self) -> Self::Output {
344 pick! {
345 if #[cfg(target_feature="sse")] {
346 Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
347 } else if #[cfg(target_feature="simd128")] {
348 Self { simd: f32x4_ge(self.simd, rhs.simd) }
349 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
350 unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
351 } else {
352 Self { arr: [
353 if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
354 if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
355 if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
356 if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
357 ]}
358 }
359 }
360 }
361}
362
363impl CmpGt for f32x4 {
364 type Output = Self;
365 #[inline]
366 fn simd_gt(self, rhs: Self) -> Self::Output {
367 pick! {
368 if #[cfg(target_feature="sse")] {
369 Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
370 } else if #[cfg(target_feature="simd128")] {
371 Self { simd: f32x4_gt(self.simd, rhs.simd) }
372 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
373 unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
374 } else {
375 Self { arr: [
376 if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
377 if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
378 if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
379 if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
380 ]}
381 }
382 }
383 }
384}
385
386impl CmpNe for f32x4 {
387 type Output = Self;
388 #[inline]
389 fn simd_ne(self, rhs: Self) -> Self::Output {
390 pick! {
391 if #[cfg(target_feature="sse")] {
392 Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
393 } else if #[cfg(target_feature="simd128")] {
394 Self { simd: f32x4_ne(self.simd, rhs.simd) }
395 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
396 unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
397 } else {
398 Self { arr: [
399 if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
400 if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
401 if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
402 if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
403 ]}
404 }
405 }
406 }
407}
408
409impl CmpLe for f32x4 {
410 type Output = Self;
411 #[inline]
412 fn simd_le(self, rhs: Self) -> Self::Output {
413 pick! {
414 if #[cfg(target_feature="sse")] {
415 Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
416 } else if #[cfg(target_feature="simd128")] {
417 Self { simd: f32x4_le(self.simd, rhs.simd) }
418 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
419 unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
420 } else {
421 Self { arr: [
422 if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
423 if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
424 if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
425 if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
426 ]}
427 }
428 }
429 }
430}
431
432impl CmpLt for f32x4 {
433 type Output = Self;
434 #[inline]
435 fn simd_lt(self, rhs: Self) -> Self::Output {
436 pick! {
437 if #[cfg(target_feature="sse")] {
438 Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
439 } else if #[cfg(target_feature="simd128")] {
440 Self { simd: f32x4_lt(self.simd, rhs.simd) }
441 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
442 unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
443 } else {
444 Self { arr: [
445 if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
446 if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
447 if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
448 if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
449 ]}
450 }
451 }
452 }
453}
454
455impl f32x4 {
456 #[inline]
457 #[must_use]
458 pub const fn new(array: [f32; 4]) -> Self {
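    // Safety: `f32x4` is a 16-byte wrapper over the lane data, the same size
    // as `[f32; 4]`, so a by-value transmute between the two is sound.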
460 unsafe {
461 core::mem::transmute(array)
462 }
463 }
464
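  /// Lanewise select: for each lane, returns the lane from `t` where the
  /// corresponding lane of `self` is a "true" mask lane (all bits set, e.g.
  /// the result of a comparison), and the lane from `f` otherwise.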
465 #[inline]
466 #[must_use]
467 pub fn blend(self, t: Self, f: Self) -> Self {
468 pick! {
469 if #[cfg(target_feature="sse4.1")] {
470 Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
471 } else if #[cfg(target_feature="simd128")] {
472 Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
473 } else {
474 generic_bit_blend(self, t, f)
475 }
476 }
477 }
478 #[inline]
479 #[must_use]
480 pub fn abs(self) -> Self {
481 pick! {
482 if #[cfg(target_feature="simd128")] {
483 Self { simd: f32x4_abs(self.simd) }
484 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
485 unsafe {Self { neon: vabsq_f32(self.neon) }}
486 } else {
487 let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
488 self & non_sign_bits
489 }
490 }
491 }
492 #[inline]
493 #[must_use]
494 pub fn floor(self) -> Self {
495 pick! {
496 if #[cfg(target_feature="simd128")] {
497 Self { simd: f32x4_floor(self.simd) }
498 } else if #[cfg(target_feature="sse4.1")] {
499 Self { sse: floor_m128(self.sse) }
500 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
501 unsafe {Self { neon: vrndmq_f32(self.neon) }}
502 } else if #[cfg(feature="std")] {
503 let base: [f32; 4] = cast(self);
504 cast(base.map(|val| val.floor()))
505 } else {
506 let base: [f32; 4] = cast(self);
507 let rounded: [f32; 4] = cast(self.round());
508 cast([
509 if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
510 if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
511 if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
512 if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
513 ])
514 }
515 }
516 }
517 #[inline]
518 #[must_use]
519 pub fn ceil(self) -> Self {
520 pick! {
521 if #[cfg(target_feature="simd128")] {
522 Self { simd: f32x4_ceil(self.simd) }
523 } else if #[cfg(target_feature="sse4.1")] {
524 Self { sse: ceil_m128(self.sse) }
525 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
526 unsafe {Self { neon: vrndpq_f32(self.neon) }}
527 } else if #[cfg(feature="std")] {
528 let base: [f32; 4] = cast(self);
529 cast(base.map(|val| val.ceil()))
530 } else {
531 let base: [f32; 4] = cast(self);
532 let rounded: [f32; 4] = cast(self.round());
533 cast([
534 if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
535 if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
536 if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
537 if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
538 ])
539 }
540 }
541 }
542
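  /// Lanewise maximum of `self` and `rhs`.
  ///
  /// Faster than [`max`](Self::max), but the result for lanes where either
  /// input is NaN is unspecified and may differ between backends.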
543 #[inline]
547 #[must_use]
548 pub fn fast_max(self, rhs: Self) -> Self {
549 pick! {
550 if #[cfg(target_feature="sse")] {
551 Self { sse: max_m128(self.sse, rhs.sse) }
552 } else if #[cfg(target_feature="simd128")] {
553 Self {
554 simd: f32x4_pmax(self.simd, rhs.simd),
555 }
556 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
557 unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
558 } else {
559 Self { arr: [
560 if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
561 if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
562 if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
563 if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
564 ]}
565 }
566 }
567 }
568
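  /// Lanewise maximum of `self` and `rhs`, with NaN handling: where a lane of
  /// one input is NaN, the lane from the other input is generally returned.
  /// Use [`fast_max`](Self::fast_max) when NaN handling is not required.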
569 #[inline]
573 #[must_use]
574 pub fn max(self, rhs: Self) -> Self {
575 pick! {
576 if #[cfg(target_feature="sse")] {
577 rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
581 } else if #[cfg(target_feature="simd128")] {
582 Self {
589 simd: v128_bitselect(
590 rhs.simd,
591 f32x4_pmax(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd), // lanes where self is NaN take rhs
          )
594 }
595 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
596 unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
597 } else {
598 Self { arr: [
599 self.arr[0].max(rhs.arr[0]),
600 self.arr[1].max(rhs.arr[1]),
601 self.arr[2].max(rhs.arr[2]),
602 self.arr[3].max(rhs.arr[3]),
603 ]}
604 }
605 }
606 }
607
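  /// Lanewise minimum of `self` and `rhs`.
  ///
  /// Faster than [`min`](Self::min), but the result for lanes where either
  /// input is NaN is unspecified and may differ between backends.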
608 #[inline]
612 #[must_use]
613 pub fn fast_min(self, rhs: Self) -> Self {
614 pick! {
615 if #[cfg(target_feature="sse")] {
616 Self { sse: min_m128(self.sse, rhs.sse) }
617 } else if #[cfg(target_feature="simd128")] {
618 Self {
619 simd: f32x4_pmin(self.simd, rhs.simd),
620 }
621 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
622 unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
623 } else {
624 Self { arr: [
625 if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
626 if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
627 if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
628 if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
629 ]}
630 }
631 }
632 }
633
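  /// Lanewise minimum of `self` and `rhs`, with NaN handling: where a lane of
  /// one input is NaN, the lane from the other input is generally returned.
  /// Use [`fast_min`](Self::fast_min) when NaN handling is not required.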
634 #[inline]
638 #[must_use]
639 pub fn min(self, rhs: Self) -> Self {
640 pick! {
641 if #[cfg(target_feature="sse")] {
642 rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
646 } else if #[cfg(target_feature="simd128")] {
647 Self {
654 simd: v128_bitselect(
655 rhs.simd,
656 f32x4_pmin(self.simd, rhs.simd),
            f32x4_ne(self.simd, self.simd), // lanes where self is NaN take rhs
          )
659 }
660 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
661 unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
662 } else {
663 Self { arr: [
664 self.arr[0].min(rhs.arr[0]),
665 self.arr[1].min(rhs.arr[1]),
666 self.arr[2].min(rhs.arr[2]),
667 self.arr[3].min(rhs.arr[3]),
668 ]}
669 }
670 }
671 }
672 #[inline]
673 #[must_use]
674 pub fn is_nan(self) -> Self {
675 pick! {
676 if #[cfg(target_feature="sse")] {
677 Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
678 } else if #[cfg(target_feature="simd128")] {
679 Self { simd: f32x4_ne(self.simd, self.simd) }
680 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
681 unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
682 } else {
683 Self { arr: [
684 if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
685 if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
686 if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
687 if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
688 ]}
689 }
690 }
691 }
692 #[inline]
693 #[must_use]
694 pub fn is_finite(self) -> Self {
695 let shifted_exp_mask = u32x4::from(0xFF000000);
696 let u: u32x4 = cast(self);
697 let shift_u = u << 1_u64;
698 let out = !(shift_u & shifted_exp_mask).simd_eq(shifted_exp_mask);
699 cast(out)
700 }
701 #[inline]
702 #[must_use]
703 pub fn is_inf(self) -> Self {
704 let shifted_inf = u32x4::from(0xFF000000);
705 let u: u32x4 = cast(self);
706 let shift_u = u << 1_u64;
707 let out = (shift_u).simd_eq(shifted_inf);
708 cast(out)
709 }
710
711 #[inline]
712 #[must_use]
713 pub fn round(self) -> Self {
714 pick! {
715 if #[cfg(target_feature="sse4.1")] {
716 Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
717 } else if #[cfg(target_feature="sse2")] {
718 let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
719 let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
720 let i: i32x4 = cast(mi);
721 let mask: f32x4 = cast(i.simd_eq(i32x4::from(0x80000000_u32 as i32)));
722 mask.blend(self, f)
723 } else if #[cfg(target_feature="simd128")] {
724 Self { simd: f32x4_nearest(self.simd) }
725 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
726 unsafe {Self { neon: vrndnq_f32(self.neon) }}
727 } else {
728 let to_int = f32x4::from(1.0 / f32::EPSILON);
732 let u: u32x4 = cast(self);
733 let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
734 let mut y: f32x4;
735
736 let no_op_magic = i32x4::from(0x7f + 23);
737 let no_op_mask: f32x4 = cast(e.simd_gt(no_op_magic) | e.simd_eq(no_op_magic));
738 let no_op_val: f32x4 = self;
739
740 let zero_magic = i32x4::from(0x7f - 1);
741 let zero_mask: f32x4 = cast(e.simd_lt(zero_magic));
742 let zero_val: f32x4 = self * f32x4::from(0.0);
743
744 let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).simd_lt(i32x4::default()));
745 let x: f32x4 = neg_bit.blend(-self, self);
746 y = x + to_int - to_int - x;
747 y = y.simd_gt(f32x4::from(0.5)).blend(
748 y + x - f32x4::from(-1.0),
749 y.simd_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
750 );
751 y = neg_bit.blend(-y, y);
752
753 no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
754 }
755 }
756 }
757
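  /// Rounds each lane to the nearest integer and returns the result as an
  /// `i32x4`. Faster than [`round_int`](Self::round_int), but NaN and
  /// out-of-range lanes produce backend-dependent values.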
758 #[inline]
762 #[must_use]
763 pub fn fast_round_int(self) -> i32x4 {
764 pick! {
765 if #[cfg(target_feature="sse2")] {
766 cast(convert_to_i32_m128i_from_m128(self.sse))
767 } else {
768 self.round_int()
769 }
770 }
771 }
772
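  /// Rounds each lane to the nearest integer and converts to `i32x4`. NaN
  /// lanes become 0 and out-of-range lanes saturate to `i32::MIN`/`i32::MAX`.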
773 #[inline]
777 #[must_use]
778 pub fn round_int(self) -> i32x4 {
779 pick! {
780 if #[cfg(target_feature="sse2")] {
781 let non_nan_mask = self.simd_eq(self);
783 let non_nan = self & non_nan_mask;
784 let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
785 let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
786 flip_to_max ^ cast
787 } else if #[cfg(target_feature="simd128")] {
788 cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
789 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
790 cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
791 } else {
792 let rounded: [f32; 4] = cast(self.round());
793 cast([
794 rounded[0] as i32,
795 rounded[1] as i32,
796 rounded[2] as i32,
797 rounded[3] as i32,
798 ])
799 }
800 }
801 }
802
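  /// Truncates each lane toward zero and returns the result as an `i32x4`.
  /// Faster than [`trunc_int`](Self::trunc_int), but NaN and out-of-range
  /// lanes produce backend-dependent values.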
803 #[inline]
807 #[must_use]
808 pub fn fast_trunc_int(self) -> i32x4 {
809 pick! {
810 if #[cfg(target_feature="sse2")] {
811 cast(truncate_m128_to_m128i(self.sse))
812 } else {
813 self.trunc_int()
814 }
815 }
816 }
817
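  /// Truncates each lane toward zero and converts to `i32x4`. NaN lanes become
  /// 0 and out-of-range lanes saturate to `i32::MIN`/`i32::MAX`.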
818 #[inline]
822 #[must_use]
823 pub fn trunc_int(self) -> i32x4 {
824 pick! {
825 if #[cfg(target_feature="sse2")] {
826 let non_nan_mask = self.simd_eq(self);
828 let non_nan = self & non_nan_mask;
829 let flip_to_max: i32x4 = cast(self.simd_ge(Self::splat(2147483648.0)));
830 let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
831 flip_to_max ^ cast
832 } else if #[cfg(target_feature="simd128")] {
833 cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
834 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
835 cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
836 } else {
837 let n: [f32;4] = cast(self);
838 cast([
839 n[0] as i32,
840 n[1] as i32,
841 n[2] as i32,
842 n[3] as i32,
843 ])
844 }
845 }
846 }
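  /// Computes `(self * m) + a`. Uses a fused multiply-add (single rounding
  /// step) when the target provides one; otherwise it is an ordinary multiply
  /// followed by an add.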
847 #[inline]
872 #[must_use]
873 pub fn mul_add(self, m: Self, a: Self) -> Self {
874 pick! {
875 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
876 Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
877 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
878 unsafe { Self { neon: vfmaq_f32(a.neon, self.neon, m.neon) } }
879 } else {
880 (self * m) + a
881 }
882 }
883 }
884
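  /// Computes `(self * m) - s`, fused on targets with FMA support, otherwise
  /// an ordinary multiply followed by a subtract.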
885 #[inline]
911 #[must_use]
912 pub fn mul_sub(self, m: Self, s: Self) -> Self {
913 pick! {
914 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
915 Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
916 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
917 unsafe { Self { neon: vfmaq_f32(vnegq_f32(s.neon), self.neon, m.neon) } }
918 } else {
919 (self * m) - s
920 }
921 }
922 }
923
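  /// Computes `a - (self * m)`, fused on targets with FMA support.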
924 #[inline]
949 #[must_use]
950 pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
951 pick! {
952 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
953 Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
954 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
955 unsafe { Self { neon: vfmsq_f32(a.neon, self.neon, m.neon) } }
956 } else {
957 a - (self * m)
958 }
959 }
960 }
961
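  /// Computes `-(self * m) - s`, fused on targets with FMA support.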
962 #[inline]
988 #[must_use]
989 pub fn mul_neg_sub(self, m: Self, s: Self) -> Self {
990 pick! {
991 if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
992 Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, s.sse) }
993 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
994 unsafe { Self { neon: vnegq_f32(vfmaq_f32(s.neon, self.neon, m.neon)) } }
995 } else {
996 -(self * m) - s
997 }
998 }
999 }
1000
1001 #[inline]
1002 #[must_use]
1003 pub fn flip_signs(self, signs: Self) -> Self {
1004 self ^ (signs & Self::from(-0.0))
1005 }
1006
1007 #[inline]
1008 #[must_use]
1009 pub fn copysign(self, sign: Self) -> Self {
1010 let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
1011 (self & magnitude_mask) | (sign & Self::from(-0.0))
1012 }
1013
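  /// Computes both the arcsine and arccosine of each lane (in radians),
  /// sharing the range reduction and polynomial evaluation between the two.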
1014 #[inline]
1015 pub fn asin_acos(self) -> (Self, Self) {
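    // minimax polynomial coefficients for the arcsine approximation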
1016 const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1019 const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1020 const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1021 const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1022 const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1023
1024 let xa = self.abs();
1025 let big = xa.simd_ge(f32x4::splat(0.5));
1026
1027 let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1028 let x2 = xa * xa;
1029 let x3 = big.blend(x1, x2);
1030
1031 let xb = x1.sqrt();
1032
1033 let x4 = big.blend(xb, xa);
1034
1035 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1036 let z = z.mul_add(x3 * x4, x4);
1037
1038 let z1 = z + z;
1039
1040 let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1042 let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1043 let acos = big.blend(z3, z4);
1044
1045 let z3 = f32x4::FRAC_PI_2 - z1;
1047 let asin = big.blend(z3, z);
1048 let asin = asin.flip_signs(self);
1049
1050 (asin, acos)
1051 }
1052
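  /// Lanewise arcsine, in radians.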
1053 #[inline]
1054 pub fn asin(self) -> Self {
1055 const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1058 const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1059 const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1060 const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1061 const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1062
1063 let xa = self.abs();
1064 let big = xa.simd_ge(f32x4::splat(0.5));
1065
1066 let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1067 let x2 = xa * xa;
1068 let x3 = big.blend(x1, x2);
1069
1070 let xb = x1.sqrt();
1071
1072 let x4 = big.blend(xb, xa);
1073
1074 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1075 let z = z.mul_add(x3 * x4, x4);
1076
1077 let z1 = z + z;
1078
1079 let z3 = f32x4::FRAC_PI_2 - z1;
1081 let asin = big.blend(z3, z);
1082 let asin = asin.flip_signs(self);
1083
1084 asin
1085 }
1086
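  /// Lanewise arccosine, in radians.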
1087 #[inline]
1088 #[must_use]
1089 pub fn acos(self) -> Self {
1090 const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1093 const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1094 const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1095 const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1096 const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1097
1098 let xa = self.abs();
1099 let big = xa.simd_ge(f32x4::splat(0.5));
1100
1101 let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1102 let x2 = xa * xa;
1103 let x3 = big.blend(x1, x2);
1104
1105 let xb = x1.sqrt();
1106
1107 let x4 = big.blend(xb, xa);
1108
1109 let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1110 let z = z.mul_add(x3 * x4, x4);
1111
1112 let z1 = z + z;
1113
1114 let z3 = self.simd_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1116 let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1117 let acos = big.blend(z3, z4);
1118
1119 acos
1120 }
1121
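  /// Lanewise arctangent, in radians.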
1122 #[inline]
1123 pub fn atan(self) -> Self {
1124 const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1127 const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1128 const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1129 const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1130
1131 let t = self.abs();
1132
1133 let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1137 let notbig = t.simd_le(Self::SQRT_2 + Self::ONE);
1138
1139 let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1140 s = notsmal & s;
1141
1142 let mut a = notbig & t;
1143 a = notsmal.blend(a - Self::ONE, a);
1144 let mut b = notbig & Self::ONE;
1145 b = notsmal.blend(b + t, b);
1146 let z = a / b;
1147
1148 let zz = z * z;
1149
1150 let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1152 re = re.mul_add(zz * z, z) + s;
1153
1154 re = (self.sign_bit()).blend(-re, re);
1156
1157 re
1158 }
1159
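  /// Lanewise four-quadrant arctangent of `self` (the y coordinate) and `x`,
  /// in radians.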
1160 #[inline]
1161 pub fn atan2(self, x: Self) -> Self {
1162 const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1165 const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1166 const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1167 const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1168
1169 let y = self;
1170
1171 let x1 = x.abs();
1173 let y1 = y.abs();
1174 let swapxy = y1.simd_gt(x1);
1175 let mut x2 = swapxy.blend(y1, x1);
1177 let mut y2 = swapxy.blend(x1, y1);
1178
1179 let both_infinite = x.is_inf() & y.is_inf();
1181 if both_infinite.any() {
1182 let minus_one = -Self::ONE;
1183 x2 = both_infinite.blend(x2 & minus_one, x2);
1184 y2 = both_infinite.blend(y2 & minus_one, y2);
1185 }
1186
1187 let t = y2 / x2;
1189
1190 let notsmal = t.simd_ge(Self::SQRT_2 - Self::ONE);
1193
1194 let a = notsmal.blend(t - Self::ONE, t);
1195 let b = notsmal.blend(t + Self::ONE, Self::ONE);
1196 let s = notsmal & Self::FRAC_PI_4;
1197 let z = a / b;
1198
1199 let zz = z * z;
1200
1201 let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1203 re = re.mul_add(zz * z, z) + s;
1204
1205 re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1207 re = ((x | y).simd_eq(Self::ZERO)).blend(Self::ZERO, re);
1208 re = (x.sign_bit()).blend(Self::PI - re, re);
1209
1210 re = (y.sign_bit()).blend(-re, re);
1212
1213 re
1214 }
1215
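  /// Computes the sine and cosine of each lane (in radians) in a single pass,
  /// sharing the argument reduction between the two results.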
1216 #[inline]
1217 #[must_use]
1218 pub fn sin_cos(self) -> (Self, Self) {
1219 const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
1223 const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1224 const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1225
1226 const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
1227 const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
1228 const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);
1229
1230 const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
1231 const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
1232 const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);
1233
1234 const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1235
1236 let xa = self.abs();
1237
1238 let y = (xa * TWO_OVER_PI).round();
1240 let q: i32x4 = y.round_int();
1241
1242 let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1243
1244 let x2 = x * x;
1245 let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1246 let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1247 + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));
1248
1249 let swap = !(q & i32x4::from(1)).simd_eq(i32x4::from(0));
1250
1251 let mut overflow: f32x4 = cast(q.simd_gt(i32x4::from(0x2000000)));
1252 overflow &= xa.is_finite();
1253 s = overflow.blend(f32x4::from(0.0), s);
1254 c = overflow.blend(f32x4::from(1.0), c);
1255
1256 let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
1258 let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
1259 sin1 = sin1.flip_signs(cast(sign_sin));
1260
1261 let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
1263 let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
1264 cos1 ^= cast::<_, f32x4>(sign_cos);
1265
1266 (sin1, cos1)
1267 }
1268
1269 #[inline]
1270 #[must_use]
1271 pub fn sin(self) -> Self {
1272 let (s, _) = self.sin_cos();
1273 s
1274 }
1275 #[inline]
1276 #[must_use]
1277 pub fn cos(self) -> Self {
1278 let (_, c) = self.sin_cos();
1279 c
1280 }
1281 #[inline]
1282 #[must_use]
1283 pub fn tan(self) -> Self {
1284 let (s, c) = self.sin_cos();
1285 s / c
1286 }
1287 #[inline]
1288 #[must_use]
1289 pub fn to_degrees(self) -> Self {
1290 const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1291 self * RAD_TO_DEG_RATIO
1292 }
1293 #[inline]
1294 #[must_use]
1295 pub fn to_radians(self) -> Self {
1296 const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1297 self * DEG_TO_RAD_RATIO
1298 }
1299 #[inline]
1300 #[must_use]
1301 pub fn recip(self) -> Self {
1302 pick! {
1303 if #[cfg(target_feature="sse")] {
1304 Self { sse: reciprocal_m128(self.sse) }
1305 } else if #[cfg(target_feature="simd128")] {
1306 Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1307 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1308 unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1309 } else {
1310 Self { arr: [
1311 1.0 / self.arr[0],
1312 1.0 / self.arr[1],
1313 1.0 / self.arr[2],
1314 1.0 / self.arr[3],
1315 ]}
1316 }
1317 }
1318 }
1319 #[inline]
1320 #[must_use]
1321 pub fn recip_sqrt(self) -> Self {
1322 pick! {
1323 if #[cfg(target_feature="sse")] {
1324 Self { sse: reciprocal_sqrt_m128(self.sse) }
1325 } else if #[cfg(target_feature="simd128")] {
1326 Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1327 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1328 unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1329 } else if #[cfg(feature="std")] {
1330 Self { arr: [
1331 1.0 / self.arr[0].sqrt(),
1332 1.0 / self.arr[1].sqrt(),
1333 1.0 / self.arr[2].sqrt(),
1334 1.0 / self.arr[3].sqrt(),
1335 ]}
1336 } else {
1337 Self { arr: [
1338 1.0 / software_sqrt(self.arr[0] as f64) as f32,
1339 1.0 / software_sqrt(self.arr[1] as f64) as f32,
1340 1.0 / software_sqrt(self.arr[2] as f64) as f32,
1341 1.0 / software_sqrt(self.arr[3] as f64) as f32,
1342 ]}
1343 }
1344 }
1345 }
1346 #[inline]
1347 #[must_use]
1348 pub fn sqrt(self) -> Self {
1349 pick! {
1350 if #[cfg(target_feature="sse")] {
1351 Self { sse: sqrt_m128(self.sse) }
1352 } else if #[cfg(target_feature="simd128")] {
1353 Self { simd: f32x4_sqrt(self.simd) }
1354 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1355 unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1356 } else if #[cfg(feature="std")] {
1357 Self { arr: [
1358 self.arr[0].sqrt(),
1359 self.arr[1].sqrt(),
1360 self.arr[2].sqrt(),
1361 self.arr[3].sqrt(),
1362 ]}
1363 } else {
1364 Self { arr: [
1365 software_sqrt(self.arr[0] as f64) as f32,
1366 software_sqrt(self.arr[1] as f64) as f32,
1367 software_sqrt(self.arr[2] as f64) as f32,
1368 software_sqrt(self.arr[3] as f64) as f32,
1369 ]}
1370 }
1371 }
1372 }
1373
1374 #[inline]
1375 #[must_use]
1376 pub fn to_bitmask(self) -> u32 {
1377 pick! {
1378 if #[cfg(target_feature="sse")] {
1379 move_mask_m128(self.sse) as u32
1380 } else if #[cfg(target_feature="simd128")] {
1381 u32x4_bitmask(self.simd) as u32
1382 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1383 unsafe
1384 {
1385 let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1387
1388 let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
1390 let r = vandq_u32(masked, selectbit);
1391
1392 vaddvq_u32(r) as u32
1394 }
1395 } else {
1396 (((self.arr[0].to_bits() as i32) < 0) as u32) << 0 |
1397 (((self.arr[1].to_bits() as i32) < 0) as u32) << 1 |
1398 (((self.arr[2].to_bits() as i32) < 0) as u32) << 2 |
1399 (((self.arr[3].to_bits() as i32) < 0) as u32) << 3
1400 }
1401 }
1402 }
1403 #[inline]
1404 #[must_use]
1405 pub fn any(self) -> bool {
1406 pick! {
1407 if #[cfg(target_feature="simd128")] {
1408 v128_any_true(self.simd)
1409 } else {
1410 self.to_bitmask() != 0
1411 }
1412 }
1413 }
1414 #[inline]
1415 #[must_use]
1416 pub fn all(self) -> bool {
1417 pick! {
1418 if #[cfg(target_feature="simd128")] {
1419 u32x4_all_true(self.simd)
1420 } else {
1421 self.to_bitmask() == 0b1111
1423 }
1424 }
1425 }
1426 #[inline]
1427 #[must_use]
1428 pub fn none(self) -> bool {
1429 !self.any()
1430 }
1431
1432 #[inline]
1433 fn vm_pow2n(self) -> Self {
1434 const_f32_as_f32x4!(pow2_23, 8388608.0);
1435 const_f32_as_f32x4!(bias, 127.0);
1436 let a = self + (bias + pow2_23);
1437 let c = cast::<_, i32x4>(a) << 23;
1438 cast::<_, f32x4>(c)
1439 }
1440
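  /// Lanewise `e^self`. Lanes outside roughly ±87.3, and non-finite lanes,
  /// yield 0.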
1441 #[inline]
1443 #[must_use]
1444 pub fn exp(self) -> Self {
1445 const_f32_as_f32x4!(P0, 1.0 / 2.0);
1446 const_f32_as_f32x4!(P1, 1.0 / 6.0);
1447 const_f32_as_f32x4!(P2, 1. / 24.);
1448 const_f32_as_f32x4!(P3, 1. / 120.);
1449 const_f32_as_f32x4!(P4, 1. / 720.);
1450 const_f32_as_f32x4!(P5, 1. / 5040.);
1451 const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1452 const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1453 let max_x = f32x4::from(87.3);
1454 let r = (self * Self::LOG2_E).round();
1455 let x = r.mul_neg_add(LN2D_HI, self);
1456 let x = r.mul_neg_add(LN2D_LO, x);
1457 let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1458 let x2 = x * x;
1459 let z = z.mul_add(x2, x);
1460 let n2 = Self::vm_pow2n(r);
1461 let z = (z + Self::ONE) * n2;
1462 let in_range = self.abs().simd_lt(max_x);
1464 let in_range = in_range & self.is_finite();
1465 in_range.blend(z, Self::ZERO)
1466 }
1467
1468 #[inline]
1469 fn exponent(self) -> f32x4 {
1470 const_f32_as_f32x4!(pow2_23, 8388608.0);
1471 const_f32_as_f32x4!(bias, 127.0);
1472 let a = cast::<_, u32x4>(self);
1473 let b = a >> 23;
1474 let c = b | cast::<_, u32x4>(pow2_23);
1475 let d = cast::<_, f32x4>(c);
1476 let e = d - (pow2_23 + bias);
1477 e
1478 }
1479
1480 #[inline]
1481 fn fraction_2(self) -> Self {
1482 let t1 = cast::<_, u32x4>(self);
1483 let t2 = cast::<_, u32x4>(
1484 (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1485 );
1486 cast::<_, f32x4>(t2)
1487 }
1488 #[inline]
1489 fn is_zero_or_subnormal(self) -> Self {
1490 let t = cast::<_, i32x4>(self);
1491 let t = t & i32x4::splat(0x7F800000);
1492 i32x4::round_float(t.simd_eq(i32x4::splat(0)))
1493 }
1494 #[inline]
1495 fn infinity() -> Self {
1496 cast::<_, f32x4>(i32x4::splat(0x7F800000))
1497 }
1498 #[inline]
1499 fn nan_log() -> Self {
1500 cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1501 }
1502 #[inline]
1503 fn nan_pow() -> Self {
1504 cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1505 }
1506 #[inline]
1507 pub fn sign_bit(self) -> Self {
1508 let t1 = cast::<_, i32x4>(self);
1509 let t2 = t1 >> 31;
1510 !cast::<_, f32x4>(t2).simd_eq(f32x4::ZERO)
1511 }
1512
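  /// Horizontal sum of all four lanes.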
1513 #[inline]
1515 #[must_use]
1516 pub fn reduce_add(self) -> f32 {
1517 let arr: [f32; 4] = cast(self);
1518 arr.iter().sum()
1519 }
1520
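  /// Lanewise natural logarithm.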
1521 #[inline]
1523 #[must_use]
1524 pub fn ln(self) -> Self {
1525 const_f32_as_f32x4!(HALF, 0.5);
1526 const_f32_as_f32x4!(P0, 3.3333331174E-1);
1527 const_f32_as_f32x4!(P1, -2.4999993993E-1);
1528 const_f32_as_f32x4!(P2, 2.0000714765E-1);
1529 const_f32_as_f32x4!(P3, -1.6668057665E-1);
1530 const_f32_as_f32x4!(P4, 1.4249322787E-1);
1531 const_f32_as_f32x4!(P5, -1.2420140846E-1);
1532 const_f32_as_f32x4!(P6, 1.1676998740E-1);
1533 const_f32_as_f32x4!(P7, -1.1514610310E-1);
1534 const_f32_as_f32x4!(P8, 7.0376836292E-2);
1535 const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1536 const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1537 const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1538
1539 let x1 = self;
1540 let x = Self::fraction_2(x1);
1541 let e = Self::exponent(x1);
1542 let mask = x.simd_gt(Self::SQRT_2 * HALF);
1543 let x = (!mask).blend(x + x, x);
1544 let fe = mask.blend(e + Self::ONE, e);
1545 let x = x - Self::ONE;
1546 let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1547 let x2 = x * x;
1548 let res = x2 * x * res;
1549 let res = fe.mul_add(LN2F_LO, res);
1550 let res = res + x2.mul_neg_add(HALF, x);
1551 let res = fe.mul_add(LN2F_HI, res);
1552 let overflow = !self.is_finite();
1553 let underflow = x1.simd_lt(VM_SMALLEST_NORMAL);
1554 let mask = overflow | underflow;
1555 if !mask.any() {
1556 res
1557 } else {
1558 let is_zero = self.is_zero_or_subnormal();
1559 let res = underflow.blend(Self::nan_log(), res);
1560 let res = is_zero.blend(Self::infinity(), res);
1561 let res = overflow.blend(self, res);
1562 res
1563 }
1564 }
1565
1566 #[inline]
1567 #[must_use]
1568 pub fn log2(self) -> Self {
1569 Self::ln(self) * Self::LOG2_E
1570 }
1571 #[inline]
1572 #[must_use]
1573 pub fn log10(self) -> Self {
1574 Self::ln(self) * Self::LOG10_E
1575 }
1576
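  /// Lanewise `self` raised to the power of the matching lane of `y`,
  /// computed via an extended-precision logarithm/exponential decomposition.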
1577 #[inline]
1578 #[must_use]
1579 pub fn pow_f32x4(self, y: f32x4) -> Self {
1580 const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1581 const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1582 const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1583 const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1584 const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1585 const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1586 const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1587 const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1588 const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1589 const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1590 const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1591
    const_f32_as_f32x4!(p2expf, 1.0 / 2.0);
    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1594 const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1595 const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1596 const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1597 const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1598
1599 let x1 = self.abs();
1600 let x = x1.fraction_2();
1601
1602 let mask = x.simd_gt(f32x4::SQRT_2 * f32x4::HALF);
1603 let x = (!mask).blend(x + x, x);
1604
1605 let x = x - f32x4::ONE;
1606 let x2 = x * x;
1607 let lg1 = polynomial_8!(
1608 x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1609 );
1610 let lg1 = lg1 * x2 * x;
1611
1612 let ef = x1.exponent();
1613 let ef = mask.blend(ef + f32x4::ONE, ef);
1614
1615 let e1 = (ef * y).round();
1616 let yr = ef.mul_sub(y, e1);
1617
1618 let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1619 let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1620 let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1621
1622 let e2 = (lg * y * f32x4::LOG2_E).round();
1623 let v = lg.mul_sub(y, e2 * ln2f_hi);
1624 let v = e2.mul_neg_add(ln2f_lo, v);
1625 let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1626
1627 let x = v;
1628 let e3 = (x * f32x4::LOG2_E).round();
1629 let x = e3.mul_neg_add(f32x4::LN_2, x);
1630 let x2 = x * x;
1631 let z = x2.mul_add(
1632 polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1633 x + f32x4::ONE,
1634 );
1635
1636 let ee = e1 + e2 + e3;
1637 let ei = cast::<_, i32x4>(ee.round_int());
1638 let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1639
1640 let overflow = cast::<_, f32x4>(ej.simd_gt(i32x4::splat(0x0FF)))
1641 | (ee.simd_gt(f32x4::splat(300.0)));
1642 let underflow = cast::<_, f32x4>(ej.simd_lt(i32x4::splat(0x000)))
1643 | (ee.simd_lt(f32x4::splat(-300.0)));
1644
1645 let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1647
1648 let z = if (overflow | underflow).any() {
1650 let z = underflow.blend(f32x4::ZERO, z);
1651 overflow.blend(Self::infinity(), z)
1652 } else {
1653 z
1654 };
1655
1656 let x_zero = self.is_zero_or_subnormal();
1658 let z = x_zero.blend(
1659 y.simd_lt(f32x4::ZERO).blend(
1660 Self::infinity(),
1661 y.simd_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1662 ),
1663 z,
1664 );
1665
1666 let x_sign = self.sign_bit();
1667 let z = if x_sign.any() {
1668 let yi = y.simd_eq(y.round());
1670 let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1672
1673 let z1 =
1674 yi.blend(z | y_odd, self.simd_eq(Self::ZERO).blend(z, Self::nan_pow()));
1675 x_sign.blend(z1, z)
1676 } else {
1677 z
1678 };
1679
1680 let x_finite = self.is_finite();
1681 let y_finite = y.is_finite();
1682 let e_finite = ee.is_finite();
1683 if (x_finite & y_finite & (e_finite | x_zero)).all() {
1684 return z;
1685 }
1686
1687 (self.is_nan() | y.is_nan()).blend(self + y, z)
1688 }
1689
1690 #[inline]
1691 pub fn powf(self, y: f32) -> Self {
1692 Self::pow_f32x4(self, f32x4::splat(y))
1693 }
1694
1695 #[must_use]
1696 #[inline]
1697 pub fn unpack_lo(self, b: Self) -> Self {
1698 pick! {
1699 if #[cfg(target_feature="sse")] {
1700 Self { sse: unpack_low_m128(self.sse, b.sse) }
1701 } else if #[cfg(target_feature="simd128")] {
1702 Self {
1703 simd: u32x4_shuffle::<0, 4, 1, 5>(self.simd, b.simd)
1704 }
1705 } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))]{
1706 unsafe {Self { neon: vzip1q_f32(self.neon, b.neon) }}
1707 } else {
1708 Self { arr: [
1709 self.arr[0],
1710 b.arr[0],
1711 self.arr[1],
1712 b.arr[1],
1713 ]}
1714 }
1715 }
1716 }
1717
1718 #[must_use]
1719 #[inline]
1720 pub fn unpack_hi(self, b: Self) -> Self {
1721 pick! {
1722 if #[cfg(target_feature="sse")] {
1723 Self { sse: unpack_high_m128(self.sse, b.sse) }
1724 } else if #[cfg(target_feature="simd128")] {
1725 Self {
1726 simd: u32x4_shuffle::<2, 6, 3, 7>(self.simd, b.simd)
1727 }
1728 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1729 unsafe {Self { neon: vzip2q_f32(self.neon, b.neon) }}
1730 } else {
1731 Self { arr: [
1732 self.arr[2],
1733 b.arr[2],
1734 self.arr[3],
1735 b.arr[3],
1736 ]}
1737 }
1738 }
1739 }
1740
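  /// Treats the four vectors as the rows of a 4x4 matrix and returns the
  /// matrix transposed (rows become columns).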
1741 #[must_use]
1743 #[inline]
1744 pub fn transpose(data: [f32x4; 4]) -> [f32x4; 4] {
1745 pick! {
1746 if #[cfg(target_feature="sse")] {
1747 let mut e0 = data[0];
1748 let mut e1 = data[1];
1749 let mut e2 = data[2];
1750 let mut e3 = data[3];
1751
1752 transpose_four_m128(&mut e0.sse, &mut e1.sse, &mut e2.sse, &mut e3.sse);
1753
1754 [e0, e1, e2, e3]
1755 } else if #[cfg(any(all(target_feature="neon",target_arch="aarch64"), target_feature="simd128"))] {
1756 let a = data[0].unpack_lo(data[2]);
1757 let b = data[1].unpack_lo(data[3]);
1758 let c = data[0].unpack_hi(data[2]);
1759 let d = data[1].unpack_hi(data[3]);
1760
1761 [
1762 a.unpack_lo(b),
1763 a.unpack_hi(b),
1764 c.unpack_lo(d),
1765 c.unpack_hi(d),
1766 ]
1767 } else {
1768 #[inline(always)]
1769 fn transpose_column(data: &[f32x4; 4], index: usize) -> f32x4 {
1770 f32x4::new([
1771 data[0].as_array()[index],
1772 data[1].as_array()[index],
1773 data[2].as_array()[index],
1774 data[3].as_array()[index],
1775 ])
1776 }
1777
1778 [
1779 transpose_column(&data, 0),
1780 transpose_column(&data, 1),
1781 transpose_column(&data, 2),
1782 transpose_column(&data, 3),
1783 ]
1784 }
1785 }
1786 }
1787
1788 #[inline]
1789 pub fn to_array(self) -> [f32; 4] {
1790 cast(self)
1791 }
1792
1793 #[inline]
1794 pub fn as_array(&self) -> &[f32; 4] {
1795 cast_ref(self)
1796 }
1797
1798 #[inline]
1799 pub fn as_mut_array(&mut self) -> &mut [f32; 4] {
1800 cast_mut(self)
1801 }
1802
1803 #[inline]
1804 pub fn from_i32x4(v: i32x4) -> Self {
1805 pick! {
1806 if #[cfg(target_feature="sse2")] {
1807 Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
1808 } else if #[cfg(target_feature="simd128")] {
1809 Self { simd: f32x4_convert_i32x4(v.simd) }
1810 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1811 Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
1812 } else {
1813 Self { arr: [
1814 v.as_array()[0] as f32,
1815 v.as_array()[1] as f32,
1816 v.as_array()[2] as f32,
1817 v.as_array()[3] as f32,
1818 ] }
1819 }
1820 }
1821 }
1822}