// wide/i32x4_.rs

use super::*;

pick! {
  if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i32x4 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct i32x4 { pub(crate) simd: v128 }

    impl Default for i32x4 {
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i32x4 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(i32x4_eq(self.simd, other.simd))
      }
    }

    impl Eq for i32x4 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct i32x4 { pub(crate) neon : int32x4_t }

    impl Default for i32x4 {
      #[inline]
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i32x4 {
      #[inline]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_s32(self.neon, other.neon))==u32::MAX }
      }
    }

    impl Eq for i32x4 { }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i32x4 { pub(crate) arr: [i32;4] }
  }
}

int_uint_consts!(i32, 4, i32x4, 128);

unsafe impl Zeroable for i32x4 {}
unsafe impl Pod for i32x4 {}

impl AlignTo for i32x4 {
  type Elem = i32;
}

impl Add for i32x4 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_s32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_add(rhs.arr[0]),
          self.arr[1].wrapping_add(rhs.arr[1]),
          self.arr[2].wrapping_add(rhs.arr[2]),
          self.arr[3].wrapping_add(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl Sub for i32x4 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vsubq_s32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].wrapping_sub(rhs.arr[0]),
          self.arr[1].wrapping_sub(rhs.arr[1]),
          self.arr[2].wrapping_sub(rhs.arr[2]),
          self.arr[3].wrapping_sub(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl Mul for i32x4 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: mul_32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmulq_s32(self.neon, rhs.neon) }}
      } else {
        let arr1: [i32; 4] = cast(self);
        let arr2: [i32; 4] = cast(rhs);
        cast([
          arr1[0].wrapping_mul(arr2[0]),
          arr1[1].wrapping_mul(arr2[1]),
          arr1[2].wrapping_mul(arr2[2]),
          arr1[3].wrapping_mul(arr2[3]),
        ])
      }
    }
  }
}

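/// Adds the scalar to every lane (the scalar is splatted first).
///
/// A doc-test sketch, not from the original source; it assumes these types are
/// re-exported at the crate root as `wide::*`:
/// ```
/// # use wide::*;
/// let v = i32x4::new([1, 2, 3, 4]);
/// assert_eq!((v + 10).to_array(), [11, 12, 13, 14]);
/// assert_eq!((10 + v).to_array(), [11, 12, 13, 14]);
/// ```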
impl Add<i32> for i32x4 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: i32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i32> for i32x4 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: i32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i32> for i32x4 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: i32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i32x4> for i32 {
  type Output = i32x4;
  #[inline]
  fn add(self, rhs: i32x4) -> Self::Output {
    i32x4::splat(self).add(rhs)
  }
}

impl Sub<i32x4> for i32 {
  type Output = i32x4;
  #[inline]
  fn sub(self, rhs: i32x4) -> Self::Output {
    i32x4::splat(self).sub(rhs)
  }
}

impl Mul<i32x4> for i32 {
  type Output = i32x4;
  #[inline]
  fn mul(self, rhs: i32x4) -> Self::Output {
    i32x4::splat(self).mul(rhs)
  }
}

impl BitAnd for i32x4 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitand_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vandq_s32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitand(rhs.arr[0]),
          self.arr[1].bitand(rhs.arr[1]),
          self.arr[2].bitand(rhs.arr[2]),
          self.arr[3].bitand(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl BitOr for i32x4 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vorrq_s32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitor(rhs.arr[0]),
          self.arr[1].bitor(rhs.arr[1]),
          self.arr[2].bitor(rhs.arr[2]),
          self.arr[3].bitor(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl BitXor for i32x4 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: veorq_s32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitxor(rhs.arr[0]),
          self.arr[1].bitxor(rhs.arr[1]),
          self.arr[2].bitxor(rhs.arr[2]),
          self.arr[3].bitxor(rhs.arr[3]),
        ]}
      }
    }
  }
}

macro_rules! impl_shl_t_for_i32x4 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i32x4 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u32_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i32x4_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_s32(self.neon, vmovq_n_s32(rhs as i32)) }}
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shl(u),
              self.arr[1].wrapping_shl(u),
              self.arr[2].wrapping_shl(u),
              self.arr[3].wrapping_shl(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_i32x4 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i32x4 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_i32_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i32x4_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_s32(self.neon, vmovq_n_s32( -(rhs as i32))) }}
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shr(u),
              self.arr[1].wrapping_shr(u),
              self.arr[2].wrapping_shr(u),
              self.arr[3].wrapping_shr(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shr_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

/// Shifts each lane right by the count in the corresponding lane of `rhs`.
///
/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shr`)
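///
/// A doc-test sketch, not from the original source (assumes the crate root
/// re-exports these types as `wide::*`); counts are masked to `0..=31`, so a
/// count of 33 behaves like 1, and the shift is arithmetic:
/// ```
/// # use wide::*;
/// let v = i32x4::new([-8, 8, 8, 8]);
/// let by = i32x4::new([1, 2, 33, 0]);
/// assert_eq!((v >> by).to_array(), [-4, 2, 4, 8]);
/// ```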
impl Shr<i32x4> for i32x4 {
  type Output = Self;

  #[inline]
  fn shr(self, rhs: i32x4) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 31 to have same behavior on all platforms
        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
        Self { sse: shr_each_i32_m128i(self.sse, shift_by) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          // mask the shift count to 31 to have same behavior on all platforms
          // NEON has no variable right shift, so negate the count and use the left shift
          let shift_by = vnegq_s32(vandq_s32(rhs.neon, vmovq_n_s32(31)));
          Self { neon: vshlq_s32(self.neon, shift_by) }
        }
      } else {
        let arr: [i32; 4] = cast(self);
        let rhs: [i32; 4] = cast(rhs);
        cast([
          arr[0].wrapping_shr(rhs[0] as u32),
          arr[1].wrapping_shr(rhs[1] as u32),
          arr[2].wrapping_shr(rhs[2] as u32),
          arr[3].wrapping_shr(rhs[3] as u32),
        ])
      }
    }
  }
}

/// Shifts each lane left by the count in the corresponding lane of `rhs`.
///
/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shl`)
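///
/// A doc-test sketch (not from the original source); a count of 33 is masked
/// to 1:
/// ```
/// # use wide::*;
/// let v = i32x4::new([1, 1, 1, 1]);
/// let by = i32x4::new([0, 1, 2, 33]);
/// assert_eq!((v << by).to_array(), [1, 2, 4, 2]);
/// ```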
impl Shl<i32x4> for i32x4 {
  type Output = Self;

  #[inline]
  fn shl(self, rhs: i32x4) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 31 to have same behavior on all platforms
        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
        Self { sse: shl_each_u32_m128i(self.sse, shift_by) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          // mask the shift count to 31 to have same behavior on all platforms
          let shift_by = vandq_s32(rhs.neon, vmovq_n_s32(31));
          Self { neon: vshlq_s32(self.neon, shift_by) }
        }
      } else {
        let arr: [i32; 4] = cast(self);
        let rhs: [i32; 4] = cast(rhs);
        cast([
          arr[0].wrapping_shl(rhs[0] as u32),
          arr[1].wrapping_shl(rhs[1] as u32),
          arr[2].wrapping_shl(rhs[2] as u32),
          arr[3].wrapping_shl(rhs[3] as u32),
        ])
      }
    }
  }
}

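/// Lanewise `==`: each output lane is all ones (`-1`) where the inputs are
/// equal and `0` where they are not.
///
/// A doc-test sketch, not from the original source; it assumes the `CmpEq`
/// trait is re-exported at the crate root via `wide::*`:
/// ```
/// # use wide::*;
/// let a = i32x4::new([1, 2, 3, 4]);
/// let b = i32x4::new([1, 0, 3, 0]);
/// assert_eq!(a.simd_eq(b).to_array(), [-1, 0, -1, 0]);
/// ```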
impl CmpEq for i32x4 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_eq_mask_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s32_u32(vceqq_s32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpGt for i32x4 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_gt_mask_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s32_u32(vcgtq_s32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpLt for i32x4 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_lt_mask_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_lt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vreinterpretq_s32_u32(vcltq_s32(self.neon, rhs.neon)) }}
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl i32x4 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i32; 4]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
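  /// Bitwise mask select: each bit of the output comes from `t` where the
  /// corresponding bit of `self` is set and from `f` where it is clear. Meant
  /// for use with the all-ones / all-zeros lane masks that the comparison
  /// methods return.
  ///
  /// A doc-test sketch, not from the original source (assumes `wide::*` also
  /// brings the `CmpGt` trait into scope), computing a lanewise max by hand:
  /// ```
  /// # use wide::*;
  /// let a = i32x4::new([1, 5, 3, 7]);
  /// let b = i32x4::new([4, 2, 6, 0]);
  /// assert_eq!(a.simd_gt(b).blend(a, b).to_array(), [4, 5, 6, 7]);
  /// ```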
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vbslq_s32(vreinterpretq_u32_s32(self.neon), t.neon, f.neon) }}
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }

  /// Multiplies corresponding 32-bit lanes and returns the 64-bit results
  /// in the corresponding lanes.
  ///
  /// Effectively does two multiplies on 128-bit platforms, but is easier
  /// to use than wrapping `mul_widen_i32_odd_m128i` yourself.
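  ///
  /// A doc-test sketch, not from the original source; it assumes `i64x4` also
  /// exposes a `to_array` method:
  /// ```
  /// # use wide::*;
  /// let a = i32x4::new([1, -2, 3, i32::MAX]);
  /// let b = i32x4::new([10, 10, 10, 2]);
  /// assert_eq!(
  ///   a.mul_widen(b).to_array(),
  ///   [10, -20, 30, (i32::MAX as i64) * 2]
  /// );
  /// ```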
  #[inline]
  #[must_use]
  pub fn mul_widen(self, rhs: Self) -> i64x4 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
        let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
        cast(mul_i64_low_bits_m256i(a, b))
      } else if #[cfg(target_feature="sse4.1")] {
          let evenp = mul_widen_i32_odd_m128i(self.sse, rhs.sse);

          let oddp = mul_widen_i32_odd_m128i(
            shr_imm_u64_m128i::<32>(self.sse),
            shr_imm_u64_m128i::<32>(rhs.sse));

          i64x4 {
            a: i64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
            b: i64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}
          }
      } else if #[cfg(target_feature="simd128")] {
          i64x4 {
            a: i64x2 { simd: i64x2_extmul_low_i32x4(self.simd, rhs.simd) },
            b: i64x2 { simd: i64x2_extmul_high_i32x4(self.simd, rhs.simd) },
          }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          i64x4 { a: i64x2 { neon: vmull_s32(vget_low_s32(self.neon), vget_low_s32(rhs.neon)) },
                  b: i64x2 { neon: vmull_s32(vget_high_s32(self.neon), vget_high_s32(rhs.neon)) } }
        }
      } else {
        let a: [i32; 4] = cast(self);
        let b: [i32; 4] = cast(rhs);
        cast([
          i64::from(a[0]) * i64::from(b[0]),
          i64::from(a[1]) * i64::from(b[1]),
          i64::from(a[2]) * i64::from(b[2]),
          i64::from(a[3]) * i64::from(b[3]),
        ])
      }
    }
  }

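  /// Lanewise wrapping absolute value; like `i32::wrapping_abs`, `i32::MIN`
  /// is left unchanged.
  ///
  /// A doc-test sketch (not from the original source):
  /// ```
  /// # use wide::*;
  /// let v = i32x4::new([-1, 2, i32::MIN, -4]);
  /// assert_eq!(v.abs().to_array(), [1, 2, i32::MIN, 4]);
  /// ```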
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse: abs_i32_m128i(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vabsq_s32(self.neon) }}
      } else {
        let arr: [i32; 4] = cast(self);
        cast([
          arr[0].wrapping_abs(),
          arr[1].wrapping_abs(),
          arr[2].wrapping_abs(),
          arr[3].wrapping_abs(),
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn unsigned_abs(self) -> u32x4 {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        u32x4 { sse: abs_i32_m128i(self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        u32x4 { simd: i32x4_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {u32x4 { neon: vreinterpretq_u32_s32(vabsq_s32(self.neon)) }}
      } else {
        let arr: [i32; 4] = cast(self);
        cast([
          arr[0].unsigned_abs(),
          arr[1].unsigned_abs(),
          arr[2].unsigned_abs(),
          arr[3].unsigned_abs(),
        ])
      }
    }
  }

  /// horizontal add of all the elements of the vector
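  ///
  /// A doc-test sketch (not from the original source); lanes are summed with
  /// wrapping arithmetic:
  /// ```
  /// # use wide::*;
  /// assert_eq!(i32x4::new([1, 2, 3, 4]).reduce_add(), 10);
  /// ```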
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let hi64  = unpack_high_i64_m128i(self.sse, self.sse);
        let sum64 = add_i32_m128i(hi64, self.sse);
        let hi32  = shuffle_ai_f32_all_m128i::<0b10_11_00_01>(sum64);    // Swap the low two elements
        let sum32 = add_i32_m128i(sum64, hi32);
        get_i32_from_m128i_s(sum32)
      } else {
        let arr: [i32; 4] = cast(self);
        arr[0].wrapping_add(arr[1]).wrapping_add(
        arr[2].wrapping_add(arr[3]))
      }
    }
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i32 {
    let arr: [i32; 4] = cast(self);
    arr[0].max(arr[1]).max(arr[2].max(arr[3]))
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i32 {
    let arr: [i32; 4] = cast(self);
    arr[0].min(arr[1]).min(arr[2].min(arr[3]))
  }

  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: max_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_max(self.simd, rhs.simd) }
      } else {
        self.simd_lt(rhs).blend(rhs, self)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: min_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i32x4_min(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vminq_s32(self.neon, rhs.neon) }}
      } else {
        self.simd_lt(rhs).blend(self, rhs)
      }
    }
  }
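  /// Converts each lane to `f32`. Values of magnitude above 2^24 may not be
  /// exactly representable as `f32` and are rounded.
  ///
  /// A doc-test sketch, not from the original source; it assumes `f32x4` also
  /// exposes a `to_array` method:
  /// ```
  /// # use wide::*;
  /// let f = i32x4::new([1, -2, 3, 0]).round_float();
  /// assert_eq!(f.to_array(), [1.0, -2.0, 3.0, 0.0]);
  /// ```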
  #[inline]
  #[must_use]
  pub fn round_float(self) -> f32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        cast(convert_to_m128_from_i32_m128i(self.sse))
      } else if #[cfg(target_feature="simd128")] {
        cast(Self { simd: f32x4_convert_i32x4(self.simd) })
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        cast(unsafe {Self { neon: vreinterpretq_s32_f32(vcvtq_f32_s32(self.neon)) }})
      } else {
        let arr: [i32; 4] = cast(self);
        cast([
          arr[0] as f32,
          arr[1] as f32,
          arr[2] as f32,
          arr[3] as f32,
        ])
      }
    }
  }

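  /// Gathers the sign bit of each lane into the low 4 bits of a `u32`
  /// (lane 0 becomes bit 0).
  ///
  /// A doc-test sketch (not from the original source):
  /// ```
  /// # use wide::*;
  /// let m = i32x4::new([-1, 0, -1, 0]);
  /// assert_eq!(m.to_bitmask(), 0b0101);
  /// ```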
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // use f32 move_mask since it is the same size as i32
        move_mask_m128(cast(self.sse)) as u32
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) as u32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe
        {
          // set all to 1 if top bit is set, else 0
          let masked = vcltq_s32(self.neon, vdupq_n_s32(0));

          // select the right bit out of each lane
          let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
          let r = vandq_u32(masked, selectbit);

          // horizontally add the 32-bit lanes
          vaddvq_u32(r) as u32
        }
      } else {
        ((self.arr[0] < 0) as u32) << 0 |
        ((self.arr[1] < 0) as u32) << 1 |
        ((self.arr[2] < 0) as u32) << 2 |
        ((self.arr[3] < 0) as u32) << 3
      }
    }
  }

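  /// Returns `true` if the sign bit of any lane is set, i.e. if any lane of a
  /// comparison mask is "true".
  ///
  /// A doc-test sketch, not from the original source (assumes `wide::*` also
  /// brings the `CmpGt` trait into scope), covering `any`, `all`, and `none`:
  /// ```
  /// # use wide::*;
  /// let a = i32x4::new([1, 2, 3, 4]);
  /// assert!(a.simd_gt(i32x4::splat(2)).any());
  /// assert!(!a.simd_gt(i32x4::splat(2)).all());
  /// assert!(a.simd_gt(i32x4::splat(9)).none());
  /// ```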
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // use f32 move_mask since it is the same size as i32
        move_mask_m128(cast(self.sse)) != 0
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) != 0
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        // some lanes are negative
        unsafe {
          vminvq_s32(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        ((v[0] | v[1]) & 0x8000000080000000) != 0
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // use f32 move_mask since it is the same size as i32
        move_mask_m128(cast(self.sse)) == 0b1111
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) == 0b1111
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        // all lanes are negative
        unsafe {
          vmaxvq_s32(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Transposes a 4x4 matrix of `i32` lanes. Currently only accelerated on SSE.
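  ///
  /// A doc-test sketch (not from the original source); row `i` of the result
  /// holds column `i` of the input:
  /// ```
  /// # use wide::*;
  /// let rows = [
  ///   i32x4::new([ 1,  2,  3,  4]),
  ///   i32x4::new([ 5,  6,  7,  8]),
  ///   i32x4::new([ 9, 10, 11, 12]),
  ///   i32x4::new([13, 14, 15, 16]),
  /// ];
  /// let t = i32x4::transpose(rows);
  /// assert_eq!(t[0].to_array(), [1, 5, 9, 13]);
  /// assert_eq!(t[3].to_array(), [4, 8, 12, 16]);
  /// ```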
  #[must_use]
  #[inline]
  pub fn transpose(data: [i32x4; 4]) -> [i32x4; 4] {
    pick! {
      if #[cfg(target_feature="sse")] {
        let mut e0 = data[0];
        let mut e1 = data[1];
        let mut e2 = data[2];
        let mut e3 = data[3];

        transpose_four_m128(
          cast_mut(&mut e0.sse),
          cast_mut(&mut e1.sse),
          cast_mut(&mut e2.sse),
          cast_mut(&mut e3.sse),
        );

        [e0, e1, e2, e3]
      } else {
        #[inline(always)]
        fn transpose_column(data: &[i32x4; 4], index: usize) -> i32x4 {
          i32x4::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i32; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[i32; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [i32; 4] {
    cast_mut(self)
  }
}