
wide/u64x2_.rs

use super::*;

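// `pick!` selects one backing representation at compile time based on the
// enabled target features: SSE2, WASM simd128, AArch64 NEON, or a plain
// two-element array fallback.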
pick! {
  if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct u64x2 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct u64x2 { pub(crate) simd: v128 }

    impl Default for u64x2 {
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for u64x2 {
      fn eq(&self, other: &Self) -> bool {
        u64x2_all_true(u64x2_eq(self.simd, other.simd))
      }
    }

    impl Eq for u64x2 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct u64x2 { pub(crate) neon : uint64x2_t }

    impl Default for u64x2 {
      #[inline]
      fn default() -> Self {
        unsafe { Self { neon: vdupq_n_u64(0)} }
      }
    }

    impl PartialEq for u64x2 {
      #[inline]
      fn eq(&self, other: &Self) -> bool {
        unsafe {
          vgetq_lane_u64(self.neon,0) == vgetq_lane_u64(other.neon,0) &&
          vgetq_lane_u64(self.neon,1) == vgetq_lane_u64(other.neon,1)
        }
      }
    }

    impl Eq for u64x2 { }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct u64x2 { arr: [u64;2] }
  }
}

int_uint_consts!(u64, 2, u64x2, 128);

unsafe impl Zeroable for u64x2 {}
unsafe impl Pod for u64x2 {}

impl AlignTo for u64x2 {
  type Elem = u64;
}

impl Add for u64x2 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_i64_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u64x2_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_u64(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_add(rhs.arr[0]),
          self.arr[1].wrapping_add(rhs.arr[1]),
        ]}
      }
    }
  }
}

impl Sub for u64x2 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_i64_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u64x2_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vsubq_u64(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_sub(rhs.arr[0]),
          self.arr[1].wrapping_sub(rhs.arr[1]),
        ]}
      }
    }
  }
}

// TODO: implement this with SIMD on SSE2; every non-wasm backend currently
// falls back to the scalar per-lane multiply below.
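/// Lanewise wrapping multiplication.
///
/// Example (a sketch; assumes this type is re-exported at the crate root as
/// `wide::u64x2`):
///
/// ```
/// # use wide::u64x2;
/// let a = u64x2::new([3, u64::MAX]);
/// let b = u64x2::new([4, 2]);
/// // the second lane wraps: u64::MAX * 2 == u64::MAX - 1 (mod 2^64)
/// assert_eq!((a * b).to_array(), [12, u64::MAX - 1]);
/// ```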
impl Mul for u64x2 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="simd128")] {
        Self { simd: u64x2_mul(self.simd, rhs.simd) }
      } else {
        let arr1: [u64; 2] = cast(self);
        let arr2: [u64; 2] = cast(rhs);
        cast([
          arr1[0].wrapping_mul(arr2[0]),
          arr1[1].wrapping_mul(arr2[1]),
        ])
      }
    }
  }
}

impl Add<u64> for u64x2 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: u64) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<u64> for u64x2 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: u64) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<u64> for u64x2 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: u64) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<u64x2> for u64 {
  type Output = u64x2;
  #[inline]
  fn add(self, rhs: u64x2) -> Self::Output {
    u64x2::splat(self).add(rhs)
  }
}

impl Sub<u64x2> for u64 {
  type Output = u64x2;
  #[inline]
  fn sub(self, rhs: u64x2) -> Self::Output {
    u64x2::splat(self).sub(rhs)
  }
}

impl Mul<u64x2> for u64 {
  type Output = u64x2;
  #[inline]
  fn mul(self, rhs: u64x2) -> Self::Output {
    u64x2::splat(self).mul(rhs)
  }
}

impl BitAnd for u64x2 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitand_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vandq_u64(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitand(rhs.arr[0]),
          self.arr[1].bitand(rhs.arr[1]),
        ]}
      }
    }
  }
}

impl BitOr for u64x2 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vorrq_u64(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitor(rhs.arr[0]),
          self.arr[1].bitor(rhs.arr[1]),
        ]}
      }
    }
  }
}

impl BitXor for u64x2 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: veorq_u64(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitxor(rhs.arr[0]),
          self.arr[1].bitxor(rhs.arr[1]),
        ]}
      }
    }
  }
}

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shl`)
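///
/// Example (a sketch; assumes this type is re-exported at the crate root as
/// `wide::u64x2`):
///
/// ```
/// # use wide::u64x2;
/// let a = u64x2::new([1, 2]);
/// let by = u64x2::new([3, 65]); // 65 is masked to 1
/// assert_eq!((a << by).to_array(), [8, 4]);
/// ```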
impl Shl for u64x2 {
  type Output = Self;

  #[inline]
  fn shl(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 63 to have same behavior on all platforms
        let shift_by = rhs & Self::splat(63);
        Self { sse: shl_each_u64_m128i(self.sse, shift_by.sse) }
      } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
        unsafe {
          // mask the shift count to 63 to have same behavior on all platforms
          let shift_by = vreinterpretq_s64_u64(vandq_u64(rhs.neon, vmovq_n_u64(63)));
          Self { neon: vshlq_u64(self.neon, shift_by) }
        }
      } else {
        let arr: [u64; 2] = cast(self);
        let rhs: [u64; 2] = cast(rhs);
        cast([
          arr[0].wrapping_shl(rhs[0] as u32),
          arr[1].wrapping_shl(rhs[1] as u32),
        ])
      }
    }
  }
}

macro_rules! impl_shl_t_for_u64x2 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for u64x2 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u64_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: u64x2_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_u64(self.neon, vmovq_n_s64(rhs as i64)) }}
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shl(u),
              self.arr[1].wrapping_shl(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shl_t_for_u64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shr`)
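///
/// Example (a sketch; assumes this type is re-exported at the crate root as
/// `wide::u64x2`):
///
/// ```
/// # use wide::u64x2;
/// let a = u64x2::new([16, u64::MAX]);
/// let by = u64x2::new([2, 64]); // 64 is masked to 0
/// assert_eq!((a >> by).to_array(), [4, u64::MAX]);
/// ```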
impl Shr for u64x2 {
  type Output = Self;

  #[inline]
  fn shr(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 63 to have same behavior on all platforms
        let shift_by = rhs & Self::splat(63);
        Self { sse: shr_each_u64_m128i(self.sse, shift_by.sse) }
      } else if #[cfg(all(target_feature="neon", target_arch="aarch64"))] {
        unsafe {
          // mask the shift count to 63 to have same behavior on all platforms
          // no right shift, have to pass negative value to left shift on neon
          let shift_by = vnegq_s64(vreinterpretq_s64_u64(vandq_u64(rhs.neon, vmovq_n_u64(63))));
          Self { neon: vshlq_u64(self.neon, shift_by) }
        }
      } else {
        let arr: [u64; 2] = cast(self);
        let rhs: [u64; 2] = cast(rhs);
        cast([
          arr[0].wrapping_shr(rhs[0] as u32),
          arr[1].wrapping_shr(rhs[1] as u32),
        ])
      }
    }
  }
}

macro_rules! impl_shr_t_for_u64x2 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for u64x2 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_u64_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: u64x2_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_u64(self.neon, vmovq_n_s64(-(rhs as i64))) }}
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shr(u),
              self.arr[1].wrapping_shr(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shr_t_for_u64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

impl CmpEq for u64x2 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    Self::simd_eq(self, rhs)
  }
}

impl CmpGt for u64x2 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    Self::simd_gt(self, rhs)
  }
}

impl CmpLt for u64x2 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    // no lt, so just call gt with swapped args
    Self::simd_gt(rhs, self)
  }
}

impl u64x2 {
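  /// Builds a vector from an array of lane values.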
  #[inline]
  #[must_use]
  pub const fn new(array: [u64; 2]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
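  /// Lanewise `self == rhs`: each lane is all ones when the lanes are equal
  /// and all zeros otherwise.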
  #[inline]
  #[must_use]
  pub fn simd_eq(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: cmp_eq_mask_i64_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u64x2_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vceqq_u64(self.neon, rhs.neon) } }
      } else {
        let s: [u64;2] = cast(self);
        let r: [u64;2] = cast(rhs);
        cast([
          if s[0] == r[0] { -1_i64 } else { 0 },
          if s[1] == r[1] { -1_i64 } else { 0 },
        ])
      }
    }
  }
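  /// Lanewise unsigned `self > rhs`: each lane is all ones when the
  /// comparison holds and all zeros otherwise.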
  #[inline]
  #[must_use]
  pub fn simd_gt(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.2")] {
        // there is no unsigned 64-bit gt; flipping the sign bit of both sides
        // maps unsigned order onto signed order, so the signed compare below
        // gives the correct unsigned result
        let highbit = u64x2::splat(1 << 63);
        Self { sse: cmp_gt_mask_i64_m128i((self ^ highbit).sse, (rhs ^ highbit).sse) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vcgtq_u64(self.neon, rhs.neon) }}
      } else {
        // u64x2_gt on WASM is not a thing. https://github.com/WebAssembly/simd/pull/414
        let s: [u64;2] = cast(self);
        let r: [u64;2] = cast(rhs);
        cast([
          if s[0] > r[0] { u64::MAX } else { 0 },
          if s[1] > r[1] { u64::MAX } else { 0 },
        ])
      }
    }
  }

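  /// Lanewise unsigned `self < rhs`: each lane is all ones when the
  /// comparison holds and all zeros otherwise.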
  #[inline]
  #[must_use]
  pub fn simd_lt(self, rhs: Self) -> Self {
    // lt is just gt the other way around
    rhs.simd_gt(self)
  }

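  /// Uses `self` as a mask to pick between `t` (where the mask lane is all
  /// ones) and `f` (where it is all zeros). Intended for the full-lane masks
  /// produced by the comparison methods; results for other mask values depend
  /// on the backend.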
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vbslq_u64(self.neon, t.neon, f.neon) }}
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }

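  /// Packs the high bit of each lane into the low bits of a `u32`
  /// (bit 0 = lane 0, bit 1 = lane 1).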
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    i64x2::to_bitmask(cast(self))
  }

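  /// Returns the lanes as a plain array.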
  #[inline]
  pub fn to_array(self) -> [u64; 2] {
    cast(self)
  }

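  /// Borrows the lanes as an array reference.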
  #[inline]
  pub fn as_array(&self) -> &[u64; 2] {
    cast_ref(self)
  }

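  /// Mutably borrows the lanes as an array reference.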
  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [u64; 2] {
    cast_mut(self)
  }

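  /// Lanewise unsigned minimum.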
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    self.simd_lt(rhs).blend(self, rhs)
  }

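  /// Lanewise unsigned maximum.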
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    self.simd_gt(rhs).blend(self, rhs)
  }

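  /// Lanewise multiplication, keeping only the high 64 bits of each 128-bit
  /// product.
  ///
  /// Example (a sketch; assumes this type is re-exported at the crate root
  /// as `wide::u64x2`):
  ///
  /// ```
  /// # use wide::u64x2;
  /// let a = u64x2::splat(1 << 63);
  /// let b = u64x2::splat(4);
  /// // (2^63 * 4) >> 64 == 2 in every lane
  /// assert_eq!(a.mul_keep_high(b).to_array(), [2, 2]);
  /// ```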
  #[inline]
  #[must_use]
  pub fn mul_keep_high(self, rhs: Self) -> Self {
    let arr1: [u64; 2] = cast(self);
    let arr2: [u64; 2] = cast(rhs);
    cast([
      ((arr1[0] as u128 * arr2[0] as u128) >> 64) as u64,
      ((arr1[1] as u128 * arr2[1] as u128) >> 64) as u64,
    ])
  }
}