// wide/i32x8_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) avx2: m256i }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i32x8 { pub(crate) a : i32x4, pub(crate) b : i32x4 }
  }
}

int_uint_consts!(i32, 8, i32x8, 256);

unsafe impl Zeroable for i32x8 {}
unsafe impl Pod for i32x8 {}

impl AlignTo for i32x8 {
  type Elem = i32;
}

impl Add for i32x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: add_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for i32x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: sub_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for i32x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i32_keep_low_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Add<i32> for i32x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: i32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i32> for i32x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: i32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i32> for i32x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: i32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  fn add(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) + rhs
  }
}

impl Sub<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  fn sub(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) - rhs
  }
}

impl Mul<i32x8> for i32 {
  type Output = i32x8;
  #[inline]
  fn mul(self, rhs: i32x8) -> Self::Output {
    i32x8::splat(self) * rhs
  }
}
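
// The operator impls above are element-wise, and the scalar forms broadcast the
// scalar to every lane first. A minimal sanity-check sketch follows; the test
// module name and values are illustrative, and it assumes the crate root
// re-exports `i32x8` (as `use crate::*` expects).
#[cfg(test)]
mod i32x8_arith_example {
  use crate::*;

  #[test]
  fn add_sub_mul_are_lane_wise() {
    let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    let b = i32x8::new([8, 7, 6, 5, 4, 3, 2, 1]);
    assert_eq!((a + b).to_array(), [9; 8]);
    assert_eq!((a - b).to_array(), [-7, -5, -3, -1, 1, 3, 5, 7]);
    // a scalar on either side is splatted to all lanes
    assert_eq!((a * 2).to_array(), [2, 4, 6, 8, 10, 12, 14, 16]);
    assert_eq!((10 - a).to_array(), [9, 8, 7, 6, 5, 4, 3, 2]);
  }
}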

impl BitAnd for i32x8 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitand_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for i32x8 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for i32x8 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}
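
// A similar hedged sketch for the bitwise operators; the module name and the
// values are arbitrary illustrations.
#[cfg(test)]
mod i32x8_bitwise_example {
  use crate::*;

  #[test]
  fn bit_ops_are_lane_wise() {
    let a = i32x8::splat(0b1100);
    let b = i32x8::splat(0b1010);
    assert_eq!((a & b).to_array(), [0b1000; 8]);
    assert_eq!((a | b).to_array(), [0b1110; 8]);
    assert_eq!((a ^ b).to_array(), [0b0110; 8]);
  }
}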

macro_rules! impl_shl_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shl_all_u32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shl(rhs),
              b : self.b.shl(rhs),
            }
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
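
// Hedged sketch of the scalar `<<` impls generated above: every lane is shifted
// by the same amount, whichever integer type carries the count. Names and
// values are illustrative.
#[cfg(test)]
mod i32x8_shl_scalar_example {
  use crate::*;

  #[test]
  fn shl_scalar_shifts_all_lanes() {
    let v = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    assert_eq!((v << 3_u32).to_array(), [8, 16, 24, 32, 40, 48, 56, 64]);
    // the count type does not change the result
    assert_eq!((v << 3_u8).to_array(), (v << 3_i64).to_array());
  }
}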

macro_rules! impl_shr_t_for_i32x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i32x8 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shr_all_i32_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shr(rhs),
              b : self.b.shr(rhs),
            }
          }
        }
      }
    })+
  };
}

impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);
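
// Hedged sketch of the scalar `>>` impls: the shift is arithmetic, so the sign
// bit is preserved. Values are illustrative.
#[cfg(test)]
mod i32x8_shr_scalar_example {
  use crate::*;

  #[test]
  fn shr_scalar_is_arithmetic() {
    let v = i32x8::new([-8, -4, -2, -1, 1, 2, 4, 8]);
    assert_eq!((v >> 1_u32).to_array(), [-4, -2, -1, -1, 0, 1, 2, 4]);
  }
}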

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shr`)
impl Shr<i32x8> for i32x8 {
  type Output = Self;

  #[inline]
  fn shr(self, rhs: i32x8) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // ensure same behavior as scalar wrapping_shr by masking the shift count
        let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31));
        Self { avx2: shr_each_i32_m256i(self.avx2, shift_by) }
      } else {
        Self {
          a : self.a.shr(rhs.a),
          b : self.b.shr(rhs.b),
        }
      }
    }
  }
}
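
// Hedged sketch of the per-lane `>>` documented above: each lane uses its own
// count, and counts are masked to the bit width like `wrapping_shr`. Values
// are illustrative.
#[cfg(test)]
mod i32x8_shr_per_lane_example {
  use crate::*;

  #[test]
  fn shr_per_lane_masks_the_count() {
    let v = i32x8::splat(-64);
    let counts = i32x8::new([0, 1, 2, 3, 4, 5, 32, 33]);
    // counts of 32 and 33 wrap to 0 and 1
    assert_eq!((v >> counts).to_array(), [-64, -32, -16, -8, -4, -2, -64, -32]);
  }
}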

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shl`)
impl Shl<i32x8> for i32x8 {
  type Output = Self;

  #[inline]
  fn shl(self, rhs: i32x8) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // ensure same behavior as scalar wrapping_shl by masking the shift count
        let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31));
        // shl is the same for unsigned and signed
        Self { avx2: shl_each_u32_m256i(self.avx2, shift_by) }
      } else {
        Self {
          a : self.a.shl(rhs.a),
          b : self.b.shl(rhs.b),
        }
      }
    }
  }
}
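
// The matching hedged sketch for per-lane `<<`; again the counts wrap like
// `wrapping_shl`. Values are illustrative.
#[cfg(test)]
mod i32x8_shl_per_lane_example {
  use crate::*;

  #[test]
  fn shl_per_lane_masks_the_count() {
    let v = i32x8::splat(1);
    let counts = i32x8::new([0, 1, 2, 3, 4, 31, 32, 33]);
    // 1 << 31 wraps to i32::MIN; counts of 32 and 33 wrap to 0 and 1
    assert_eq!((v << counts).to_array(), [1, 2, 4, 8, 16, i32::MIN, 1, 2]);
  }
}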

impl CmpEq for i32x8 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.simd_eq(rhs.a),
          b : self.b.simd_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for i32x8 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.simd_gt(rhs.a),
          b : self.b.simd_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for i32x8 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // `self < rhs` is the same as `rhs > self`, so reuse the gt mask
        Self { avx2: cmp_gt_mask_i32_m256i(rhs.avx2, self.avx2) }
      } else {
        Self {
          a : self.a.simd_lt(rhs.a),
          b : self.b.simd_lt(rhs.b),
        }
      }
    }
  }
}
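
// Hedged sketch of the comparison traits above: each lane of the result is
// either all ones (-1) or all zeros, which is the mask format `blend` consumes.
// Assumes `CmpEq`/`CmpGt`/`CmpLt` are re-exported at the crate root.
#[cfg(test)]
mod i32x8_cmp_example {
  use crate::*;

  #[test]
  fn cmp_ops_yield_lane_masks() {
    let a = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
    let b = i32x8::splat(4);
    assert_eq!(a.simd_eq(b).to_array(), [0, 0, 0, -1, 0, 0, 0, 0]);
    assert_eq!(a.simd_gt(b).to_array(), [0, 0, 0, 0, -1, -1, -1, -1]);
    assert_eq!(a.simd_lt(b).to_array(), [-1, -1, -1, 0, 0, 0, 0, 0]);
  }
}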

impl From<i16x8> for i32x8 {
  #[inline]
  fn from(value: i16x8) -> Self {
    i32x8::from_i16x8(value)
  }
}

impl i32x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i32; 8]) -> Self {
    unsafe { core::mem::transmute(array) }
  }

  /// widens and sign extends to `i32x8`
  #[inline]
  #[must_use]
  pub fn from_i16x8(v: i16x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: convert_to_i32_m256i_from_i16_m128i(v.sse) }
      } else if #[cfg(target_feature="sse2")] {
        i32x8 {
          a: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_low_i16_m128i(v.sse, v.sse)) },
          b: i32x4 { sse: shr_imm_i32_m128i::<16>(unpack_high_i16_m128i(v.sse, v.sse)) },
        }
      } else {
        i32x8::new([
          i32::from(v.as_array()[0]),
          i32::from(v.as_array()[1]),
          i32::from(v.as_array()[2]),
          i32::from(v.as_array()[3]),
          i32::from(v.as_array()[4]),
          i32::from(v.as_array()[5]),
          i32::from(v.as_array()[6]),
          i32::from(v.as_array()[7]),
        ])
      }
    }
  }

  /// widens and zero extends to `i32x8`
  #[inline]
  #[must_use]
  pub fn from_u16x8(v: u16x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: convert_to_i32_m256i_from_u16_m128i(v.sse) }
      } else if #[cfg(target_feature="sse2")] {
        i32x8 {
          a: i32x4 { sse: shr_imm_u32_m128i::<16>(unpack_low_i16_m128i(v.sse, v.sse)) },
          b: i32x4 { sse: shr_imm_u32_m128i::<16>(unpack_high_i16_m128i(v.sse, v.sse)) },
        }
      } else {
        i32x8::new([
          i32::from(v.as_array()[0]),
          i32::from(v.as_array()[1]),
          i32::from(v.as_array()[2]),
          i32::from(v.as_array()[3]),
          i32::from(v.as_array()[4]),
          i32::from(v.as_array()[5]),
          i32::from(v.as_array()[6]),
          i32::from(v.as_array()[7]),
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }

  /// horizontal add of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    (arr[0] + arr[1]).reduce_add()
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].max(arr[1]).reduce_max()
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i32 {
    let arr: [i32x4; 2] = cast(self);
    arr[0].min(arr[1]).reduce_min()
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: abs_i32_m256i(self.avx2) }
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn unsigned_abs(self) -> u32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        u32x8 { avx2: abs_i32_m256i(self.avx2) }
      } else {
        u32x8 {
          a : self.a.unsigned_abs(),
          b : self.b.unsigned_abs(),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: max_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: min_i32_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn round_float(self) -> f32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        cast(convert_to_m256_from_i32_m256i(self.avx2))
      } else {
        cast([
          self.a.round_float(),
          self.b.round_float(),
        ])
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // use f32 move_mask since it is the same size as i32
        move_mask_m256(cast(self.avx2)) as u32
      } else {
        self.a.to_bitmask() | (self.b.to_bitmask() << 4)
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        move_mask_m256(cast(self.avx2)) != 0
      } else {
        (self.a | self.b).any()
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        move_mask_m256(cast(self.avx2)) == 0b11111111
      } else {
        (self.a & self.b).all()
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Transposes an 8x8 matrix of `i32` lanes. Currently only accelerated on AVX2.
  #[must_use]
  #[inline]
  pub fn transpose(data: [i32x8; 8]) -> [i32x8; 8] {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a0 = unpack_low_i32_m256i(data[0].avx2, data[1].avx2);
        let a1 = unpack_high_i32_m256i(data[0].avx2, data[1].avx2);
        let a2 = unpack_low_i32_m256i(data[2].avx2, data[3].avx2);
        let a3 = unpack_high_i32_m256i(data[2].avx2, data[3].avx2);
        let a4 = unpack_low_i32_m256i(data[4].avx2, data[5].avx2);
        let a5 = unpack_high_i32_m256i(data[4].avx2, data[5].avx2);
        let a6 = unpack_low_i32_m256i(data[6].avx2, data[7].avx2);
        let a7 = unpack_high_i32_m256i(data[6].avx2, data[7].avx2);

        pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 {
          (z << 6) | (y << 4) | (x << 2) | w
        }

        const SHUFF_LO: i32 = mm_shuffle(1, 0, 1, 0);
        const SHUFF_HI: i32 = mm_shuffle(3, 2, 3, 2);

        // possible todo: the Intel performance manual suggests an alternative using blend
        // to avoid port 5 pressure (since blend runs on a different port than shuffle)
        let b0 = cast::<m256, m256i>(shuffle_m256::<SHUFF_LO>(cast(a0), cast(a2)));
        let b1 = cast::<m256, m256i>(shuffle_m256::<SHUFF_HI>(cast(a0), cast(a2)));
        let b2 = cast::<m256, m256i>(shuffle_m256::<SHUFF_LO>(cast(a1), cast(a3)));
        let b3 = cast::<m256, m256i>(shuffle_m256::<SHUFF_HI>(cast(a1), cast(a3)));
        let b4 = cast::<m256, m256i>(shuffle_m256::<SHUFF_LO>(cast(a4), cast(a6)));
        let b5 = cast::<m256, m256i>(shuffle_m256::<SHUFF_HI>(cast(a4), cast(a6)));
        let b6 = cast::<m256, m256i>(shuffle_m256::<SHUFF_LO>(cast(a5), cast(a7)));
        let b7 = cast::<m256, m256i>(shuffle_m256::<SHUFF_HI>(cast(a5), cast(a7)));

        [
          i32x8 { avx2: permute2z_m256i::<0x20>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x20>(b3, b7) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b0, b4) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b1, b5) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b2, b6) },
          i32x8 { avx2: permute2z_m256i::<0x31>(b3, b7) }
        ]
      } else {
        // possible todo: not sure that 128-bit SIMD gives us a lot of speedup here

        #[inline(always)]
        fn transpose_column(data: &[i32x8; 8], index: usize) -> i32x8 {
          i32x8::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
            data[4].as_array()[index],
            data[5].as_array()[index],
            data[6].as_array()[index],
            data[7].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i32; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[i32; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [i32; 8] {
    cast_mut(self)
  }
}
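
// Hedged sketches of the inherent methods above (widening conversion, blend,
// reductions, bitmask, transpose). Test names and values are illustrative;
// `i16x8::new` and the crate-root re-exports are assumed to be available as in
// the rest of the crate.
#[cfg(test)]
mod i32x8_methods_example {
  use crate::*;

  #[test]
  fn from_i16x8_sign_extends() {
    let v = i16x8::new([-1, 0, 1, i16::MIN, i16::MAX, -2, 2, 3]);
    assert_eq!(
      i32x8::from_i16x8(v).to_array(),
      [-1, 0, 1, -32768, 32767, -2, 2, 3]
    );
  }

  #[test]
  fn blend_picks_t_where_mask_is_set() {
    let t = i32x8::splat(1);
    let f = i32x8::splat(0);
    let mask = i32x8::new([-1, 0, -1, 0, -1, 0, -1, 0]);
    assert_eq!(mask.blend(t, f).to_array(), [1, 0, 1, 0, 1, 0, 1, 0]);
  }

  #[test]
  fn reductions_and_bitmask() {
    let v = i32x8::new([1, -2, 3, -4, 5, -6, 7, -8]);
    assert_eq!(v.reduce_add(), -4);
    assert_eq!(v.reduce_max(), 7);
    assert_eq!(v.reduce_min(), -8);
    // one bit per lane, taken from the lane's sign bit, lane 0 in bit 0
    assert_eq!(v.to_bitmask(), 0b1010_1010);
  }

  #[test]
  fn transpose_swaps_rows_and_columns() {
    let rows: [i32x8; 8] = core::array::from_fn(|r| {
      i32x8::new(core::array::from_fn(|c| (r * 8 + c) as i32))
    });
    let t = i32x8::transpose(rows);
    for r in 0..8 {
      for c in 0..8 {
        assert_eq!(t[r].as_array()[c], rows[c].as_array()[r]);
      }
    }
  }
}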

impl Not for i32x8 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: self.avx2.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}
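
// Hedged sketch of `Not`: `!` is a lane-wise bitwise complement, so per lane
// `!x == -x - 1`. Values are illustrative.
#[cfg(test)]
mod i32x8_not_example {
  use crate::*;

  #[test]
  fn not_is_bitwise_complement() {
    let v = i32x8::new([0, 1, -1, 2, -2, i32::MAX, i32::MIN, 42]);
    assert_eq!(
      (!v).to_array(),
      [-1, -2, 0, -3, 1, i32::MIN, i32::MAX, -43]
    );
  }
}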