
// wide/i16x16_.rs

use super::*;

pick! {
  if #[cfg(target_feature="avx2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i16x16 { pub(crate) avx2: m256i }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(32))]
    pub struct i16x16 { pub(crate) a : i16x8, pub(crate) b : i16x8 }
  }
}

int_uint_consts!(i16, 16, i16x16, 256);

unsafe impl Zeroable for i16x16 {}
unsafe impl Pod for i16x16 {}

impl AlignTo for i16x16 {
  type Elem = i16;
}

impl Add for i16x16 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: add_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.add(rhs.a),
          b : self.b.add(rhs.b),
        }
      }
    }
  }
}

impl Sub for i16x16 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: sub_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.sub(rhs.a),
          b : self.b.sub(rhs.b),
        }
      }
    }
  }
}

impl Mul for i16x16 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i16_keep_low_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.mul(rhs.a),
          b : self.b.mul(rhs.b),
        }
      }
    }
  }
}

impl Add<i16> for i16x16 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: i16) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i16> for i16x16 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: i16) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i16> for i16x16 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: i16) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i16x16> for i16 {
  type Output = i16x16;
  #[inline]
  fn add(self, rhs: i16x16) -> Self::Output {
    i16x16::splat(self).add(rhs)
  }
}

impl Sub<i16x16> for i16 {
  type Output = i16x16;
  #[inline]
  fn sub(self, rhs: i16x16) -> Self::Output {
    i16x16::splat(self).sub(rhs)
  }
}

impl Mul<i16x16> for i16 {
  type Output = i16x16;
  #[inline]
  fn mul(self, rhs: i16x16) -> Self::Output {
    i16x16::splat(self).mul(rhs)
  }
}

impl BitAnd for i16x16 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitand_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitand(rhs.a),
          b : self.b.bitand(rhs.b),
        }
      }
    }
  }
}

impl BitOr for i16x16 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitor(rhs.a),
          b : self.b.bitor(rhs.b),
        }
      }
    }
  }
}

impl BitXor for i16x16 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.bitxor(rhs.a),
          b : self.b.bitxor(rhs.b),
        }
      }
    }
  }
}

macro_rules! impl_shl_t_for_i16x16 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i16x16 {
      type Output = Self;
      /// Shifts all lanes by the value given.
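      ///
      /// A minimal sketch (shifting by a `u64` here; any supported integer
      /// type behaves the same):
      ///
      /// ```
      /// use wide::*;
      /// assert_eq!((i16x16::splat(1) << 3_u64).to_array(), [8_i16; 16]);
      /// ```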
      #[inline]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shl_all_u16_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shl(rhs),
              b : self.b.shl(rhs),
            }
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_i16x16 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i16x16 {
      type Output = Self;
      /// Shifts all lanes by the value given.
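      ///
      /// This is an arithmetic shift, so the sign bit is preserved. A minimal
      /// sketch:
      ///
      /// ```
      /// use wide::*;
      /// assert_eq!((i16x16::splat(-8) >> 2_u64).to_array(), [-2_i16; 16]);
      /// ```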
      #[inline]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="avx2")] {
            let shift = cast([rhs as u64, 0]);
            Self { avx2: shr_all_i16_m256i(self.avx2, shift) }
          } else {
            Self {
              a : self.a.shr(rhs),
              b : self.b.shr(rhs),
            }
          }
        }
      }
    })+
  };
}
impl_shr_t_for_i16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

impl CmpEq for i16x16 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_eq_mask_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.simd_eq(rhs.a),
          b : self.b.simd_eq(rhs.b),
        }
      }
    }
  }
}

impl CmpGt for i16x16 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: cmp_gt_mask_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.simd_gt(rhs.a),
          b : self.b.simd_gt(rhs.b),
        }
      }
    }
  }
}

impl CmpLt for i16x16 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // `lt` is `!gt` with the `eq` lanes cleared; since the `gt` and `eq`
        // masks are disjoint, xor-ing `eq` out of `!gt` does exactly that.
        Self { avx2: !cmp_gt_mask_i16_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.simd_lt(rhs.a),
          b : self.b.simd_lt(rhs.b),
        }
      }
    }
  }
}

impl From<i8x16> for i16x16 {
  /// widen with sign extend from i8 to i16
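  ///
  /// A minimal sketch of the widening:
  ///
  /// ```
  /// use wide::*;
  /// let v = i8x16::splat(-100);
  /// assert_eq!(i16x16::from(v).to_array(), [-100_i16; 16]);
  /// ```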
  #[inline]
  fn from(i: i8x16) -> Self {
    i16x16::from_i8x16(i)
  }
}

impl From<u8x16> for i16x16 {
  /// widen with zero extend from u8 to i16
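  ///
  /// A minimal sketch of the zero extension: `200u8` stays `200` rather than
  /// sign extending to `-56`:
  ///
  /// ```
  /// use wide::*;
  /// let v = u8x16::splat(200);
  /// assert_eq!(i16x16::from(v).to_array(), [200_i16; 16]);
  /// ```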
  #[inline]
  fn from(i: u8x16) -> Self {
    cast(u16x16::from(i))
  }
}

impl Not for i16x16 {
  type Output = Self;
  #[inline]
  fn not(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: self.avx2.not() }
      } else {
        Self {
          a : self.a.not(),
          b : self.b.not(),
        }
      }
    }
  }
}

impl i16x16 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i16; 16]) -> Self {
    unsafe { core::mem::transmute(array) }
  }

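  /// Packs the sign bit of each lane into the low 16 bits of a `u32` (lane
  /// `i` maps to bit `i`), which is mainly useful on boolean-style mask
  /// vectors such as comparison results. A small sketch:
  ///
  /// ```
  /// use wide::*;
  /// let m = i16x16::new([-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0]);
  /// assert_eq!(m.to_bitmask(), 0b0101_0101_0101_0101);
  /// ```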
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let [a, b] = cast::<_, [m128i; 2]>(self);
        move_mask_i8_m128i(pack_i16_to_i8_m128i(a, b)) as u32
      } else {
        self.a.to_bitmask() | (self.b.to_bitmask() << 8)
      }
    }
  }

  /// `true` if the sign bit of any lane is set; for boolean-style mask
  /// vectors this means "any lane is true".
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10101010101010101010101010101010) != 0
      } else {
        (self.a | self.b).any()
      }
    }
  }
  /// `true` if the sign bit of every lane is set; for boolean-style mask
  /// vectors this means "all lanes are true".
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="avx2")] {
        ((move_mask_i8_m256i(self.avx2) as u32) & 0b10101010101010101010101010101010) == 0b10101010101010101010101010101010
      } else {
        (self.a & self.b).all()
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// widens and sign extends to i16x16
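  ///
  /// For example, negative lanes stay negative after widening:
  ///
  /// ```
  /// use wide::*;
  /// let v = i8x16::new([0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, -128]);
  /// assert_eq!(
  ///   i16x16::from_i8x16(v).to_array(),
  ///   [0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, -128]
  /// );
  /// ```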
  #[inline]
  #[must_use]
  pub fn from_i8x16(v: i8x16) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i16x16 { avx2: convert_to_i16_m256i_from_i8_m128i(v.sse) }
      } else if #[cfg(target_feature="sse4.1")] {
        i16x16 {
          a: i16x8 { sse: convert_to_i16_m128i_from_lower8_i8_m128i(v.sse) },
          b: i16x8 { sse: convert_to_i16_m128i_from_lower8_i8_m128i(unpack_high_i64_m128i(v.sse, v.sse)) }
        }
      } else if #[cfg(target_feature="sse2")] {
        i16x16 {
          a: i16x8 { sse: shr_imm_i16_m128i::<8>(unpack_low_i8_m128i(v.sse, v.sse)) },
          b: i16x8 { sse: shr_imm_i16_m128i::<8>(unpack_high_i8_m128i(v.sse, v.sse)) },
        }
      } else {
        i16x16::new([
          v.as_array()[0] as i16,
          v.as_array()[1] as i16,
          v.as_array()[2] as i16,
          v.as_array()[3] as i16,
          v.as_array()[4] as i16,
          v.as_array()[5] as i16,
          v.as_array()[6] as i16,
          v.as_array()[7] as i16,
          v.as_array()[8] as i16,
          v.as_array()[9] as i16,
          v.as_array()[10] as i16,
          v.as_array()[11] as i16,
          v.as_array()[12] as i16,
          v.as_array()[13] as i16,
          v.as_array()[14] as i16,
          v.as_array()[15] as i16,
        ])
      }
    }
  }

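  /// Lane-wise select: where a lane of `self` (treated as a boolean-style
  /// mask, all ones or all zeros per lane) is true, the lane of `t` is taken,
  /// otherwise the lane of `f`. A small sketch using a comparison as the mask:
  ///
  /// ```
  /// use wide::*;
  /// let x = i16x16::new([1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9, 1, 9]);
  /// let mask = x.simd_gt(i16x16::splat(5));
  /// let r = mask.blend(i16x16::splat(1), i16x16::splat(-1));
  /// assert_eq!(r.to_array(), [-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1]);
  /// ```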
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) }
      } else {
        Self {
          a : self.a.blend(t.a, f.a),
          b : self.b.blend(t.b, f.b),
        }
      }
    }
  }

  /// horizontal add of all the elements of the vector
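  ///
  /// Lanes are summed with wrapping arithmetic. A quick check: the sum of
  /// `1..=16` is `136`.
  ///
  /// ```
  /// use wide::*;
  /// let v = i16x16::new([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
  /// assert_eq!(v.reduce_add(), 136);
  /// ```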
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i16 {
    let arr: [i16x8; 2] = cast(self);

    (arr[0] + arr[1]).reduce_add()
  }

  /// horizontal min of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i16 {
    let arr: [i16x8; 2] = cast(self);

    arr[0].min(arr[1]).reduce_min()
  }

  /// horizontal max of all the elements of the vector
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i16 {
    let arr: [i16x8; 2] = cast(self);

    arr[0].max(arr[1]).reduce_max()
  }

  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: abs_i16_m256i(self.avx2) }
      } else {
        Self {
          a : self.a.abs(),
          b : self.b.abs(),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: max_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.max(rhs.a),
          b : self.b.max(rhs.b),
        }
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: min_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.min(rhs.a),
          b : self.b.min(rhs.b),
        }
      }
    }
  }

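  /// Lane-wise saturating add: results clamp to the `i16` range instead of
  /// wrapping. A small sketch:
  ///
  /// ```
  /// use wide::*;
  /// let big = i16x16::splat(i16::MAX);
  /// assert_eq!(big.saturating_add(i16x16::splat(1)).to_array(), [i16::MAX; 16]);
  /// ```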
  #[inline]
  #[must_use]
  pub fn saturating_add(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: add_saturating_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.saturating_add(rhs.a),
          b : self.b.saturating_add(rhs.b),
        }
      }
    }
  }
  /// Lane-wise saturating subtract: results clamp to the `i16` range instead
  /// of wrapping.
  #[inline]
  #[must_use]
  pub fn saturating_sub(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: sub_saturating_i16_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.saturating_sub(rhs.a),
          b : self.b.saturating_sub(rhs.b),
        }
      }
    }
  }

  /// Calculates partial dot product.
  /// Multiplies packed signed 16-bit integers, producing intermediate signed
  /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit
  /// integers.
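  ///
  /// A small sketch: with an all-ones `rhs`, each output lane is just the sum
  /// of one adjacent pair of `self` lanes:
  ///
  /// ```
  /// use wide::*;
  /// let a = i16x16::new([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
  /// assert_eq!(a.dot(i16x16::splat(1)).to_array(), [3, 7, 11, 15, 19, 23, 27, 31]);
  /// ```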
  #[inline]
  #[must_use]
  pub fn dot(self, rhs: Self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i32x8 { avx2: mul_i16_horizontal_add_m256i(self.avx2, rhs.avx2) }
      } else {
        i32x8 {
          a : self.a.dot(rhs.a),
          b : self.b.dot(rhs.b),
        }
      }
    }
  }

  /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each
  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
  /// and `1`. This corresponds to the following instructions:
  /// - `vqrdmulhq_n_s16` instruction on neon
  /// - `i16x8_q15mulr_sat` on simd128
  /// - `_mm256_mulhrs_epi16` on avx2
  /// - emulated via `mul_i16_*` on sse2
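  ///
  /// A quick numeric sketch: `16384` is `0.5` in Q15, so every lane is
  /// halved, with the `+ 0x4000` term rounding halves up (`20_001 * 0.5`
  /// rounds from `10_000.5` to `10_001`):
  ///
  /// ```
  /// use wide::*;
  /// let v = i16x16::splat(20_001);
  /// assert_eq!(v.mul_scale_round(i16x16::splat(16384)).to_array(), [10_001_i16; 16]);
  /// ```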
  #[inline]
  #[must_use]
  pub fn mul_scale_round(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i16_scale_round_m256i(self.avx2, rhs.avx2) }
      } else {
        Self {
          a : self.a.mul_scale_round(rhs.a),
          b : self.b.mul_scale_round(rhs.b),
        }
      }
    }
  }

  /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each
  /// lane, effectively multiplying by a 16 bit fixed point number between `-1`
  /// and `1`. This corresponds to the following instructions:
  /// - `vqrdmulhq_n_s16` instruction on neon
  /// - `i16x8_q15mulr_sat` on simd128
  /// - `_mm256_mulhrs_epi16` on avx2
  /// - emulated via `mul_i16_*` on sse2
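  ///
  /// The same operation as [`mul_scale_round`](Self::mul_scale_round), with
  /// the scalar splatted across all lanes for you:
  ///
  /// ```
  /// use wide::*;
  /// let v = i16x16::splat(20_000);
  /// assert_eq!(v.mul_scale_round_n(16384).to_array(), [10_000_i16; 16]);
  /// ```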
  #[inline]
  #[must_use]
  pub fn mul_scale_round_n(self, rhs: i16) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        Self { avx2: mul_i16_scale_round_m256i(self.avx2, set_splat_i16_m256i(rhs)) }
      } else {
        Self {
          a : self.a.mul_scale_round_n(rhs),
          b : self.b.mul_scale_round_n(rhs),
        }
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i16; 16] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[i16; 16] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [i16; 16] {
    cast_mut(self)
  }
}