use super::*;

pick! {
  if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct i16x8 { pub(crate) simd: v128 }

    impl Default for i16x8 {
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      fn eq(&self, other: &Self) -> bool {
        u16x8_all_true(i16x8_eq(self.simd, other.simd))
      }
    }

    impl Eq for i16x8 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct i16x8 { pub(crate) neon : int16x8_t }

    impl Default for i16x8 {
      #[inline]
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for i16x8 {
      #[inline]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u16(vceqq_s16(self.neon, other.neon))==u16::MAX }
      }
    }

    impl Eq for i16x8 { }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct i16x8 { pub(crate) arr: [i16;8] }
  }
}

int_uint_consts!(i16, 8, i16x8, 128);

unsafe impl Zeroable for i16x8 {}
unsafe impl Pod for i16x8 {}

impl AlignTo for i16x8 {
  type Elem = i16;
}

impl Add for i16x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_add(rhs.arr[0]),
          self.arr[1].wrapping_add(rhs.arr[1]),
          self.arr[2].wrapping_add(rhs.arr[2]),
          self.arr[3].wrapping_add(rhs.arr[3]),
          self.arr[4].wrapping_add(rhs.arr[4]),
          self.arr[5].wrapping_add(rhs.arr[5]),
          self.arr[6].wrapping_add(rhs.arr[6]),
          self.arr[7].wrapping_add(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Sub for i16x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vsubq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_sub(rhs.arr[0]),
          self.arr[1].wrapping_sub(rhs.arr[1]),
          self.arr[2].wrapping_sub(rhs.arr[2]),
          self.arr[3].wrapping_sub(rhs.arr[3]),
          self.arr[4].wrapping_sub(rhs.arr[4]),
          self.arr[5].wrapping_sub(rhs.arr[5]),
          self.arr[6].wrapping_sub(rhs.arr[6]),
          self.arr[7].wrapping_sub(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Mul for i16x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vmulq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_mul(rhs.arr[0]),
          self.arr[1].wrapping_mul(rhs.arr[1]),
          self.arr[2].wrapping_mul(rhs.arr[2]),
          self.arr[3].wrapping_mul(rhs.arr[3]),
          self.arr[4].wrapping_mul(rhs.arr[4]),
          self.arr[5].wrapping_mul(rhs.arr[5]),
          self.arr[6].wrapping_mul(rhs.arr[6]),
          self.arr[7].wrapping_mul(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl Add<i16> for i16x8 {
  type Output = Self;
  #[inline]
  fn add(self, rhs: i16) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<i16> for i16x8 {
  type Output = Self;
  #[inline]
  fn sub(self, rhs: i16) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<i16> for i16x8 {
  type Output = Self;
  #[inline]
  fn mul(self, rhs: i16) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  fn add(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).add(rhs)
  }
}

impl Sub<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  fn sub(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).sub(rhs)
  }
}

impl Mul<i16x8> for i16 {
  type Output = i16x8;
  #[inline]
  fn mul(self, rhs: i16x8) -> Self::Output {
    i16x8::splat(self).mul(rhs)
  }
}

impl BitAnd for i16x8 {
  type Output = Self;
  #[inline]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitand_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vandq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].bitand(rhs.arr[0]),
          self.arr[1].bitand(rhs.arr[1]),
          self.arr[2].bitand(rhs.arr[2]),
          self.arr[3].bitand(rhs.arr[3]),
          self.arr[4].bitand(rhs.arr[4]),
          self.arr[5].bitand(rhs.arr[5]),
          self.arr[6].bitand(rhs.arr[6]),
          self.arr[7].bitand(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl BitOr for i16x8 {
  type Output = Self;
  #[inline]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vorrq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].bitor(rhs.arr[0]),
          self.arr[1].bitor(rhs.arr[1]),
          self.arr[2].bitor(rhs.arr[2]),
          self.arr[3].bitor(rhs.arr[3]),
          self.arr[4].bitor(rhs.arr[4]),
          self.arr[5].bitor(rhs.arr[5]),
          self.arr[6].bitor(rhs.arr[6]),
          self.arr[7].bitor(rhs.arr[7]),
        ]}
      }
    }
  }
}

impl BitXor for i16x8 {
  type Output = Self;
  #[inline]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: veorq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].bitxor(rhs.arr[0]),
          self.arr[1].bitxor(rhs.arr[1]),
          self.arr[2].bitxor(rhs.arr[2]),
          self.arr[3].bitxor(rhs.arr[3]),
          self.arr[4].bitxor(rhs.arr[4]),
          self.arr[5].bitxor(rhs.arr[5]),
          self.arr[6].bitxor(rhs.arr[6]),
          self.arr[7].bitxor(rhs.arr[7]),
        ]}
      }
    }
  }
}

macro_rules! impl_shl_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes left by the value given.
      #[inline]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe { Self { neon: vshlq_s16(self.neon, vmovq_n_s16(rhs as i16)) } }
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shl(u),
              self.arr[1].wrapping_shl(u),
              self.arr[2].wrapping_shl(u),
              self.arr[3].wrapping_shl(u),
              self.arr[4].wrapping_shl(u),
              self.arr[5].wrapping_shl(u),
              self.arr[6].wrapping_shl(u),
              self.arr[7].wrapping_shl(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shl_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_i16x8 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for i16x8 {
      type Output = Self;
      /// Shifts all lanes right by the value given (arithmetic shift).
      #[inline]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_i16_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: i16x8_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe { Self { neon: vshlq_s16(self.neon, vmovq_n_s16( -(rhs as i16))) } }
          } else {
            let u = rhs as u32;
            Self { arr: [
              self.arr[0].wrapping_shr(u),
              self.arr[1].wrapping_shr(u),
              self.arr[2].wrapping_shr(u),
              self.arr[3].wrapping_shr(u),
              self.arr[4].wrapping_shr(u),
              self.arr[5].wrapping_shr(u),
              self.arr[6].wrapping_shr(u),
              self.arr[7].wrapping_shr(u),
            ]}
          }
        }
      }
    })+
  };
}
impl_shr_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

impl CmpEq for i16x8 {
  type Output = Self;
  #[inline]
  fn simd_eq(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vreinterpretq_s16_u16(vceqq_s16(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] == rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] == rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] == rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] == rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] == rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] == rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] == rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpGt for i16x8 {
  type Output = Self;
  #[inline]
  fn simd_gt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_gt_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vreinterpretq_s16_u16(vcgtq_s16(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] > rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] > rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] > rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] > rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] > rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] > rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] > rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl CmpLt for i16x8 {
  type Output = Self;
  #[inline]
  fn simd_lt(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_lt_mask_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_lt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vreinterpretq_s16_u16(vcltq_s16(self.neon, rhs.neon)) } }
      } else {
        Self { arr: [
          if self.arr[0] < rhs.arr[0] { -1 } else { 0 },
          if self.arr[1] < rhs.arr[1] { -1 } else { 0 },
          if self.arr[2] < rhs.arr[2] { -1 } else { 0 },
          if self.arr[3] < rhs.arr[3] { -1 } else { 0 },
          if self.arr[4] < rhs.arr[4] { -1 } else { 0 },
          if self.arr[5] < rhs.arr[5] { -1 } else { 0 },
          if self.arr[6] < rhs.arr[6] { -1 } else { 0 },
          if self.arr[7] < rhs.arr[7] { -1 } else { 0 },
        ]}
      }
    }
  }
}

impl i16x8 {
  #[inline]
  #[must_use]
  pub const fn new(array: [i16; 8]) -> Self {
    unsafe { core::mem::transmute(array) }
  }

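  /// Packs the sign bit of each lane into the low 8 bits of a `u32`
  /// (lane 0 becomes bit 0, lane 7 becomes bit 7).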
  #[inline]
  #[must_use]
  pub fn to_bitmask(self) -> u32 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i( pack_i16_to_i8_m128i(self.sse,self.sse)) as u32) & 0xff
      } else if #[cfg(target_feature="simd128")] {
        i16x8_bitmask(self.simd) as u32
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe
        {
          // set all bits of a lane where its sign bit is set
          let masked = vcltq_s16(self.neon, vdupq_n_s16(0));

          // keep one distinct bit per lane, then add across lanes to build the mask
          let selectbit : uint16x8_t = core::mem::transmute([1u16, 2, 4, 8, 16, 32, 64, 128]);
          let r = vandq_u16(masked, selectbit);

          vaddvq_u16(r) as u32
        }
      } else {
        ((self.arr[0] < 0) as u32) << 0 |
        ((self.arr[1] < 0) as u32) << 1 |
        ((self.arr[2] < 0) as u32) << 2 |
        ((self.arr[3] < 0) as u32) << 3 |
        ((self.arr[4] < 0) as u32) << 4 |
        ((self.arr[5] < 0) as u32) << 5 |
        ((self.arr[6] < 0) as u32) << 6 |
        ((self.arr[7] < 0) as u32) << 7
      }
    }
  }

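  /// Returns `true` if any lane has its sign bit set (i.e. any lane of a
  /// comparison mask is "true").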
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0
      } else if #[cfg(target_feature="simd128")] {
        u16x8_bitmask(self.simd) != 0
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          vminvq_s16(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        ((v[0] | v[1]) & 0x8000800080008000) != 0
      }
    }
  }

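  /// Returns `true` if every lane has its sign bit set (i.e. all lanes of a
  /// comparison mask are "true").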
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010
      } else if #[cfg(target_feature="simd128")] {
        u16x8_bitmask(self.simd) == 0b11111111
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          vmaxvq_s16(self.neon) < 0
        }
      } else {
        let v : [u64;2] = cast(self);
        (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

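  /// Zero-extends the low eight `u8` lanes of `u` into `i16` lanes.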
  #[inline]
  #[must_use]
  pub fn from_u8x16_low(u: u8x16) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) }
      } else {
        let u_arr: [u8; 16] = cast(u);
        cast([
          u_arr[0] as u16 as i16,
          u_arr[1] as u16 as i16,
          u_arr[2] as u16 as i16,
          u_arr[3] as u16 as i16,
          u_arr[4] as u16 as i16,
          u_arr[5] as u16 as i16,
          u_arr[6] as u16 as i16,
          u_arr[7] as u16 as i16,
        ])
      }
    }
  }

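  /// Zero-extends the high eight `u8` lanes of `u` into `i16` lanes.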
  #[inline]
  #[must_use]
  pub fn from_u8x16_high(u: u8x16) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self{ sse: unpack_high_i8_m128i(u.sse, m128i::zeroed()) }
      } else {
        let u_arr: [u8; 16] = cast(u);
        cast([
          u_arr[8] as u16 as i16,
          u_arr[9] as u16 as i16,
          u_arr[10] as u16 as i16,
          u_arr[11] as u16 as i16,
          u_arr[12] as u16 as i16,
          u_arr[13] as u16 as i16,
          u_arr[14] as u16 as i16,
          u_arr[15] as u16 as i16,
        ])
      }
    }
  }

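  /// Narrows each `i32` lane to `i16`, saturating values outside the `i16`
  /// range to `i16::MIN` / `i16::MAX`.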
  #[inline]
  #[must_use]
  pub fn from_i32x8_saturate(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        i16x8 { sse: pack_i32_to_i16_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2)) }
      } else if #[cfg(target_feature="sse2")] {
        i16x8 { sse: pack_i32_to_i16_m128i( v.a.sse, v.b.sse ) }
      } else if #[cfg(target_feature="simd128")] {
        use core::arch::wasm32::*;

        i16x8 { simd: i16x8_narrow_i32x4(v.a.simd, v.b.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        use core::arch::aarch64::*;

        unsafe {
          i16x8 { neon: vcombine_s16(vqmovn_s32(v.a.neon), vqmovn_s32(v.b.neon)) }
        }
      } else {
        fn clamp(a : i32) -> i16 {
          if a < i16::MIN as i32 {
            i16::MIN
          }
          else if a > i16::MAX as i32 {
            i16::MAX
          } else {
            a as i16
          }
        }

        i16x8::new([
          clamp(v.as_array()[0]),
          clamp(v.as_array()[1]),
          clamp(v.as_array()[2]),
          clamp(v.as_array()[3]),
          clamp(v.as_array()[4]),
          clamp(v.as_array()[5]),
          clamp(v.as_array()[6]),
          clamp(v.as_array()[7]),
        ])
      }
    }
  }

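  /// Narrows each `i32` lane to `i16` by truncation, keeping only the low
  /// 16 bits of each lane.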
  #[inline]
  #[must_use]
  pub fn from_i32x8_truncate(v: i32x8) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = v.avx2.bitand(set_splat_i32_m256i(0xffff));
        i16x8 { sse: pack_i32_to_u16_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a) ) }
      } else if #[cfg(target_feature="sse2")] {
        let a = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.a.sse));
        let b = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.b.sse));

        i16x8 { sse: pack_i32_to_i16_m128i( a, b) }
      } else {
        i16x8::new([
          v.as_array()[0] as i16,
          v.as_array()[1] as i16,
          v.as_array()[2] as i16,
          v.as_array()[3] as i16,
          v.as_array()[4] as i16,
          v.as_array()[5] as i16,
          v.as_array()[6] as i16,
          v.as_array()[7] as i16,
        ])
      }
    }
  }

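  /// Loads the first eight elements of `input` without requiring alignment.
  ///
  /// # Panics
  /// Panics if `input.len() < 8`.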
  #[inline]
  #[must_use]
  pub fn from_slice_unaligned(input: &[i16]) -> Self {
    assert!(input.len() >= 8);

    pick! {
      if #[cfg(target_feature="sse2")] {
        unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } }
      } else if #[cfg(target_feature="simd128")] {
        unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vld1q_s16( input.as_ptr() as *const i16 ) } }
      } else {
        unsafe { Self::new( *(input.as_ptr() as * const [i16;8]) ) }
      }
    }
  }

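  /// Lane-wise select: where `self` (treated as a mask, so each lane should be
  /// all ones or all zeros) is set, the lane comes from `t`, otherwise from `f`.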
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vbslq_s16(vreinterpretq_u16_s16(self.neon), t.neon, f.neon) } }
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }
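  /// Returns a mask with each lane set to all ones where the corresponding
  /// lane of `self` is negative, and all zeros otherwise.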
  #[inline]
  #[must_use]
  pub fn is_negative(self) -> Self {
    self.simd_lt(Self::zeroed())
  }

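  /// Horizontal wrapping sum of all eight lanes.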
  #[inline]
  #[must_use]
  pub fn reduce_add(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // fold the vector in half three times (8 -> 4 -> 2 -> 1 lanes), adding at each step
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = add_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = add_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = add_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vaddvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        let mut r = arr[0];
        r = r.wrapping_add(arr[1]);
        r = r.wrapping_add(arr[2]);
        r = r.wrapping_add(arr[3]);
        r = r.wrapping_add(arr[4]);
        r = r.wrapping_add(arr[5]);
        r = r.wrapping_add(arr[6]);
        r.wrapping_add(arr[7])
      }
    }
  }

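  /// Horizontal minimum of all eight lanes.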
  #[inline]
  #[must_use]
  pub fn reduce_min(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = min_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = min_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = min_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vminvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        let mut r = arr[0];
        r = r.min(arr[1]);
        r = r.min(arr[2]);
        r = r.min(arr[3]);
        r = r.min(arr[4]);
        r = r.min(arr[5]);
        r = r.min(arr[6]);
        r.min(arr[7])
      }
    }
  }

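  /// Horizontal maximum of all eight lanes.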
  #[inline]
  #[must_use]
  pub fn reduce_max(self) -> i16 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse);
        let sum64 = max_i16_m128i(self.sse, hi64);
        let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64);
        let sum32 = max_i16_m128i(sum64, hi32);
        let lo16 = shr_imm_u32_m128i::<16>(sum32);
        let sum16 = max_i16_m128i(sum32, lo16);
        extract_i16_as_i32_m128i::<0>(sum16) as i16
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { vmaxvq_s16(self.neon) }
      } else {
        let arr: [i16; 8] = cast(self);

        let mut r = arr[0];
        r = r.max(arr[1]);
        r = r.max(arr[2]);
        r = r.max(arr[3]);
        r = r.max(arr[4]);
        r = r.max(arr[5]);
        r = r.max(arr[6]);
        r.max(arr[7])
      }
    }
  }

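  /// Lane-wise wrapping absolute value (`i16::MIN` stays `i16::MIN`).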
  #[inline]
  #[must_use]
  pub fn abs(self) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        // ssse3 has a dedicated absolute-value instruction; check it before the
        // plain sse2 fallback, since enabling ssse3 also enables sse2.
        Self { sse: abs_i16_m128i(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        let mask = shr_imm_i16_m128i::<15>(self.sse);
        Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vabsq_s16(self.neon) } }
      } else {
        let arr: [i16; 8] = cast(self);
        cast(
          [
            arr[0].wrapping_abs(),
            arr[1].wrapping_abs(),
            arr[2].wrapping_abs(),
            arr[3].wrapping_abs(),
            arr[4].wrapping_abs(),
            arr[5].wrapping_abs(),
            arr[6].wrapping_abs(),
            arr[7].wrapping_abs(),
          ])
      }
    }
  }

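  /// Lane-wise absolute value returned as `u16x8`, so `i16::MIN` maps to
  /// `32768` instead of wrapping.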
  #[inline]
  #[must_use]
  pub fn unsigned_abs(self) -> u16x8 {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        // as with abs(), prefer the ssse3 instruction over the sse2 emulation
        u16x8 { sse: abs_i16_m128i(self.sse) }
      } else if #[cfg(target_feature="sse2")] {
        let mask = shr_imm_i16_m128i::<15>(self.sse);
        u16x8 { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) }
      } else if #[cfg(target_feature="simd128")] {
        u16x8 { simd: i16x8_abs(self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { u16x8 { neon: vreinterpretq_u16_s16(vabsq_s16(self.neon)) } }
      } else {
        let arr: [i16; 8] = cast(self);
        cast(
          [
            arr[0].unsigned_abs(),
            arr[1].unsigned_abs(),
            arr[2].unsigned_abs(),
            arr[3].unsigned_abs(),
            arr[4].unsigned_abs(),
            arr[5].unsigned_abs(),
            arr[6].unsigned_abs(),
            arr[7].unsigned_abs(),
          ])
      }
    }
  }

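  /// Lane-wise maximum of `self` and `rhs`.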
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: max_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_max(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vmaxq_s16(self.neon, rhs.neon) } }
      } else {
        self.simd_lt(rhs).blend(rhs, self)
      }
    }
  }
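  /// Lane-wise minimum of `self` and `rhs`.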
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: min_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_min(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vminq_s16(self.neon, rhs.neon) } }
      } else {
        self.simd_lt(rhs).blend(self, rhs)
      }
    }
  }

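  /// Lane-wise saturating addition.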
  #[inline]
  #[must_use]
  pub fn saturating_add(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_saturating_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_add_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqaddq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].saturating_add(rhs.arr[0]),
          self.arr[1].saturating_add(rhs.arr[1]),
          self.arr[2].saturating_add(rhs.arr[2]),
          self.arr[3].saturating_add(rhs.arr[3]),
          self.arr[4].saturating_add(rhs.arr[4]),
          self.arr[5].saturating_add(rhs.arr[5]),
          self.arr[6].saturating_add(rhs.arr[6]),
          self.arr[7].saturating_add(rhs.arr[7]),
        ]}
      }
    }
  }
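  /// Lane-wise saturating subtraction.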
  #[inline]
  #[must_use]
  pub fn saturating_sub(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_saturating_i16_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_sub_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqsubq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].saturating_sub(rhs.arr[0]),
          self.arr[1].saturating_sub(rhs.arr[1]),
          self.arr[2].saturating_sub(rhs.arr[2]),
          self.arr[3].saturating_sub(rhs.arr[3]),
          self.arr[4].saturating_sub(rhs.arr[4]),
          self.arr[5].saturating_sub(rhs.arr[5]),
          self.arr[6].saturating_sub(rhs.arr[6]),
          self.arr[7].saturating_sub(rhs.arr[7]),
        ]}
      }
    }
  }

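  /// Multiplies corresponding `i16` lanes into `i32` intermediates and then
  /// horizontally adds each adjacent pair, yielding an `i32x4`.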
  #[inline]
  #[must_use]
  pub fn dot(self, rhs: Self) -> i32x4 {
    pick! {
      if #[cfg(target_feature="sse2")] {
        i32x4 { sse: mul_i16_horizontal_add_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        i32x4 { simd: i32x4_dot_i16x8(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          let pl = vmull_s16(vget_low_s16(self.neon), vget_low_s16(rhs.neon));
          let ph = vmull_high_s16(self.neon, rhs.neon);
          i32x4 { neon: vpaddq_s32(pl, ph) }
        }
      } else {
        i32x4 { arr: [
          (i32::from(self.arr[0]) * i32::from(rhs.arr[0])) + (i32::from(self.arr[1]) * i32::from(rhs.arr[1])),
          (i32::from(self.arr[2]) * i32::from(rhs.arr[2])) + (i32::from(self.arr[3]) * i32::from(rhs.arr[3])),
          (i32::from(self.arr[4]) * i32::from(rhs.arr[4])) + (i32::from(self.arr[5]) * i32::from(rhs.arr[5])),
          (i32::from(self.arr[6]) * i32::from(rhs.arr[6])) + (i32::from(self.arr[7]) * i32::from(rhs.arr[7])),
        ] }
      }
    }
  }

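  /// Lane-wise fixed-point multiply, equivalent to
  /// `((i32::from(a) * i32::from(b)) + 0x4000) >> 15` on each lane
  /// (a rounded Q15 multiply).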
  #[inline]
  #[must_use]
  pub fn mul_scale_round(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse: mul_i16_scale_round_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="sse2")] {
        // no mulhrs on plain sse2: widen via high/low products, round, shift, and repack
        let hi = mul_i16_keep_high_m128i(self.sse, rhs.sse);
        let lo = mul_i16_keep_low_m128i(self.sse, rhs.sse);
        let mut v1 = unpack_low_i16_m128i(lo, hi);
        let mut v2 = unpack_high_i16_m128i(lo, hi);
        let a = set_splat_i32_m128i(0x4000);
        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
        let s = pack_i32_to_i16_m128i(v1, v2);
        Self { sse: s }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_q15mulr_sat(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqrdmulhq_s16(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          ((i32::from(self.arr[0]) * i32::from(rhs.arr[0]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[1]) * i32::from(rhs.arr[1]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[2]) * i32::from(rhs.arr[2]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[3]) * i32::from(rhs.arr[3]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[4]) * i32::from(rhs.arr[4]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[5]) * i32::from(rhs.arr[5]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[6]) * i32::from(rhs.arr[6]) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[7]) * i32::from(rhs.arr[7]) + 0x4000) >> 15) as i16,
        ]}
      }
    }
  }

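  /// Lane-wise multiply, keeping the high 16 bits of each 32-bit product.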
  #[inline]
  #[must_use]
  pub fn mul_keep_high(lhs: Self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: mul_i16_keep_high_m128i(lhs.sse, rhs.sse) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        let lhs_low = unsafe { vget_low_s16(lhs.neon) };
        let rhs_low = unsafe { vget_low_s16(rhs.neon) };

        let lhs_high = unsafe { vget_high_s16(lhs.neon) };
        let rhs_high = unsafe { vget_high_s16(rhs.neon) };

        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
        let high = unsafe { vmull_s16(lhs_high, rhs_high) };

        i16x8 { neon: unsafe { vreinterpretq_s16_u16(vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).1) } }
      } else if #[cfg(target_feature="simd128")] {
        let low = i32x4_extmul_low_i16x8(lhs.simd, rhs.simd);
        let high = i32x4_extmul_high_i16x8(lhs.simd, rhs.simd);

        Self { simd: i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) }
      } else {
        i16x8::new([
          ((i32::from(rhs.as_array()[0]) * i32::from(lhs.as_array()[0])) >> 16) as i16,
          ((i32::from(rhs.as_array()[1]) * i32::from(lhs.as_array()[1])) >> 16) as i16,
          ((i32::from(rhs.as_array()[2]) * i32::from(lhs.as_array()[2])) >> 16) as i16,
          ((i32::from(rhs.as_array()[3]) * i32::from(lhs.as_array()[3])) >> 16) as i16,
          ((i32::from(rhs.as_array()[4]) * i32::from(lhs.as_array()[4])) >> 16) as i16,
          ((i32::from(rhs.as_array()[5]) * i32::from(lhs.as_array()[5])) >> 16) as i16,
          ((i32::from(rhs.as_array()[6]) * i32::from(lhs.as_array()[6])) >> 16) as i16,
          ((i32::from(rhs.as_array()[7]) * i32::from(lhs.as_array()[7])) >> 16) as i16,
        ])
      }
    }
  }

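  /// Lane-wise multiply with the full 32-bit products returned as an `i32x8`.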
  #[inline]
  #[must_use]
  pub fn mul_widen(self, rhs: Self) -> i32x8 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = convert_to_i32_m256i_from_i16_m128i(self.sse);
        let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse);
        i32x8 { avx2: mul_i32_keep_low_m256i(a,b) }
      } else if #[cfg(target_feature="sse2")] {
        let low = mul_i16_keep_low_m128i(self.sse, rhs.sse);
        let high = mul_i16_keep_high_m128i(self.sse, rhs.sse);
        i32x8 {
          a: i32x4 { sse:unpack_low_i16_m128i(low, high) },
          b: i32x4 { sse:unpack_high_i16_m128i(low, high) }
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        let lhs_low = unsafe { vget_low_s16(self.neon) };
        let rhs_low = unsafe { vget_low_s16(rhs.neon) };

        let lhs_high = unsafe { vget_high_s16(self.neon) };
        let rhs_high = unsafe { vget_high_s16(rhs.neon) };

        let low = unsafe { vmull_s16(lhs_low, rhs_low) };
        let high = unsafe { vmull_s16(lhs_high, rhs_high) };

        i32x8 { a: i32x4 { neon: low }, b: i32x4 { neon: high } }
      } else {
        let a = self.as_array();
        let b = rhs.as_array();
        i32x8::new([
          i32::from(a[0]) * i32::from(b[0]),
          i32::from(a[1]) * i32::from(b[1]),
          i32::from(a[2]) * i32::from(b[2]),
          i32::from(a[3]) * i32::from(b[3]),
          i32::from(a[4]) * i32::from(b[4]),
          i32::from(a[5]) * i32::from(b[5]),
          i32::from(a[6]) * i32::from(b[6]),
          i32::from(a[7]) * i32::from(b[7]),
        ])
      }
    }
  }

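  /// Transposes an 8x8 matrix of `i16` values stored as eight rows of `i16x8`.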
  #[must_use]
  #[inline]
  pub fn transpose(data: [i16x8; 8]) -> [i16x8; 8] {
    pick! {
      if #[cfg(target_feature="sse2")] {
        let a1 = unpack_low_i16_m128i(data[0].sse, data[1].sse);
        let a2 = unpack_high_i16_m128i(data[0].sse, data[1].sse);
        let a3 = unpack_low_i16_m128i(data[2].sse, data[3].sse);
        let a4 = unpack_high_i16_m128i(data[2].sse, data[3].sse);
        let a5 = unpack_low_i16_m128i(data[4].sse, data[5].sse);
        let a6 = unpack_high_i16_m128i(data[4].sse, data[5].sse);
        let a7 = unpack_low_i16_m128i(data[6].sse, data[7].sse);
        let a8 = unpack_high_i16_m128i(data[6].sse, data[7].sse);

        let b1 = unpack_low_i32_m128i(a1, a3);
        let b2 = unpack_high_i32_m128i(a1, a3);
        let b3 = unpack_low_i32_m128i(a2, a4);
        let b4 = unpack_high_i32_m128i(a2, a4);
        let b5 = unpack_low_i32_m128i(a5, a7);
        let b6 = unpack_high_i32_m128i(a5, a7);
        let b7 = unpack_low_i32_m128i(a6, a8);
        let b8 = unpack_high_i32_m128i(a6, a8);

        [
          i16x8 { sse: unpack_low_i64_m128i(b1, b5) },
          i16x8 { sse: unpack_high_i64_m128i(b1, b5) },
          i16x8 { sse: unpack_low_i64_m128i(b2, b6) },
          i16x8 { sse: unpack_high_i64_m128i(b2, b6) },
          i16x8 { sse: unpack_low_i64_m128i(b3, b7) },
          i16x8 { sse: unpack_high_i64_m128i(b3, b7) },
          i16x8 { sse: unpack_low_i64_m128i(b4, b8) },
          i16x8 { sse: unpack_high_i64_m128i(b4, b8) },
        ]
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{

        #[inline] fn vtrq32(a : int16x8_t, b : int16x8_t) -> (int16x8_t, int16x8_t)
        {
          unsafe {
            let r = vtrnq_s32(vreinterpretq_s32_s16(a),vreinterpretq_s32_s16(b));
            (vreinterpretq_s16_s32(r.0), vreinterpretq_s16_s32(r.1))
          }
        }

        unsafe {
          let (q0,q2) = vtrq32(data[0].neon, data[2].neon);
          let (q1,q3) = vtrq32(data[1].neon, data[3].neon);
          let (q4,q6) = vtrq32(data[4].neon, data[6].neon);
          let (q5,q7) = vtrq32(data[5].neon, data[7].neon);

          let b1 = vtrnq_s16(q0, q1);
          let b2 = vtrnq_s16(q2, q3);
          let b3 = vtrnq_s16(q4, q5);
          let b4 = vtrnq_s16(q6, q7);

          // the 32-bit and 16-bit transposes above leave each output row split
          // across two registers; combine the matching 64-bit halves to form rows
          [
            i16x8 { neon: vcombine_s16(vget_low_s16(b1.0), vget_low_s16(b3.0)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b1.1), vget_low_s16(b3.1)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b2.0), vget_low_s16(b4.0)) },
            i16x8 { neon: vcombine_s16(vget_low_s16(b2.1), vget_low_s16(b4.1)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b1.0), vget_high_s16(b3.0)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b1.1), vget_high_s16(b3.1)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b2.0), vget_high_s16(b4.0)) },
            i16x8 { neon: vcombine_s16(vget_high_s16(b2.1), vget_high_s16(b4.1)) },
          ]
        }
      } else if #[cfg(target_feature="simd128")] {
        #[inline] fn lo_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a,b) }
        #[inline] fn hi_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a,b) }
        #[inline] fn lo_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a,b) }
        #[inline] fn hi_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a,b) }
        #[inline] fn lo_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<0, 2>(a,b) }
        #[inline] fn hi_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<1, 3>(a,b) }

        let a1 = lo_i16(data[0].simd, data[1].simd);
        let a2 = hi_i16(data[0].simd, data[1].simd);
        let a3 = lo_i16(data[2].simd, data[3].simd);
        let a4 = hi_i16(data[2].simd, data[3].simd);
        let a5 = lo_i16(data[4].simd, data[5].simd);
        let a6 = hi_i16(data[4].simd, data[5].simd);
        let a7 = lo_i16(data[6].simd, data[7].simd);
        let a8 = hi_i16(data[6].simd, data[7].simd);

        let b1 = lo_i32(a1, a3);
        let b2 = hi_i32(a1, a3);
        let b3 = lo_i32(a2, a4);
        let b4 = hi_i32(a2, a4);
        let b5 = lo_i32(a5, a7);
        let b6 = hi_i32(a5, a7);
        let b7 = lo_i32(a6, a8);
        let b8 = hi_i32(a6, a8);

        [
          i16x8 { simd: lo_i64(b1, b5) },
          i16x8 { simd: hi_i64(b1, b5) },
          i16x8 { simd: lo_i64(b2, b6) },
          i16x8 { simd: hi_i64(b2, b6) },
          i16x8 { simd: lo_i64(b3, b7) },
          i16x8 { simd: hi_i64(b3, b7) },
          i16x8 { simd: lo_i64(b4, b8) },
          i16x8 { simd: hi_i64(b4, b8) },
        ]

      } else {
        #[inline(always)]
        fn transpose_column(data: &[i16x8; 8], index: usize) -> i16x8 {
          i16x8::new([
            data[0].as_array()[index],
            data[1].as_array()[index],
            data[2].as_array()[index],
            data[3].as_array()[index],
            data[4].as_array()[index],
            data[5].as_array()[index],
            data[6].as_array()[index],
            data[7].as_array()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
          transpose_column(&data, 4),
          transpose_column(&data, 5),
          transpose_column(&data, 6),
          transpose_column(&data, 7),
        ]
      }
    }
  }

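  /// Same rounded Q15 multiply as `mul_scale_round`, but with a single `i16`
  /// scale factor applied to every lane.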
  #[inline]
  #[must_use]
  pub fn mul_scale_round_n(self, rhs: i16) -> Self {
    pick! {
      if #[cfg(target_feature="ssse3")] {
        Self { sse: mul_i16_scale_round_m128i(self.sse, set_splat_i16_m128i(rhs)) }
      } else if #[cfg(target_feature="sse2")] {
        let r = set_splat_i16_m128i(rhs);
        let hi = mul_i16_keep_high_m128i(self.sse, r);
        let lo = mul_i16_keep_low_m128i(self.sse, r);
        let mut v1 = unpack_low_i16_m128i(lo, hi);
        let mut v2 = unpack_high_i16_m128i(lo, hi);
        let a = set_splat_i32_m128i(0x4000);
        v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a));
        v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a));
        let s = pack_i32_to_i16_m128i(v1, v2);
        Self { sse: s }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: i16x8_q15mulr_sat(self.simd, i16x8_splat(rhs)) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vqrdmulhq_n_s16(self.neon, rhs) } }
      } else {
        Self { arr: [
          ((i32::from(self.arr[0]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[1]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[2]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[3]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[4]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[5]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[6]) * i32::from(rhs) + 0x4000) >> 15) as i16,
          ((i32::from(self.arr[7]) * i32::from(rhs) + 0x4000) >> 15) as i16,
        ]}
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [i16; 8] {
    cast(self)
  }

  #[inline]
  pub fn as_array(&self) -> &[i16; 8] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_mut_array(&mut self) -> &mut [i16; 8] {
    cast_mut(self)
  }
}