// core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
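
// Illustrative sketch, not part of the original source: `_mm_set_ph` takes its
// arguments from the highest element (`e7`) down to the lowest (`e0`), so the last
// argument ends up in lane 0. The helper name is hypothetical and assumes the
// `avx512fp16`/`avx512vl` features are available at runtime.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn set_ph_lane_order_example() -> [f16; 8] {
    let v = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
    let mut lanes: [f16; 8] = [0.0; 8];
    // Write the vector back to memory so the per-lane layout is visible.
    _mm_storeu_ph(lanes.as_mut_ptr(), v);
    lanes // [0.0, 1.0, ..., 7.0]: lane 0 holds `e0`, lane 7 holds `e7`
}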

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy half-precision (16-bit) floating-point element a to the lower element of dst, and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set1_ph(a: f16) -> __m128h {
    unsafe { transmute(f16x8::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set1_ph(a: f16) -> __m256h {
    unsafe { transmute(f16x16::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set1_ph(a: f16) -> __m512h {
    unsafe { transmute(f16x32::splat(a)) }
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}
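
// Illustrative sketch, not part of the original source: `_mm_setr_ph` is the
// reversed counterpart of `_mm_set_ph` (its first argument lands in lane 0), so
// the two calls below build the same vector. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn set_vs_setr_example() -> bool {
    let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
    let b = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    // An all-ones mask means every lane compared equal (ordered, non-signaling).
    _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b) == 0xff
}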

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setzero_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setzero_ph() -> __m256h {
    unsafe { transmute(f16x16::ZERO) }
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setzero_ph() -> __m512h {
    unsafe { transmute(f16x32::ZERO) }
}

/// Return vector of type `__m128h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_undefined_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type `__m256h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_undefined_ph() -> __m256h {
    unsafe { transmute(f16x16::ZERO) }
}

/// Return vector of type `__m512h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_undefined_ph() -> __m512h {
    unsafe { transmute(f16x32::ZERO) }
}

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castpd_ph(a: __m128d) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_pd(a: __m128h) -> __m128d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_pd(a: __m256h) -> __m256d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_pd(a: __m512h) -> __m512d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castps_ph(a: __m128) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castps_ph(a: __m256) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castps_ph(a: __m512) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_ps(a: __m128h) -> __m128 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_ps(a: __m256h) -> __m256 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_ps(a: __m512h) -> __m512 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_si128(a: __m128h) -> __m128i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_si256(a: __m256h) -> __m256i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_si512(a: __m512h) -> __m512i {
    unsafe { transmute(a) }
}
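
// Illustrative sketch, not part of the original source: the `cast` intrinsics only
// reinterpret the register bits, so a round trip through `__m128i` leaves every
// lane's bit pattern unchanged. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cast_roundtrip_example(a: __m128h) -> __mmask8 {
    let as_int = _mm_castph_si128(a);
    let back = _mm_castsi128_ph(as_int);
    // For non-NaN inputs this returns 0xff: every lane compares equal.
    _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, back)
}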

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}
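
// Illustrative sketch, not part of the original source: unlike
// `_mm256_castph128_ph256`, whose upper lanes are formally undefined, the `zext`
// intrinsics guarantee zeroed upper lanes. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn zext_upper_lanes_example(a: __m128h) -> f16 {
    let wide = _mm256_zextph128_ph256(a);
    let mut lanes: [f16; 16] = [0.0; 16];
    _mm256_storeu_ph(lanes.as_mut_ptr(), wide);
    lanes[15] // always 0.0: lanes 8..=15 are zeroed by the zero-extension
}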

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}
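
// Illustrative sketch, not part of the original source: bit i of the returned mask
// is set when the comparison holds for lane i. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn cmp_lt_mask_example() -> __mmask8 {
    let a = _mm_set1_ph(1.0);
    let b = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
    // Only lanes 2..=7 of `b` exceed 1.0, so this returns 0b1111_1100.
    _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b)
}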

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcomish(a, b, IMM5, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    ptr::read_unaligned(mem_addr.cast())
}
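
// Illustrative sketch, not part of the original source: a round trip through the
// unaligned load and store; `data` needs no particular alignment. The helper name
// is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn loadu_roundtrip_example(data: &[f16; 8]) -> [f16; 8] {
    let v = _mm_loadu_ph(data.as_ptr());
    let mut out: [f16; 8] = [0.0; 8];
    _mm_storeu_ph(out.as_mut_ptr(), v);
    out // a bitwise copy of `data`
}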

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = simd_extract!(src, 0);
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mut mov: f16 = 0.;
        if (k & 1) != 0 {
            mov = simd_extract!(b, 0);
        }
        simd_insert!(a, 0, mov)
    }
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
    unsafe {
        let mov: f16 = simd_extract!(b, 0);
        simd_insert!(a, 0, mov)
    }
}

/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
    *mem_addr.cast() = a;
}
1248
1249/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1250/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1251///
1252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1253#[inline]
1254#[target_feature(enable = "avx512fp16,avx512vl")]
1255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1256pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1257    *mem_addr.cast() = a;
1258}
1259
1260/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1261/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1262///
1263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1264#[inline]
1265#[target_feature(enable = "avx512fp16")]
1266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1267pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1268    *mem_addr.cast() = a;
1269}
1270
1271/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1272///
1273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1274#[inline]
1275#[target_feature(enable = "avx512fp16")]
1276#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1277pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
1278    *mem_addr = simd_extract!(a, 0);
1279}
1280
1281/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1282///
1283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1284#[inline]
1285#[target_feature(enable = "avx512fp16")]
1286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1287pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1288    asm!(
1289        vps!("vmovdqu16", "{{{k}}}, {src}"),
1290        p = in(reg) mem_addr,
1291        k = in(kreg) k,
1292        src = in(xmm_reg) a,
1293        options(nostack, preserves_flags)
1294    );
1295}
1296
1297/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1298/// The address does not need to be aligned to any particular boundary.
1299///
1300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1301#[inline]
1302#[target_feature(enable = "avx512fp16,avx512vl")]
1303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1304pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1305    ptr::write_unaligned(mem_addr.cast(), a);
1306}
1307
1308/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1309/// The address does not need to be aligned to any particular boundary.
1310///
1311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1312#[inline]
1313#[target_feature(enable = "avx512fp16,avx512vl")]
1314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1315pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1316    ptr::write_unaligned(mem_addr.cast(), a);
1317}
1318
1319/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1320/// The address does not need to be aligned to any particular boundary.
1321///
1322/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1323#[inline]
1324#[target_feature(enable = "avx512fp16")]
1325#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1326pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1327    ptr::write_unaligned(mem_addr.cast(), a);
1328}
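
// Illustrative sketch only (not part of stdarch's API): the unaligned form can target an
// ordinary slice as long as it has room for all 32 lanes. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_storeu_ph(out: &mut [f16]) {
    assert!(out.len() >= 32, "need room for 32 f16 lanes");
    let ones = _mm512_set1_ph(1.0);
    // Safety: the assert above guarantees the destination holds 32 elements.
    unsafe { _mm512_storeu_ph(out.as_mut_ptr(), ones) }
}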
1329
1330/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1331///
1332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
1333#[inline]
1334#[target_feature(enable = "avx512fp16,avx512vl")]
1335#[cfg_attr(test, assert_instr(vaddph))]
1336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1337pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
1338    unsafe { simd_add(a, b) }
1339}
1340
1341/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1342/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1343///
1344/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1345#[inline]
1346#[target_feature(enable = "avx512fp16,avx512vl")]
1347#[cfg_attr(test, assert_instr(vaddph))]
1348#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1349pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1350    unsafe {
1351        let r = _mm_add_ph(a, b);
1352        simd_select_bitmask(k, r, src)
1353    }
1354}
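
// Illustrative sketch only (not part of stdarch's API): the writemask selects, per lane,
// between the freshly computed sum and the corresponding lane of `src`. The helper name
// and mask value are ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_mask_add_ph(src: __m128h, a: __m128h, b: __m128h) -> __m128h {
    // Bits 0..=3 set: lanes 0..=3 receive a + b, lanes 4..=7 keep src.
    _mm_mask_add_ph(src, 0b0000_1111, a, b)
}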
1355
1356/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1357/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1358///
1359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1360#[inline]
1361#[target_feature(enable = "avx512fp16,avx512vl")]
1362#[cfg_attr(test, assert_instr(vaddph))]
1363#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1364pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1365    unsafe {
1366        let r = _mm_add_ph(a, b);
1367        simd_select_bitmask(k, r, _mm_setzero_ph())
1368    }
1369}
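
// Illustrative sketch only (not part of stdarch's API): the zeroing form writes 0.0 into
// every lane whose mask bit is clear instead of falling back to a `src` vector. The
// helper name and mask value are ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_maskz_add_ph(a: __m128h, b: __m128h) -> __m128h {
    // Even lanes keep a + b; odd lanes become 0.0.
    _mm_maskz_add_ph(0b0101_0101, a, b)
}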
1370
1371/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1372///
1373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
1374#[inline]
1375#[target_feature(enable = "avx512fp16,avx512vl")]
1376#[cfg_attr(test, assert_instr(vaddph))]
1377#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1378pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
1379    unsafe { simd_add(a, b) }
1380}
1381
1382/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1383/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1384///
1385/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1386#[inline]
1387#[target_feature(enable = "avx512fp16,avx512vl")]
1388#[cfg_attr(test, assert_instr(vaddph))]
1389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1390pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1391    unsafe {
1392        let r = _mm256_add_ph(a, b);
1393        simd_select_bitmask(k, r, src)
1394    }
1395}
1396
1397/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1398/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1399///
1400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1401#[inline]
1402#[target_feature(enable = "avx512fp16,avx512vl")]
1403#[cfg_attr(test, assert_instr(vaddph))]
1404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1405pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1406    unsafe {
1407        let r = _mm256_add_ph(a, b);
1408        simd_select_bitmask(k, r, _mm256_setzero_ph())
1409    }
1410}
1411
1412/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1413///
1414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
1415#[inline]
1416#[target_feature(enable = "avx512fp16")]
1417#[cfg_attr(test, assert_instr(vaddph))]
1418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1419pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
1420    unsafe { simd_add(a, b) }
1421}
1422
1423/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1424/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1425///
1426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1427#[inline]
1428#[target_feature(enable = "avx512fp16")]
1429#[cfg_attr(test, assert_instr(vaddph))]
1430#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1431pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1432    unsafe {
1433        let r = _mm512_add_ph(a, b);
1434        simd_select_bitmask(k, r, src)
1435    }
1436}
1437
1438/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1439/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1440///
1441/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1442#[inline]
1443#[target_feature(enable = "avx512fp16")]
1444#[cfg_attr(test, assert_instr(vaddph))]
1445#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1446pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1447    unsafe {
1448        let r = _mm512_add_ph(a, b);
1449        simd_select_bitmask(k, r, _mm512_setzero_ph())
1450    }
1451}
1452
1453/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1454/// Rounding is done according to the rounding parameter, which can be one of:
1455///
1456/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1457/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1458/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1459/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1460/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1461///
1462/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
1463#[inline]
1464#[target_feature(enable = "avx512fp16")]
1465#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1466#[rustc_legacy_const_generics(2)]
1467#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1468pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1469    unsafe {
1470        static_assert_rounding!(ROUNDING);
1471        vaddph(a, b, ROUNDING)
1472    }
1473}
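
// Illustrative sketch only (not part of stdarch's API): the rounding selector is a const
// generic, so it is combined from the `_MM_FROUND_*` constants at the call site. Here the
// sums are truncated toward zero with exceptions suppressed. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_add_round_ph(a: __m512h, b: __m512h) -> __m512h {
    _mm512_add_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}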
1474
1475/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1476/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1477/// Rounding is done according to the rounding parameter, which can be one of:
1478///
1479/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1480/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1481/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1482/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1483/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1484///
1485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1486#[inline]
1487#[target_feature(enable = "avx512fp16")]
1488#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1489#[rustc_legacy_const_generics(4)]
1490#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1491pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1492    src: __m512h,
1493    k: __mmask32,
1494    a: __m512h,
1495    b: __m512h,
1496) -> __m512h {
1497    unsafe {
1498        static_assert_rounding!(ROUNDING);
1499        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1500        simd_select_bitmask(k, r, src)
1501    }
1502}
1503
1504/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1505/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1506/// Rounding is done according to the rounding parameter, which can be one of:
1507///
1508/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1509/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1510/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1511/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1512///
1513/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1514#[inline]
1515#[target_feature(enable = "avx512fp16")]
1516#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1517#[rustc_legacy_const_generics(3)]
1518#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1519pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1520    k: __mmask32,
1521    a: __m512h,
1522    b: __m512h,
1523) -> __m512h {
1524    unsafe {
1525        static_assert_rounding!(ROUNDING);
1526        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1527        simd_select_bitmask(k, r, _mm512_setzero_ph())
1528    }
1529}
1530
1531/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1532/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1533/// Rounding is done according to the rounding parameter, which can be one of:
1534///
1535/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1536/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1537/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1538/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1539/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1540///
1541/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
1542#[inline]
1543#[target_feature(enable = "avx512fp16")]
1544#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1545#[rustc_legacy_const_generics(2)]
1546#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1547pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1548    static_assert_rounding!(ROUNDING);
1549    _mm_mask_add_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
1550}
1551
1552/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1553/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1554/// writemask k (the element is copied from src when mask bit 0 is not set).
1555/// Rounding is done according to the rounding parameter, which can be one of:
1556///
1557/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1558/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1559/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1560/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1561/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1562///
1563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1564#[inline]
1565#[target_feature(enable = "avx512fp16")]
1566#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1567#[rustc_legacy_const_generics(4)]
1568#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1569pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1570    src: __m128h,
1571    k: __mmask8,
1572    a: __m128h,
1573    b: __m128h,
1574) -> __m128h {
1575    unsafe {
1576        static_assert_rounding!(ROUNDING);
1577        vaddsh(a, b, src, k, ROUNDING)
1578    }
1579}
1580
1581/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1582/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1583/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1584/// Rounding is done according to the rounding parameter, which can be one of:
1585///
1586/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1587/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1588/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1589/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1590/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1591///
1592/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
1593#[inline]
1594#[target_feature(enable = "avx512fp16")]
1595#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1596#[rustc_legacy_const_generics(3)]
1597#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1598pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1599    static_assert_rounding!(ROUNDING);
1600    _mm_mask_add_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
1601}
1602
1603/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1604/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1605///
1606/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
1607#[inline]
1608#[target_feature(enable = "avx512fp16")]
1609#[cfg_attr(test, assert_instr(vaddsh))]
1610#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1611pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1612    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1613}
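
// Illustrative sketch only (not part of stdarch's API): in the scalar form only lane 0
// holds a[0] + b[0]; lanes 1..=7 are copied from `a`, the convention used by all `_sh`
// intrinsics in this file. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_add_sh() -> __m128h {
    let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
    let b = _mm_set1_ph(10.0);
    // Lanes (low to high): [11.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0].
    _mm_add_sh(a, b)
}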
1614
1615/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1616/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1617/// writemask k (the element is copied from src when mask bit 0 is not set).
1618///
1619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1620#[inline]
1621#[target_feature(enable = "avx512fp16")]
1622#[cfg_attr(test, assert_instr(vaddsh))]
1623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1624pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1625    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1626}
1627
1628/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1629/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1630/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1631///
1632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1633#[inline]
1634#[target_feature(enable = "avx512fp16")]
1635#[cfg_attr(test, assert_instr(vaddsh))]
1636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1637pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1638    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1639}
1640
1641/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1642///
1643/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
1644#[inline]
1645#[target_feature(enable = "avx512fp16,avx512vl")]
1646#[cfg_attr(test, assert_instr(vsubph))]
1647#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1648pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
1649    unsafe { simd_sub(a, b) }
1650}
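
// Illustrative sketch only (not part of stdarch's API): note the operand order, each lane
// computes a - b. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_sub_ph() -> __m128h {
    let a = _mm_set1_ph(3.0);
    let b = _mm_set1_ph(1.0);
    // Every lane becomes 2.0.
    _mm_sub_ph(a, b)
}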
1651
1652/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1653/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1654///
1655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1656#[inline]
1657#[target_feature(enable = "avx512fp16,avx512vl")]
1658#[cfg_attr(test, assert_instr(vsubph))]
1659#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1660pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1661    unsafe {
1662        let r = _mm_sub_ph(a, b);
1663        simd_select_bitmask(k, r, src)
1664    }
1665}
1666
1667/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1668/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1669///
1670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1671#[inline]
1672#[target_feature(enable = "avx512fp16,avx512vl")]
1673#[cfg_attr(test, assert_instr(vsubph))]
1674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1675pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1676    unsafe {
1677        let r = _mm_sub_ph(a, b);
1678        simd_select_bitmask(k, r, _mm_setzero_ph())
1679    }
1680}
1681
1682/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1683///
1684/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
1685#[inline]
1686#[target_feature(enable = "avx512fp16,avx512vl")]
1687#[cfg_attr(test, assert_instr(vsubph))]
1688#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1689pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
1690    unsafe { simd_sub(a, b) }
1691}
1692
1693/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1694/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1695///
1696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1697#[inline]
1698#[target_feature(enable = "avx512fp16,avx512vl")]
1699#[cfg_attr(test, assert_instr(vsubph))]
1700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1701pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1702    unsafe {
1703        let r = _mm256_sub_ph(a, b);
1704        simd_select_bitmask(k, r, src)
1705    }
1706}
1707
1708/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1709/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1710///
1711/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1712#[inline]
1713#[target_feature(enable = "avx512fp16,avx512vl")]
1714#[cfg_attr(test, assert_instr(vsubph))]
1715#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1716pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1717    unsafe {
1718        let r = _mm256_sub_ph(a, b);
1719        simd_select_bitmask(k, r, _mm256_setzero_ph())
1720    }
1721}
1722
1723/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1724///
1725/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
1726#[inline]
1727#[target_feature(enable = "avx512fp16")]
1728#[cfg_attr(test, assert_instr(vsubph))]
1729#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1730pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
1731    unsafe { simd_sub(a, b) }
1732}
1733
1734/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1735/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1736///
1737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1738#[inline]
1739#[target_feature(enable = "avx512fp16")]
1740#[cfg_attr(test, assert_instr(vsubph))]
1741#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1742pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1743    unsafe {
1744        let r = _mm512_sub_ph(a, b);
1745        simd_select_bitmask(k, r, src)
1746    }
1747}
1748
1749/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1750/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1751///
1752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1753#[inline]
1754#[target_feature(enable = "avx512fp16")]
1755#[cfg_attr(test, assert_instr(vsubph))]
1756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1757pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1758    unsafe {
1759        let r = _mm512_sub_ph(a, b);
1760        simd_select_bitmask(k, r, _mm512_setzero_ph())
1761    }
1762}
1763
1764/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1765/// Rounding is done according to the rounding parameter, which can be one of:
1766///
1767/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1768/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1769/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1770/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1771/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1772///
1773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1774#[inline]
1775#[target_feature(enable = "avx512fp16")]
1776#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1777#[rustc_legacy_const_generics(2)]
1778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1779pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1780    unsafe {
1781        static_assert_rounding!(ROUNDING);
1782        vsubph(a, b, ROUNDING)
1783    }
1784}
1785
1786/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1787/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1788/// Rounding is done according to the rounding parameter, which can be one of:
1789///
1790/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1791/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1792/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1793/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1794/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1795///
1796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1797#[inline]
1798#[target_feature(enable = "avx512fp16")]
1799#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1800#[rustc_legacy_const_generics(4)]
1801#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1802pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1803    src: __m512h,
1804    k: __mmask32,
1805    a: __m512h,
1806    b: __m512h,
1807) -> __m512h {
1808    unsafe {
1809        static_assert_rounding!(ROUNDING);
1810        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1811        simd_select_bitmask(k, r, src)
1812    }
1813}
1814
1815/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1816/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1817/// Rounding is done according to the rounding parameter, which can be one of:
1818///
1819/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1820/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1821/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1822/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1823/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1824///
1825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1826#[inline]
1827#[target_feature(enable = "avx512fp16")]
1828#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1829#[rustc_legacy_const_generics(3)]
1830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1831pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1832    k: __mmask32,
1833    a: __m512h,
1834    b: __m512h,
1835) -> __m512h {
1836    unsafe {
1837        static_assert_rounding!(ROUNDING);
1838        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1839        simd_select_bitmask(k, r, _mm512_setzero_ph())
1840    }
1841}
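
// Illustrative sketch only (not part of stdarch's API): masking and explicit rounding
// compose; here the low sixteen lanes are rounded toward negative infinity and the high
// sixteen are zeroed. The helper name and mask value are ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_maskz_sub_round_ph(a: __m512h, b: __m512h) -> __m512h {
    _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(0x0000_ffff, a, b)
}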
1842
1843/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1844/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1845/// Rounding is done according to the rounding parameter, which can be one of:
1846///
1847/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1848/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1849/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1850/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1851/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1852///
1853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
1854#[inline]
1855#[target_feature(enable = "avx512fp16")]
1856#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1857#[rustc_legacy_const_generics(2)]
1858#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1859pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1860    static_assert_rounding!(ROUNDING);
1861    _mm_mask_sub_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
1862}
1863
1864/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1865/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1866/// writemask k (the element is copied from src when mask bit 0 is not set).
1867/// Rounding is done according to the rounding parameter, which can be one of:
1868///
1869/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1870/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1871/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1872/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1873/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1874///
1875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1876#[inline]
1877#[target_feature(enable = "avx512fp16")]
1878#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1879#[rustc_legacy_const_generics(4)]
1880#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1881pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1882    src: __m128h,
1883    k: __mmask8,
1884    a: __m128h,
1885    b: __m128h,
1886) -> __m128h {
1887    unsafe {
1888        static_assert_rounding!(ROUNDING);
1889        vsubsh(a, b, src, k, ROUNDING)
1890    }
1891}
1892
1893/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1894/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1895/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1896/// Rounding is done according to the rounding parameter, which can be one of:
1897///
1898/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1899/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1900/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1901/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1902/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1903///
1904/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
1905#[inline]
1906#[target_feature(enable = "avx512fp16")]
1907#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1908#[rustc_legacy_const_generics(3)]
1909#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1910pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1911    static_assert_rounding!(ROUNDING);
1912    _mm_mask_sub_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
1913}
1914
1915/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1916/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1917///
1918/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
1919#[inline]
1920#[target_feature(enable = "avx512fp16")]
1921#[cfg_attr(test, assert_instr(vsubsh))]
1922#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1923pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1924    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
1925}
1926
1927/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1928/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1929/// writemask k (the element is copied from src when mask bit 0 is not set).
1930///
1931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1932#[inline]
1933#[target_feature(enable = "avx512fp16")]
1934#[cfg_attr(test, assert_instr(vsubsh))]
1935#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1936pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1937    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
1938}
1939
1940/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1941/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1942/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1943///
1944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1945#[inline]
1946#[target_feature(enable = "avx512fp16")]
1947#[cfg_attr(test, assert_instr(vsubsh))]
1948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1949pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1950    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
1951}
1952
1953/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1954///
1955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
1956#[inline]
1957#[target_feature(enable = "avx512fp16,avx512vl")]
1958#[cfg_attr(test, assert_instr(vmulph))]
1959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1960pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
1961    unsafe { simd_mul(a, b) }
1962}
1963
1964/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1965/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1966///
1967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
1968#[inline]
1969#[target_feature(enable = "avx512fp16,avx512vl")]
1970#[cfg_attr(test, assert_instr(vmulph))]
1971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1972pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1973    unsafe {
1974        let r = _mm_mul_ph(a, b);
1975        simd_select_bitmask(k, r, src)
1976    }
1977}
1978
1979/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1980/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1981///
1982/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
1983#[inline]
1984#[target_feature(enable = "avx512fp16,avx512vl")]
1985#[cfg_attr(test, assert_instr(vmulph))]
1986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1987pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1988    unsafe {
1989        let r = _mm_mul_ph(a, b);
1990        simd_select_bitmask(k, r, _mm_setzero_ph())
1991    }
1992}
1993
1994/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1995///
1996/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
1997#[inline]
1998#[target_feature(enable = "avx512fp16,avx512vl")]
1999#[cfg_attr(test, assert_instr(vmulph))]
2000#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2001pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2002    unsafe { simd_mul(a, b) }
2003}
2004
2005/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2006/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2007///
2008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2009#[inline]
2010#[target_feature(enable = "avx512fp16,avx512vl")]
2011#[cfg_attr(test, assert_instr(vmulph))]
2012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2013pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2014    unsafe {
2015        let r = _mm256_mul_ph(a, b);
2016        simd_select_bitmask(k, r, src)
2017    }
2018}
2019
2020/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2021/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2022///
2023/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2024#[inline]
2025#[target_feature(enable = "avx512fp16,avx512vl")]
2026#[cfg_attr(test, assert_instr(vmulph))]
2027#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2028pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2029    unsafe {
2030        let r = _mm256_mul_ph(a, b);
2031        simd_select_bitmask(k, r, _mm256_setzero_ph())
2032    }
2033}
2034
2035/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2036///
2037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
2038#[inline]
2039#[target_feature(enable = "avx512fp16")]
2040#[cfg_attr(test, assert_instr(vmulph))]
2041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2042pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2043    unsafe { simd_mul(a, b) }
2044}
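
// Illustrative sketch only (not part of stdarch's API): a common pattern is scaling a
// whole vector by a broadcast factor. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_scale_by_half(v: __m512h) -> __m512h {
    // Multiply all 32 lanes by 0.5.
    _mm512_mul_ph(v, _mm512_set1_ph(0.5))
}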
2045
2046/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2047/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2048///
2049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2050#[inline]
2051#[target_feature(enable = "avx512fp16")]
2052#[cfg_attr(test, assert_instr(vmulph))]
2053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2054pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2055    unsafe {
2056        let r = _mm512_mul_ph(a, b);
2057        simd_select_bitmask(k, r, src)
2058    }
2059}
2060
2061/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2062/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2063///
2064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2065#[inline]
2066#[target_feature(enable = "avx512fp16")]
2067#[cfg_attr(test, assert_instr(vmulph))]
2068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2069pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2070    unsafe {
2071        let r = _mm512_mul_ph(a, b);
2072        simd_select_bitmask(k, r, _mm512_setzero_ph())
2073    }
2074}
2075
2076/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2077/// Rounding is done according to the rounding parameter, which can be one of:
2078///
2079/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2080/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2081/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2082/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2083/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2084///
2085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2086#[inline]
2087#[target_feature(enable = "avx512fp16")]
2088#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2089#[rustc_legacy_const_generics(2)]
2090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2091pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2092    unsafe {
2093        static_assert_rounding!(ROUNDING);
2094        vmulph(a, b, ROUNDING)
2095    }
2096}
2097
2098/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2099/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2100/// Rounding is done according to the rounding parameter, which can be one of:
2101///
2102/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2103/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2104/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2105/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2106/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2107///
2108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2109#[inline]
2110#[target_feature(enable = "avx512fp16")]
2111#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2112#[rustc_legacy_const_generics(4)]
2113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2114pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2115    src: __m512h,
2116    k: __mmask32,
2117    a: __m512h,
2118    b: __m512h,
2119) -> __m512h {
2120    unsafe {
2121        static_assert_rounding!(ROUNDING);
2122        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2123        simd_select_bitmask(k, r, src)
2124    }
2125}
2126
2127/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2128/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2129/// Rounding is done according to the rounding parameter, which can be one of:
2130///
2131/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2132/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2133/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2134/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2135/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2136///
2137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2138#[inline]
2139#[target_feature(enable = "avx512fp16")]
2140#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2141#[rustc_legacy_const_generics(3)]
2142#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2143pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2144    k: __mmask32,
2145    a: __m512h,
2146    b: __m512h,
2147) -> __m512h {
2148    unsafe {
2149        static_assert_rounding!(ROUNDING);
2150        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2151        simd_select_bitmask(k, r, _mm512_setzero_ph())
2152    }
2153}
2154
2155/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2156/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2157/// Rounding is done according to the rounding parameter, which can be one of:
2158///
2159/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2160/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2161/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2162/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2163/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2164///
2165/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2166#[inline]
2167#[target_feature(enable = "avx512fp16")]
2168#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2169#[rustc_legacy_const_generics(2)]
2170#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2171pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2172    static_assert_rounding!(ROUNDING);
2173    _mm_mask_mul_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2174}
2175
2176/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2177/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2178/// writemask k (the element is copied from src when mask bit 0 is not set).
2179/// Rounding is done according to the rounding parameter, which can be one of:
2180///
2181/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2182/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2183/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2184/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2185/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2186///
2187/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2188#[inline]
2189#[target_feature(enable = "avx512fp16")]
2190#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2191#[rustc_legacy_const_generics(4)]
2192#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2193pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2194    src: __m128h,
2195    k: __mmask8,
2196    a: __m128h,
2197    b: __m128h,
2198) -> __m128h {
2199    unsafe {
2200        static_assert_rounding!(ROUNDING);
2201        vmulsh(a, b, src, k, ROUNDING)
2202    }
2203}
2204
2205/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2206/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2207/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2208/// Rounding is done according to the rounding parameter, which can be one of:
2209///
2210/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2211/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2212/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2213/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2214/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2215///
2216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2217#[inline]
2218#[target_feature(enable = "avx512fp16")]
2219#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2220#[rustc_legacy_const_generics(3)]
2221#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2222pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2223    static_assert_rounding!(ROUNDING);
2224    _mm_mask_mul_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2225}
2226
2227/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2228/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2229///
2230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2231#[inline]
2232#[target_feature(enable = "avx512fp16")]
2233#[cfg_attr(test, assert_instr(vmulsh))]
2234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2235pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2236    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2237}
2238
2239/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2240/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2241/// writemask k (the element is copied from src when mask bit 0 is not set).
2242///
2243/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2244#[inline]
2245#[target_feature(enable = "avx512fp16")]
2246#[cfg_attr(test, assert_instr(vmulsh))]
2247#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2248pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2249    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2250}
2251
2252/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2253/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2254/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2255///
2256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2257#[inline]
2258#[target_feature(enable = "avx512fp16")]
2259#[cfg_attr(test, assert_instr(vmulsh))]
2260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2261pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2262    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2263}
2264
2265/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2266///
2267/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
2268#[inline]
2269#[target_feature(enable = "avx512fp16,avx512vl")]
2270#[cfg_attr(test, assert_instr(vdivph))]
2271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2272pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2273    unsafe { simd_div(a, b) }
2274}
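
// Illustrative sketch only (not part of stdarch's API): the division follows IEEE 754
// semantics per lane, so with the default MXCSR settings dividing a finite non-zero value
// by 0.0 yields an infinity rather than trapping. The helper name is ours.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_div_ph(a: __m128h, b: __m128h) -> __m128h {
    // Each lane computes a / b.
    _mm_div_ph(a, b)
}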
2275
2276/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2277/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2278///
2279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2280#[inline]
2281#[target_feature(enable = "avx512fp16,avx512vl")]
2282#[cfg_attr(test, assert_instr(vdivph))]
2283#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2284pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2285    unsafe {
2286        let r = _mm_div_ph(a, b);
2287        simd_select_bitmask(k, r, src)
2288    }
2289}
2290
2291/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2292/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2293///
2294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2295#[inline]
2296#[target_feature(enable = "avx512fp16,avx512vl")]
2297#[cfg_attr(test, assert_instr(vdivph))]
2298#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2299pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2300    unsafe {
2301        let r = _mm_div_ph(a, b);
2302        simd_select_bitmask(k, r, _mm_setzero_ph())
2303    }
2304}
2305
2306/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2307///
2308/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2309#[inline]
2310#[target_feature(enable = "avx512fp16,avx512vl")]
2311#[cfg_attr(test, assert_instr(vdivph))]
2312#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2313pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2314    unsafe { simd_div(a, b) }
2315}
2316
2317/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2318/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2319///
2320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2321#[inline]
2322#[target_feature(enable = "avx512fp16,avx512vl")]
2323#[cfg_attr(test, assert_instr(vdivph))]
2324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2325pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2326    unsafe {
2327        let r = _mm256_div_ph(a, b);
2328        simd_select_bitmask(k, r, src)
2329    }
2330}
2331
2332/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2333/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2334///
2335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2336#[inline]
2337#[target_feature(enable = "avx512fp16,avx512vl")]
2338#[cfg_attr(test, assert_instr(vdivph))]
2339#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2340pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2341    unsafe {
2342        let r = _mm256_div_ph(a, b);
2343        simd_select_bitmask(k, r, _mm256_setzero_ph())
2344    }
2345}
2346
2347/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2348///
2349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2350#[inline]
2351#[target_feature(enable = "avx512fp16")]
2352#[cfg_attr(test, assert_instr(vdivph))]
2353#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2354pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2355    unsafe { simd_div(a, b) }
2356}
2357
2358/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2359/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2360///
2361/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2362#[inline]
2363#[target_feature(enable = "avx512fp16")]
2364#[cfg_attr(test, assert_instr(vdivph))]
2365#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2366pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2367    unsafe {
2368        let r = _mm512_div_ph(a, b);
2369        simd_select_bitmask(k, r, src)
2370    }
2371}
2372
2373/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2374/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2375///
2376/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2377#[inline]
2378#[target_feature(enable = "avx512fp16")]
2379#[cfg_attr(test, assert_instr(vdivph))]
2380#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2381pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2382    unsafe {
2383        let r = _mm512_div_ph(a, b);
2384        simd_select_bitmask(k, r, _mm512_setzero_ph())
2385    }
2386}
2387
2388/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2389/// Rounding is done according to the rounding parameter, which can be one of:
2390///
2391/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2392/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2393/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2394/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2395/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2396///
2397/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
2398#[inline]
2399#[target_feature(enable = "avx512fp16")]
2400#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2401#[rustc_legacy_const_generics(2)]
2402#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2403pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2404    unsafe {
2405        static_assert_rounding!(ROUNDING);
2406        vdivph(a, b, ROUNDING)
2407    }
2408}
2409
2410/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2411/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2412/// Rounding is done according to the rounding parameter, which can be one of:
2413///
2414/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2415/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2416/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2417/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2418/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2419///
2420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2421#[inline]
2422#[target_feature(enable = "avx512fp16")]
2423#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2424#[rustc_legacy_const_generics(4)]
2425#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2426pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2427    src: __m512h,
2428    k: __mmask32,
2429    a: __m512h,
2430    b: __m512h,
2431) -> __m512h {
2432    unsafe {
2433        static_assert_rounding!(ROUNDING);
2434        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2435        simd_select_bitmask(k, r, src)
2436    }
2437}
2438
2439/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2440/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2441/// Rounding is done according to the rounding parameter, which can be one of:
2442///
2443/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2444/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2445/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2446/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2447/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2448///
2449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2450#[inline]
2451#[target_feature(enable = "avx512fp16")]
2452#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2453#[rustc_legacy_const_generics(3)]
2454#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2455pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2456    k: __mmask32,
2457    a: __m512h,
2458    b: __m512h,
2459) -> __m512h {
2460    unsafe {
2461        static_assert_rounding!(ROUNDING);
2462        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2463        simd_select_bitmask(k, r, _mm512_setzero_ph())
2464    }
2465}
2466
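// Minimal sketch (hypothetical helper, not upstream code) of how the rounding mode is supplied.
// `ROUNDING` must be a compile-time constant, so it is passed as a const generic; here the
// quotient is rounded toward zero with exceptions suppressed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn div_round_toward_zero_sketch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_div_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}
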
2467/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2468/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2469/// Rounding is done according to the rounding parameter, which can be one of:
2470///
2471/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2472/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2473/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2474/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2475/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2476///
2477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
2478#[inline]
2479#[target_feature(enable = "avx512fp16")]
2480#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2481#[rustc_legacy_const_generics(2)]
2482#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2483pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2484    static_assert_rounding!(ROUNDING);
2485    _mm_mask_div_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2486}
2487
2488/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2489/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2490/// using writemask k (the element is copied from src when mask bit 0 is not set).
2491/// Rounding is done according to the rounding parameter, which can be one of:
2492///
2493/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2494/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2495/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2496/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2497/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2498///
2499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2500#[inline]
2501#[target_feature(enable = "avx512fp16")]
2502#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2503#[rustc_legacy_const_generics(4)]
2504#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2505pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2506    src: __m128h,
2507    k: __mmask8,
2508    a: __m128h,
2509    b: __m128h,
2510) -> __m128h {
2511    unsafe {
2512        static_assert_rounding!(ROUNDING);
2513        vdivsh(a, b, src, k, ROUNDING)
2514    }
2515}
2516
2517/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2518/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2519/// using zeromask k (the element is zeroed out when mask bit 0 is not set).
2520/// Rounding is done according to the rounding parameter, which can be one of:
2521///
2522/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2523/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2524/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2525/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2526/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2527///
2528/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2529#[inline]
2530#[target_feature(enable = "avx512fp16")]
2531#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2532#[rustc_legacy_const_generics(3)]
2533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2534pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2535    static_assert_rounding!(ROUNDING);
2536    _mm_mask_div_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2537}
2538
2539/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2540/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2541///
2542/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
2543#[inline]
2544#[target_feature(enable = "avx512fp16")]
2545#[cfg_attr(test, assert_instr(vdivsh))]
2546#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2547pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2548    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
2549}
2550
2551/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2552/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2553/// using writemask k (the element is copied from src when mask bit 0 is not set).
2554///
2555/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2556#[inline]
2557#[target_feature(enable = "avx512fp16")]
2558#[cfg_attr(test, assert_instr(vdivsh))]
2559#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2560pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2561    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2562}
2563
2564/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2565/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2566/// using zeromask k (the element is zeroed out when mask bit 0 is not set).
2567///
2568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2569#[inline]
2570#[target_feature(enable = "avx512fp16")]
2571#[cfg_attr(test, assert_instr(vdivsh))]
2572#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2573pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2574    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
2575}
2576
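// Illustrative sketch (hypothetical helper, not upstream code). The `_sh` forms touch only
// element 0: the quotient 8.0 / 4.0 = 2.0 lands in lane 0 of dst, while lanes 1..=7 are
// copied unchanged from `a`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn div_sh_sketch() -> __m128h {
    let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 8.0); // lane 0 holds 8.0
    let b = _mm_set_sh(4.0);
    // dst lanes, from element 0 upward: [2.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    _mm_div_sh(a, b)
}
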
2577/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2578/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2579/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2580///
2581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
2582#[inline]
2583#[target_feature(enable = "avx512fp16,avx512vl")]
2584#[cfg_attr(test, assert_instr(vfmulcph))]
2585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2586pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2587    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2588}
2589
2590/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2591/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2592/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2593///
2594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
2595#[inline]
2596#[target_feature(enable = "avx512fp16,avx512vl")]
2597#[cfg_attr(test, assert_instr(vfmulcph))]
2598#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2599pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2600    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2601}
2602
2603/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2604/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2605/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2606///
2607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2608#[inline]
2609#[target_feature(enable = "avx512fp16,avx512vl")]
2610#[cfg_attr(test, assert_instr(vfmulcph))]
2611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2612pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2613    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2614}
2615
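// Illustrative sketch (hypothetical helper, not upstream code). Complex operands are
// interleaved (re, im) pairs of f16, so multiplying (1 + 2i) by (3 + 4i) = -5 + 10i leaves
// [-5.0, 10.0] in the first pair of dst; the remaining three pairs are 0 * 0 = 0 here.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn mul_pch_sketch() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // (1 + 2i) in the low pair
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // (3 + 4i) in the low pair
    _mm_mul_pch(a, b)
}
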
2616/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2617/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2618/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2619///
2620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2621#[inline]
2622#[target_feature(enable = "avx512fp16,avx512vl")]
2623#[cfg_attr(test, assert_instr(vfmulcph))]
2624#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2625pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2626    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2627}
2628
2629/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2630/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2631/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2632///
2633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2634#[inline]
2635#[target_feature(enable = "avx512fp16,avx512vl")]
2636#[cfg_attr(test, assert_instr(vfmulcph))]
2637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2638pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2639    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2640}
2641
2642/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2643/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2644/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2645///
2646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2647#[inline]
2648#[target_feature(enable = "avx512fp16,avx512vl")]
2649#[cfg_attr(test, assert_instr(vfmulcph))]
2650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2651pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2652    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2653}
2654
2655/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2656/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2657/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2658///
2659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2660#[inline]
2661#[target_feature(enable = "avx512fp16")]
2662#[cfg_attr(test, assert_instr(vfmulcph))]
2663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2664pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2665    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2666}
2667
2668/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2669/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2670/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2671///
2672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2673#[inline]
2674#[target_feature(enable = "avx512fp16")]
2675#[cfg_attr(test, assert_instr(vfmulcph))]
2676#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2677pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2678    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2679}
2680
2681/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2682/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2683/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2684///
2685/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2686#[inline]
2687#[target_feature(enable = "avx512fp16")]
2688#[cfg_attr(test, assert_instr(vfmulcph))]
2689#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2690pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2691    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2692}
2693
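// Minimal sketch (hypothetical helper, not upstream code) of the masking granularity for the
// complex forms: each bit of the __mmask16 controls one (re, im) pair, so clearing bit 0
// zeroes both halves of the first complex product while the other 15 products are computed
// normally.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn maskz_mul_pch_sketch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_maskz_mul_pch(0xFFFE, a, b)
}
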
2694/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2695/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2696/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2697///
2698/// Rounding is done according to the rounding parameter, which can be one of:
2699///
2700/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2701/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2702/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2703/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2704/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2705///
2706/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
2707#[inline]
2708#[target_feature(enable = "avx512fp16")]
2709#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2710#[rustc_legacy_const_generics(2)]
2711#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2712pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2713    static_assert_rounding!(ROUNDING);
2714    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2715}
2716
2717/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2718/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2719/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2720///
2721/// Rounding is done according to the rounding parameter, which can be one of:
2722///
2723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2728///
2729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2730#[inline]
2731#[target_feature(enable = "avx512fp16")]
2732#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2733#[rustc_legacy_const_generics(4)]
2734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2735pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2736    src: __m512h,
2737    k: __mmask16,
2738    a: __m512h,
2739    b: __m512h,
2740) -> __m512h {
2741    unsafe {
2742        static_assert_rounding!(ROUNDING);
2743        transmute(vfmulcph_512(
2744            transmute(a),
2745            transmute(b),
2746            transmute(src),
2747            k,
2748            ROUNDING,
2749        ))
2750    }
2751}
2752
2753/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2754/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2755/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2756///
2757/// Rounding is done according to the rounding parameter, which can be one of:
2758///
2759/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2760/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2761/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2762/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2763/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2764///
2765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2766#[inline]
2767#[target_feature(enable = "avx512fp16")]
2768#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2769#[rustc_legacy_const_generics(3)]
2770#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2771pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2772    k: __mmask16,
2773    a: __m512h,
2774    b: __m512h,
2775) -> __m512h {
2776    static_assert_rounding!(ROUNDING);
2777    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2778}
2779
2780/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2781/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2782/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2783/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2784///
2785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
2786#[inline]
2787#[target_feature(enable = "avx512fp16")]
2788#[cfg_attr(test, assert_instr(vfmulcsh))]
2789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2790pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2791    _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b)
2792}
2793
2794/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2795/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2796/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2797/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2798///
2799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2800#[inline]
2801#[target_feature(enable = "avx512fp16")]
2802#[cfg_attr(test, assert_instr(vfmulcsh))]
2803#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2804pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2805    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2806}
2807
2808/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2809/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2810/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2811/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2812///
2813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2814#[inline]
2815#[target_feature(enable = "avx512fp16")]
2816#[cfg_attr(test, assert_instr(vfmulcsh))]
2817#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2818pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2819    _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b)
2820}
2821
2822/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2823/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2824/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2825/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2826///
2827/// Rounding is done according to the rounding parameter, which can be one of:
2828///
2829/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2830/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2831/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2832/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2833/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2834///
2835/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2836#[inline]
2837#[target_feature(enable = "avx512fp16")]
2838#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2839#[rustc_legacy_const_generics(2)]
2840#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2841pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2842    static_assert_rounding!(ROUNDING);
2843    _mm_mask_mul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
2844}
2845
2846/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2847/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2848/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2849/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2850///
2851/// Rounding is done according to the rounding parameter, which can be one of:
2852///
2853/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2854/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2855/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2856/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2858///
2859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2860#[inline]
2861#[target_feature(enable = "avx512fp16")]
2862#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2863#[rustc_legacy_const_generics(4)]
2864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2865pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2866    src: __m128h,
2867    k: __mmask8,
2868    a: __m128h,
2869    b: __m128h,
2870) -> __m128h {
2871    unsafe {
2872        static_assert_rounding!(ROUNDING);
2873        transmute(vfmulcsh(
2874            transmute(a),
2875            transmute(b),
2876            transmute(src),
2877            k,
2878            ROUNDING,
2879        ))
2880    }
2881}
2882
2883/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2884/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2885/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2886/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2887///
2888/// Rounding is done according to the rounding parameter, which can be one of:
2889///
2890/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2891/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2892/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2893/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2894/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2895///
2896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
2897#[inline]
2898#[target_feature(enable = "avx512fp16")]
2899#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2900#[rustc_legacy_const_generics(3)]
2901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2902pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
2903    k: __mmask8,
2904    a: __m128h,
2905    b: __m128h,
2906) -> __m128h {
2907    static_assert_rounding!(ROUNDING);
2908    _mm_mask_mul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
2909}
2910
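// Minimal sketch (hypothetical helper, not upstream code) combining a merge mask with an
// explicit rounding mode on the scalar complex multiply. If bit 0 of `k` is clear, the low
// (re, im) pair comes from `src`; the upper six f16 elements always come from `a`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn mask_mul_round_sch_sketch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(src, k, a, b)
}
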
2911/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2912/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2913/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2914///
2915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
2916#[inline]
2917#[target_feature(enable = "avx512fp16,avx512vl")]
2918#[cfg_attr(test, assert_instr(vfmulcph))]
2919#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2920pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
2921    _mm_mul_pch(a, b)
2922}
2923
2924/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2925/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2926/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2927///
2928/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
2929#[inline]
2930#[target_feature(enable = "avx512fp16,avx512vl")]
2931#[cfg_attr(test, assert_instr(vfmulcph))]
2932#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2933pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2934    _mm_mask_mul_pch(src, k, a, b)
2935}
2936
2937/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2938/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2939/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2940///
2941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
2942#[inline]
2943#[target_feature(enable = "avx512fp16,avx512vl")]
2944#[cfg_attr(test, assert_instr(vfmulcph))]
2945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2946pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2947    _mm_maskz_mul_pch(k, a, b)
2948}
2949
2950/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2951/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2952/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2953///
2954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
2955#[inline]
2956#[target_feature(enable = "avx512fp16,avx512vl")]
2957#[cfg_attr(test, assert_instr(vfmulcph))]
2958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2959pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
2960    _mm256_mul_pch(a, b)
2961}
2962
2963/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2964/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2965/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2966///
2967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
2968#[inline]
2969#[target_feature(enable = "avx512fp16,avx512vl")]
2970#[cfg_attr(test, assert_instr(vfmulcph))]
2971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2972pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2973    _mm256_mask_mul_pch(src, k, a, b)
2974}
2975
2976/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2977/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
2978/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2979///
2980/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
2981#[inline]
2982#[target_feature(enable = "avx512fp16,avx512vl")]
2983#[cfg_attr(test, assert_instr(vfmulcph))]
2984#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2985pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2986    _mm256_maskz_mul_pch(k, a, b)
2987}
2988
2989/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
2990/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2991///
2992/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
2993#[inline]
2994#[target_feature(enable = "avx512fp16")]
2995#[cfg_attr(test, assert_instr(vfmulcph))]
2996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2997pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
2998    _mm512_mul_pch(a, b)
2999}
3000
3001/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3002/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3003/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3004///
3005/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3006#[inline]
3007#[target_feature(enable = "avx512fp16")]
3008#[cfg_attr(test, assert_instr(vfmulcph))]
3009#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3010pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3011    _mm512_mask_mul_pch(src, k, a, b)
3012}
3013
3014/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3015/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3016/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3017///
3018/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3019#[inline]
3020#[target_feature(enable = "avx512fp16")]
3021#[cfg_attr(test, assert_instr(vfmulcph))]
3022#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3023pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3024    _mm512_maskz_mul_pch(k, a, b)
3025}
3026
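// As the bodies above show, the `fmul` intrinsics simply forward to the corresponding `mul`
// intrinsics; Intel exposes both names for the same vfmulcph operation. A hypothetical helper
// (not part of the public API) can treat them interchangeably:
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fmul_is_mul_sketch(a: __m512h, b: __m512h) -> (__m512h, __m512h) {
    (_mm512_fmul_pch(a, b), _mm512_mul_pch(a, b)) // the two results are identical
}
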
3027/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3028/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3029/// Rounding is done according to the rounding parameter, which can be one of:
3030///
3031/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3032/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3033/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3034/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3035/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3036///
3037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
3038#[inline]
3039#[target_feature(enable = "avx512fp16")]
3040#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3041#[rustc_legacy_const_generics(2)]
3042#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3043pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3044    static_assert_rounding!(ROUNDING);
3045    _mm512_mul_round_pch::<ROUNDING>(a, b)
3046}
3047
3048/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3049/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3050/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3051/// Rounding is done according to the rounding parameter, which can be one of:
3052///
3053/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3054/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3055/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3056/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3057/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3058///
3059/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
3060#[inline]
3061#[target_feature(enable = "avx512fp16")]
3062#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3063#[rustc_legacy_const_generics(4)]
3064#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3065pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
3066    src: __m512h,
3067    k: __mmask16,
3068    a: __m512h,
3069    b: __m512h,
3070) -> __m512h {
3071    static_assert_rounding!(ROUNDING);
3072    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
3073}
3074
3075/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3076/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3077/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3078/// Rounding is done according to the rounding parameter, which can be one of:
3079///
3080/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3081/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3082/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3083/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3084/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3085///
3086/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
3087#[inline]
3088#[target_feature(enable = "avx512fp16")]
3089#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3090#[rustc_legacy_const_generics(3)]
3091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3092pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
3093    k: __mmask16,
3094    a: __m512h,
3095    b: __m512h,
3096) -> __m512h {
3097    static_assert_rounding!(ROUNDING);
3098    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
3099}
3100
3101/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst, and copy the
3102/// upper 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
3103/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3104///
3105/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
3106#[inline]
3107#[target_feature(enable = "avx512fp16")]
3108#[cfg_attr(test, assert_instr(vfmulcsh))]
3109#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3110pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
3111    _mm_mul_sch(a, b)
3112}
3113
3114/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst using writemask k
3115/// (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed elements from a to the
3116/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3117///
3118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
3119#[inline]
3120#[target_feature(enable = "avx512fp16")]
3121#[cfg_attr(test, assert_instr(vfmulcsh))]
3122#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3123pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3124    _mm_mask_mul_sch(src, k, a, b)
3125}
3126
3127/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst using zeromask k
3128/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from a to the
3129/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3130///
3131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
3132#[inline]
3133#[target_feature(enable = "avx512fp16")]
3134#[cfg_attr(test, assert_instr(vfmulcsh))]
3135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3136pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3137    _mm_maskz_mul_sch(k, a, b)
3138}
3139
3140/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst, and copy the upper 6
3141/// packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3142///
3143/// Rounding is done according to the rounding parameter, which can be one of:
3144///
3145/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3146/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3147/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3148/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3149/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3150///
3151/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
3152#[inline]
3153#[target_feature(enable = "avx512fp16")]
3154#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3155#[rustc_legacy_const_generics(2)]
3156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3157pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3158    static_assert_rounding!(ROUNDING);
3159    _mm_mul_round_sch::<ROUNDING>(a, b)
3160}
3161
3162/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst using writemask k
3163/// (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed elements from a to the
3164/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3165///
3166/// Rounding is done according to the rounding parameter, which can be one of:
3167///
3168/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3169/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3170/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3171/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3172/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3173///
3174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
3175#[inline]
3176#[target_feature(enable = "avx512fp16")]
3177#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3178#[rustc_legacy_const_generics(4)]
3179#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3180pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
3181    src: __m128h,
3182    k: __mmask8,
3183    a: __m128h,
3184    b: __m128h,
3185) -> __m128h {
3186    static_assert_rounding!(ROUNDING);
3187    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
3188}
3189
3190/// Multiply the lower complex numbers in a and b, store the result in the lower elements of dst using zeromask k
3191/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements from a to the
3192/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3193///
3194/// Rounding is done according to the rounding parameter, which can be one of:
3195///
3196/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3197/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3198/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3199/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3200/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3201///
3202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
3203#[inline]
3204#[target_feature(enable = "avx512fp16")]
3205#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3206#[rustc_legacy_const_generics(3)]
3207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3208pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
3209    k: __mmask8,
3210    a: __m128h,
3211    b: __m128h,
3212) -> __m128h {
3213    static_assert_rounding!(ROUNDING);
3214    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
3215}
3216
3217/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3218/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3219/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3220/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3221///
3222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
3223#[inline]
3224#[target_feature(enable = "avx512fp16,avx512vl")]
3225#[cfg_attr(test, assert_instr(vfcmulcph))]
3226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3227pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3228    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3229}
3230
3231/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3232/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3233/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3234/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3235///
3236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
3237#[inline]
3238#[target_feature(enable = "avx512fp16,avx512vl")]
3239#[cfg_attr(test, assert_instr(vfcmulcph))]
3240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3241pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3242    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
3243}
3244
3245/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3246/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3247/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3248/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3249///
3250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3251#[inline]
3252#[target_feature(enable = "avx512fp16,avx512vl")]
3253#[cfg_attr(test, assert_instr(vfcmulcph))]
3254#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3255pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3256    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3257}
3258
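// Illustrative sketch (hypothetical helper, not upstream code). `cmul` conjugates the second
// operand, so (1 + 2i) * conj(3 + 4i) = (1 + 2i)(3 - 4i) = 11 + 2i, leaving [11.0, 2.0] in
// the first (re, im) pair of dst.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn cmul_pch_sketch() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // (1 + 2i) in the low pair
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // (3 + 4i) in the low pair
    _mm_cmul_pch(a, b)
}
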
3259/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3260/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3261/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3262/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3263///
3264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3265#[inline]
3266#[target_feature(enable = "avx512fp16,avx512vl")]
3267#[cfg_attr(test, assert_instr(vfcmulcph))]
3268#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3269pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3270    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3271}
3272
3273/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3274/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3275/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3276/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3277///
3278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
3279#[inline]
3280#[target_feature(enable = "avx512fp16,avx512vl")]
3281#[cfg_attr(test, assert_instr(vfcmulcph))]
3282#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3283pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3284    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
3285}
3286
3287/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3288/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3289/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3290/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3291///
3292/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3293#[inline]
3294#[target_feature(enable = "avx512fp16,avx512vl")]
3295#[cfg_attr(test, assert_instr(vfcmulcph))]
3296#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3297pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3298    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3299}
3300
3301/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3302/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3303/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3304/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3305///
3306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3307#[inline]
3308#[target_feature(enable = "avx512fp16")]
3309#[cfg_attr(test, assert_instr(vfcmulcph))]
3310#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3311pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3312    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3313}
3314
3315/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3316/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3317/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3318/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3319///
3320/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
3321#[inline]
3322#[target_feature(enable = "avx512fp16")]
3323#[cfg_attr(test, assert_instr(vfcmulcph))]
3324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3325pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3326    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3327}
3328
3329/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3330/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3331/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3332/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3333///
3334/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3335#[inline]
3336#[target_feature(enable = "avx512fp16")]
3337#[cfg_attr(test, assert_instr(vfcmulcph))]
3338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3339pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3340    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3341}
3342
3343/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3344/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3345/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3346/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3347///
3348/// Rounding is done according to the rounding parameter, which can be one of:
3349///
3350/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3351/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3352/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3353/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3354/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3355///
3356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
3357#[inline]
3358#[target_feature(enable = "avx512fp16")]
3359#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3360#[rustc_legacy_const_generics(2)]
3361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3362pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3363    static_assert_rounding!(ROUNDING);
3364    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3365}
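
// Illustrative sketch (not part of the original source): how the `ROUNDING` const generic is
// supplied at a call site. `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` is the first
// combination listed in the doc comment above; assumes the same nightly/AVX512-FP16 setup and
// the `_mm512_set1_ph` helper from this module.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cmul_round_pch_sketch() {
    let a = _mm512_set1_ph(1.0); // every complex pair is 1 + 1i
    let b = _mm512_set1_ph(2.0); // every complex pair is 2 + 2i
    // a * conj(b) = 4 + 0i in each pair, rounded to nearest with exceptions suppressed.
    let _r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
}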
3366
3367/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3368/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3369/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3370/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3371///
3372/// Rounding is done according to the rounding parameter, which can be one of:
3373///
3374/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3375/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3376/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3377/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3378/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3379///
3380/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3381#[inline]
3382#[target_feature(enable = "avx512fp16")]
3383#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3384#[rustc_legacy_const_generics(4)]
3385#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3386pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3387    src: __m512h,
3388    k: __mmask16,
3389    a: __m512h,
3390    b: __m512h,
3391) -> __m512h {
3392    unsafe {
3393        static_assert_rounding!(ROUNDING);
3394        transmute(vfcmulcph_512(
3395            transmute(a),
3396            transmute(b),
3397            transmute(src),
3398            k,
3399            ROUNDING,
3400        ))
3401    }
3402}
3403
3404/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3405/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3406/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3407/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3408///
3409/// Rounding is done according to the rounding parameter, which can be one of:
3410///
3411/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3412/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3413/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3414/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3415/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3416///
3417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3418#[inline]
3419#[target_feature(enable = "avx512fp16")]
3420#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3421#[rustc_legacy_const_generics(3)]
3422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3423pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3424    k: __mmask16,
3425    a: __m512h,
3426    b: __m512h,
3427) -> __m512h {
3428    static_assert_rounding!(ROUNDING);
3429    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3430}
3431
3432/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3433/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3434/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3435///
3436/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3437#[inline]
3438#[target_feature(enable = "avx512fp16")]
3439#[cfg_attr(test, assert_instr(vfcmulcsh))]
3440#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3441pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3442    _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b)
3443}
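
// Illustrative sketch (not part of the original source): the scalar form only multiplies the
// lowest complex pair. With a = 1 + 2i and b = 3 + 4i, a * conj(b) = (1*3 + 2*4) + i*(2*3 - 1*4)
// = 11 + 2i. Assumes the usual nightly/AVX512-FP16 setup.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn cmul_sch_sketch() {
    // `_mm_set_ph` takes its arguments from element 7 down to element 0.
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
    let _r = _mm_cmul_sch(a, b); // the lowest pair of `_r` holds (11.0, 2.0)
}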
3444
3445/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3446/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3447/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3448/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3449///
3450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
3451#[inline]
3452#[target_feature(enable = "avx512fp16")]
3453#[cfg_attr(test, assert_instr(vfcmulcsh))]
3454#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3455pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3456    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3457}
3458
3459/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3460/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3461/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3462/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3463///
3464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3465#[inline]
3466#[target_feature(enable = "avx512fp16")]
3467#[cfg_attr(test, assert_instr(vfcmulcsh))]
3468#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3469pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3470    _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b)
3471}
3472
3473/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3474/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3475/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3476///
3477/// Rounding is done according to the rounding parameter, which can be one of:
3478///
3479/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3480/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3481/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3482/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3483/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3484///
3485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3486#[inline]
3487#[target_feature(enable = "avx512fp16")]
3488#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3489#[rustc_legacy_const_generics(2)]
3490#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3491pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3492    static_assert_rounding!(ROUNDING);
3493    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
3494}
3495
3496/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3497/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3498/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3499/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3500///
3501/// Rounding is done according to the rounding parameter, which can be one of:
3502///
3503/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3504/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3505/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3506/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3507/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3508///
3509/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3510#[inline]
3511#[target_feature(enable = "avx512fp16")]
3512#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3513#[rustc_legacy_const_generics(4)]
3514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3515pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3516    src: __m128h,
3517    k: __mmask8,
3518    a: __m128h,
3519    b: __m128h,
3520) -> __m128h {
3521    unsafe {
3522        static_assert_rounding!(ROUNDING);
3523        transmute(vfcmulcsh(
3524            transmute(a),
3525            transmute(b),
3526            transmute(src),
3527            k,
3528            ROUNDING,
3529        ))
3530    }
3531}
3532
3533/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3534/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3535/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3536/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3537///
3538/// Rounding is done according to the rounding parameter, which can be one of:
3539///
3540/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3541/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3542/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3543/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3544/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3545///
3546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
3547#[inline]
3548#[target_feature(enable = "avx512fp16")]
3549#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3550#[rustc_legacy_const_generics(3)]
3551#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3552pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
3553    k: __mmask8,
3554    a: __m128h,
3555    b: __m128h,
3556) -> __m128h {
3557    static_assert_rounding!(ROUNDING);
3558    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
3559}
3560
3561/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3562/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3563/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3564/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3565///
3566/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
3567#[inline]
3568#[target_feature(enable = "avx512fp16,avx512vl")]
3569#[cfg_attr(test, assert_instr(vfcmulcph))]
3570#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3571pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
3572    _mm_cmul_pch(a, b)
3573}
3574
3575/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3576/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3577/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3578/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3579///
3580/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
3581#[inline]
3582#[target_feature(enable = "avx512fp16,avx512vl")]
3583#[cfg_attr(test, assert_instr(vfcmulcph))]
3584#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3585pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3586    _mm_mask_cmul_pch(src, k, a, b)
3587}
3588
3589/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3590/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3591/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3592/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3593///
3594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
3595#[inline]
3596#[target_feature(enable = "avx512fp16,avx512vl")]
3597#[cfg_attr(test, assert_instr(vfcmulcph))]
3598#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3599pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3600    _mm_maskz_cmul_pch(k, a, b)
3601}
3602
3603/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3604/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3605/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3606/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3607///
3608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
3609#[inline]
3610#[target_feature(enable = "avx512fp16,avx512vl")]
3611#[cfg_attr(test, assert_instr(vfcmulcph))]
3612#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3613pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
3614    _mm256_cmul_pch(a, b)
3615}
3616
3617/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3618/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3619/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3620/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3621///
3622/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
3623#[inline]
3624#[target_feature(enable = "avx512fp16,avx512vl")]
3625#[cfg_attr(test, assert_instr(vfcmulcph))]
3626#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3627pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3628    _mm256_mask_cmul_pch(src, k, a, b)
3629}
3630
3631/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3632/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3633/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3634/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3635///
3636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
3637#[inline]
3638#[target_feature(enable = "avx512fp16,avx512vl")]
3639#[cfg_attr(test, assert_instr(vfcmulcph))]
3640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3641pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3642    _mm256_maskz_cmul_pch(k, a, b)
3643}
3644
3645/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3646/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3647/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3648/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3649///
3650/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
3651#[inline]
3652#[target_feature(enable = "avx512fp16")]
3653#[cfg_attr(test, assert_instr(vfcmulcph))]
3654#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3655pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
3656    _mm512_cmul_pch(a, b)
3657}
3658
3659/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3660/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3661/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3662/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3663///
3664/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
3665#[inline]
3666#[target_feature(enable = "avx512fp16")]
3667#[cfg_attr(test, assert_instr(vfcmulcph))]
3668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3669pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3670    _mm512_mask_cmul_pch(src, k, a, b)
3671}
3672
3673/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3674/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3675/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3676/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3677///
3678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
3679#[inline]
3680#[target_feature(enable = "avx512fp16")]
3681#[cfg_attr(test, assert_instr(vfcmulcph))]
3682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3683pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3684    _mm512_maskz_cmul_pch(k, a, b)
3685}
3686
3687/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3688/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3689/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3690///
3691/// Rounding is done according to the rounding parameter, which can be one of:
3692///
3693/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3694/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3695/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3696/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3697/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3698///
3699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
3700#[inline]
3701#[target_feature(enable = "avx512fp16")]
3702#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3703#[rustc_legacy_const_generics(2)]
3704#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3705pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3706    static_assert_rounding!(ROUNDING);
3707    _mm512_cmul_round_pch::<ROUNDING>(a, b)
3708}
3709
3710/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3711/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3712/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3713/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3714///
3715/// Rounding is done according to the rounding parameter, which can be one of:
3716///
3717/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3718/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3719/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3720/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3721/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3722///
3723/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
3724#[inline]
3725#[target_feature(enable = "avx512fp16")]
3726#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3727#[rustc_legacy_const_generics(4)]
3728#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3729pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
3730    src: __m512h,
3731    k: __mmask16,
3732    a: __m512h,
3733    b: __m512h,
3734) -> __m512h {
3735    static_assert_rounding!(ROUNDING);
3736    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
3737}
3738
3739/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3740/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3741/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3742/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3743///
3744/// Rounding is done according to the rounding parameter, which can be one of:
3745///
3746/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3747/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3748/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3749/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3750/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3751///
3752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
3753#[inline]
3754#[target_feature(enable = "avx512fp16")]
3755#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3756#[rustc_legacy_const_generics(3)]
3757#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3758pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
3759    k: __mmask16,
3760    a: __m512h,
3761    b: __m512h,
3762) -> __m512h {
3763    static_assert_rounding!(ROUNDING);
3764    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
3765}
3766
3767/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3768/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3769/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3770/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3771///
3772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
3773#[inline]
3774#[target_feature(enable = "avx512fp16")]
3775#[cfg_attr(test, assert_instr(vfcmulcsh))]
3776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3777pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
3778    _mm_cmul_sch(a, b)
3779}
3780
3781/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3782/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3783/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3784/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3785///
3786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
3787#[inline]
3788#[target_feature(enable = "avx512fp16")]
3789#[cfg_attr(test, assert_instr(vfcmulcsh))]
3790#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3791pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3792    _mm_mask_cmul_sch(src, k, a, b)
3793}
3794
3795/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3796/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3797/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3798/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3799///
3800/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
3801#[inline]
3802#[target_feature(enable = "avx512fp16")]
3803#[cfg_attr(test, assert_instr(vfcmulcsh))]
3804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3805pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3806    _mm_maskz_cmul_sch(k, a, b)
3807}
3808
3809/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3810/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3811/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3812///
3813/// Rounding is done according to the rounding parameter, which can be one of:
3814///
3815/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3816/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3817/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3818/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3819/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3820///
3821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
3822#[inline]
3823#[target_feature(enable = "avx512fp16")]
3824#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3825#[rustc_legacy_const_generics(2)]
3826#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3827pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3828    static_assert_rounding!(ROUNDING);
3829    _mm_cmul_round_sch::<ROUNDING>(a, b)
3830}
3831
3832/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3833/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3834/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3835/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3836///
3837/// Rounding is done according to the rounding parameter, which can be one of:
3838///
3839/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3840/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3841/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3842/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3843/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3844///
3845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
3846#[inline]
3847#[target_feature(enable = "avx512fp16")]
3848#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3849#[rustc_legacy_const_generics(4)]
3850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3851pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
3852    src: __m128h,
3853    k: __mmask8,
3854    a: __m128h,
3855    b: __m128h,
3856) -> __m128h {
3857    static_assert_rounding!(ROUNDING);
3858    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
3859}
3860
3861/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3862/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3863/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3864/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3865///
3866/// Rounding is done according to the rounding parameter, which can be one of:
3867///
3868/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3869/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3870/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3871/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3872/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3873///
3874/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
3875#[inline]
3876#[target_feature(enable = "avx512fp16")]
3877#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3878#[rustc_legacy_const_generics(3)]
3879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3880pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
3881    k: __mmask8,
3882    a: __m128h,
3883    b: __m128h,
3884) -> __m128h {
3885    static_assert_rounding!(ROUNDING);
3886    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
3887}
3888
3889/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3890/// the results in dst.
3891///
3892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
3893#[inline]
3894#[target_feature(enable = "avx512fp16,avx512vl")]
3895#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3896pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
3897    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
3898}
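
// Illustrative sketch (not part of the original source): the absolute value is taken by clearing
// the `f16` sign bit, which is exactly what the `AND` with `i16::MAX` above does. Assumes the
// usual nightly/AVX512-FP16 setup.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn abs_ph_sketch() {
    let v = _mm_set1_ph(-1.5);
    let _r = _mm_abs_ph(v); // every element of `_r` is 1.5
}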
3899
3900/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3901/// the results in dst.
3902///
3903/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
3904#[inline]
3905#[target_feature(enable = "avx512fp16,avx512vl")]
3906#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3907pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
3908    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
3909}
3910
3911/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3912/// the results in dst.
3913///
3914/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
3915#[inline]
3916#[target_feature(enable = "avx512fp16")]
3917#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3918pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
3919    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
3920}
3921
3922/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
3923/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
3924/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
3925/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3926///
3927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
3928#[inline]
3929#[target_feature(enable = "avx512fp16,avx512vl")]
3930#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3931pub fn _mm_conj_pch(a: __m128h) -> __m128h {
3932    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
3933}
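
// Illustrative sketch (not part of the original source): conjugation flips the sign of the
// imaginary (odd-indexed) element of each pair, which is what the XOR with `i32::MIN` above
// does to the upper `f16` of every 32-bit lane. Assumes the usual nightly/AVX512-FP16 setup.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn conj_pch_sketch() {
    // The lowest complex pair is 3 + 4i; the remaining pairs are zero.
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
    let _r = _mm_conj_pch(a); // the lowest pair of `_r` holds (3.0, -4.0)
}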
3934
3935/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3936/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3937/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
3938/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3939///
3940/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
3941#[inline]
3942#[target_feature(enable = "avx512fp16,avx512vl")]
3943#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3944pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
3945    unsafe {
3946        let r: __m128 = transmute(_mm_conj_pch(a));
3947        transmute(simd_select_bitmask(k, r, transmute(src)))
3948    }
3949}
3950
3951/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3952/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
3953/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3954/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3955///
3956/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
3957#[inline]
3958#[target_feature(enable = "avx512fp16,avx512vl")]
3959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3960pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
3961    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
3962}
3963
3964/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
3965/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3966/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3967///
3968/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
3969#[inline]
3970#[target_feature(enable = "avx512fp16,avx512vl")]
3971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3972pub fn _mm256_conj_pch(a: __m256h) -> __m256h {
3973    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
3974}
3975
3976/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
3977/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
3978/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3979/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3980///
3981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
3982#[inline]
3983#[target_feature(enable = "avx512fp16,avx512vl")]
3984#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3985pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
3986    unsafe {
3987        let r: __m256 = transmute(_mm256_conj_pch(a));
3988        transmute(simd_select_bitmask(k, r, transmute(src)))
3989    }
3990}
3991
3992/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
3993/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
3994/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3995/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3996///
3997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
3998#[inline]
3999#[target_feature(enable = "avx512fp16,avx512vl")]
4000#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4001pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
4002    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
4003}
4004
4005/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4006/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4007/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4008///
4009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4010#[inline]
4011#[target_feature(enable = "avx512fp16")]
4012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4013pub fn _mm512_conj_pch(a: __m512h) -> __m512h {
4014    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4015}
4016
4017/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4018/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4019/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4020/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4021///
4022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4023#[inline]
4024#[target_feature(enable = "avx512fp16")]
4025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4026pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4027    unsafe {
4028        let r: __m512 = transmute(_mm512_conj_pch(a));
4029        transmute(simd_select_bitmask(k, r, transmute(src)))
4030    }
4031}
4032
4033/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4034/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4035/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4036/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4037///
4038/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
4039#[inline]
4040#[target_feature(enable = "avx512fp16")]
4041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4042pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
4043    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
4044}
4045
4046/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4047/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4048/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4049///
4050/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
4051#[inline]
4052#[target_feature(enable = "avx512fp16,avx512vl")]
4053#[cfg_attr(test, assert_instr(vfmaddcph))]
4054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4055pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4056    _mm_mask3_fmadd_pch(a, b, c, 0xff)
4057}
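
// Illustrative sketch (not part of the original source): each pair computes a*b + c with
// ordinary (non-conjugated) complex multiplication. With a = 1 + 2i, b = 3 + 4i, c = 5 + 6i:
// a*b = -5 + 10i, so a*b + c = 0 + 16i. Assumes the usual nightly/AVX512-FP16 setup.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmadd_pch_sketch() {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
    let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.0, 5.0);
    let _r = _mm_fmadd_pch(a, b, c); // the lowest pair of `_r` holds (0.0, 16.0)
}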
4058
4059/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4060/// and store the results in dst using writemask k (the element is copied from a when the corresponding
4061/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4062/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4063///
4064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
4065#[inline]
4066#[target_feature(enable = "avx512fp16,avx512vl")]
4067#[cfg_attr(test, assert_instr(vfmaddcph))]
4068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4069pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4070    unsafe {
4071        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4072        transmute(simd_select_bitmask(k, r, transmute(a)))
4073    }
4074}
4075
4076/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4077/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4078/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4079/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4080///
4081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
4082#[inline]
4083#[target_feature(enable = "avx512fp16,avx512vl")]
4084#[cfg_attr(test, assert_instr(vfmaddcph))]
4085#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4086pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4087    unsafe {
4088        transmute(vfmaddcph_mask3_128(
4089            transmute(a),
4090            transmute(b),
4091            transmute(c),
4092            k,
4093        ))
4094    }
4095}
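
// Illustrative sketch (not part of the original source): when a complex pair's mask bit is
// clear, `_mm_mask_fmadd_pch` keeps that pair from `a`, while `_mm_mask3_fmadd_pch` keeps it
// from `c`. Assumes the usual nightly/AVX512-FP16 setup.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmadd_pch_mask_sketch() {
    let a = _mm_set1_ph(1.0);
    let b = _mm_set1_ph(2.0);
    let c = _mm_set1_ph(3.0);
    // All mask bits clear: no pair is computed.
    let _from_a = _mm_mask_fmadd_pch(a, 0, b, c); // every element is 1.0 (copied from `a`)
    let _from_c = _mm_mask3_fmadd_pch(a, b, c, 0); // every element is 3.0 (copied from `c`)
}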
4096
4097/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4098/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4099/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4100/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4101///
4102/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
4103#[inline]
4104#[target_feature(enable = "avx512fp16,avx512vl")]
4105#[cfg_attr(test, assert_instr(vfmaddcph))]
4106#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4107pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4108    unsafe {
4109        transmute(vfmaddcph_maskz_128(
4110            transmute(a),
4111            transmute(b),
4112            transmute(c),
4113            k,
4114        ))
4115    }
4116}
4117
4118/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4119/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4120/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4121///
4122/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
4123#[inline]
4124#[target_feature(enable = "avx512fp16,avx512vl")]
4125#[cfg_attr(test, assert_instr(vfmaddcph))]
4126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4127pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4128    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
4129}
4130
4131/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4132/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4133/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4134/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4135///
4136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
4137#[inline]
4138#[target_feature(enable = "avx512fp16,avx512vl")]
4139#[cfg_attr(test, assert_instr(vfmaddcph))]
4140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4141pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4142    unsafe {
4143        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4144        transmute(simd_select_bitmask(k, r, transmute(a)))
4145    }
4146}
4147
4148/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4149/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4150/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4151/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4152///
4153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
4154#[inline]
4155#[target_feature(enable = "avx512fp16,avx512vl")]
4156#[cfg_attr(test, assert_instr(vfmaddcph))]
4157#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4158pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4159    unsafe {
4160        transmute(vfmaddcph_mask3_256(
4161            transmute(a),
4162            transmute(b),
4163            transmute(c),
4164            k,
4165        ))
4166    }
4167}
4168
4169/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4170/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4171/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4172/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4173///
4174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
4175#[inline]
4176#[target_feature(enable = "avx512fp16,avx512vl")]
4177#[cfg_attr(test, assert_instr(vfmaddcph))]
4178#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4179pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4180    unsafe {
4181        transmute(vfmaddcph_maskz_256(
4182            transmute(a),
4183            transmute(b),
4184            transmute(c),
4185            k,
4186        ))
4187    }
4188}
4189
4190/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4191/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4192/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4193///
4194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
4195#[inline]
4196#[target_feature(enable = "avx512fp16")]
4197#[cfg_attr(test, assert_instr(vfmaddcph))]
4198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4199pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4200    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4201}
4202
4203/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4204/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4205/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4206/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4207///
4208/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
4209#[inline]
4210#[target_feature(enable = "avx512fp16")]
4211#[cfg_attr(test, assert_instr(vfmaddcph))]
4212#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4213pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4214    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4215}
4216
4217/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4218/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4219/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4220/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4221///
4222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
4223#[inline]
4224#[target_feature(enable = "avx512fp16")]
4225#[cfg_attr(test, assert_instr(vfmaddcph))]
4226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4227pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4228    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4229}
4230
4231/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4232/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4233/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4234/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4235///
4236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
4237#[inline]
4238#[target_feature(enable = "avx512fp16")]
4239#[cfg_attr(test, assert_instr(vfmaddcph))]
4240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4241pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4242    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4243}
4244
4245/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4246/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4247/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4248///
4249/// Rounding is done according to the rounding parameter, which can be one of:
4250///
4251/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4252/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4253/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4254/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4255/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4256///
4257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
4258#[inline]
4259#[target_feature(enable = "avx512fp16")]
4260#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4261#[rustc_legacy_const_generics(3)]
4262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4263pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4264    static_assert_rounding!(ROUNDING);
4265    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4266}
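
// Illustrative sketch (not part of the original source): supplying an explicit rounding mode to
// the complex multiply-add. With every pair of a = 1 + 1i, b = 2 + 2i and c = 3 + 3i, the result
// is a*b + c = 3 + 7i. Assumes the usual nightly/AVX512-FP16 setup and the `_mm512_set1_ph`
// helper from this module.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fmadd_round_pch_sketch() {
    let a = _mm512_set1_ph(1.0);
    let b = _mm512_set1_ph(2.0);
    let c = _mm512_set1_ph(3.0);
    let _r = _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
    // every pair of `_r` holds 3 + 7i
}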
4267
4268/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4269/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4270/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4271/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4272///
4273/// Rounding is done according to the rounding parameter, which can be one of:
4274///
4275/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4276/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4277/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4278/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4279/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4280///
4281/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
4282#[inline]
4283#[target_feature(enable = "avx512fp16")]
4284#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4285#[rustc_legacy_const_generics(4)]
4286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4287pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
4288    a: __m512h,
4289    k: __mmask16,
4290    b: __m512h,
4291    c: __m512h,
4292) -> __m512h {
4293    unsafe {
4294        static_assert_rounding!(ROUNDING);
4295        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4296        transmute(simd_select_bitmask(k, r, transmute(a)))
4297    }
4298}
4299
4300/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4301/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4302/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4303/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4304///
4305/// Rounding is done according to the rounding parameter, which can be one of:
4306///
4307/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4308/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4309/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4310/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4311/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4312///
4313/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
4314#[inline]
4315#[target_feature(enable = "avx512fp16")]
4316#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4317#[rustc_legacy_const_generics(4)]
4318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4319pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
4320    a: __m512h,
4321    b: __m512h,
4322    c: __m512h,
4323    k: __mmask16,
4324) -> __m512h {
4325    unsafe {
4326        static_assert_rounding!(ROUNDING);
4327        transmute(vfmaddcph_mask3_512(
4328            transmute(a),
4329            transmute(b),
4330            transmute(c),
4331            k,
4332            ROUNDING,
4333        ))
4334    }
4335}
4336
4337/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4338/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4339/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4340/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4341///
4342/// Rounding is done according to the rounding parameter, which can be one of:
4343///
4344/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4345/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4346/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4347/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4348/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4349///
4350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
4351#[inline]
4352#[target_feature(enable = "avx512fp16")]
4353#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4354#[rustc_legacy_const_generics(4)]
4355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4356pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
4357    k: __mmask16,
4358    a: __m512h,
4359    b: __m512h,
4360    c: __m512h,
4361) -> __m512h {
4362    unsafe {
4363        static_assert_rounding!(ROUNDING);
4364        transmute(vfmaddcph_maskz_512(
4365            transmute(a),
4366            transmute(b),
4367            transmute(c),
4368            k,
4369            ROUNDING,
4370        ))
4371    }
4372}
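
// Editorial sketch of the three masking flavours above (assumed helper, not upstream code):
// with identical inputs, only the value written to the unselected complex lanes differs.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn masked_fmadd_round_pch_example(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> [__m512h; 3] {
    const R: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    [
        // unselected complex lanes keep the value from `a`
        _mm512_mask_fmadd_round_pch::<{ R }>(a, k, b, c),
        // unselected complex lanes keep the value from `c`
        _mm512_mask3_fmadd_round_pch::<{ R }>(a, b, c, k),
        // unselected complex lanes are zeroed
        _mm512_maskz_fmadd_round_pch::<{ R }>(k, a, b, c),
    ]
}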
4373
4374/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4375/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4376/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
4377/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4378///
4379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
4380#[inline]
4381#[target_feature(enable = "avx512fp16")]
4382#[cfg_attr(test, assert_instr(vfmaddcsh))]
4383#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4384pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4385    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4386}
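
// A small worked example (editorial sketch; the helper name and constants are made up):
// with a = 1 + 2i, b = 3 + 4i and c = 0, the lower complex lane of the result is
// (1 + 2i) * (3 + 4i) = -5 + 10i, i.e. dst.fp16[0] == -5.0 and dst.fp16[1] == 10.0,
// while the upper six f16 lanes are copied from `a`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmadd_sch_example() -> __m128h {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // a = 1 + 2i in the low lane
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // b = 3 + 4i in the low lane
    let c = _mm_setzero_ph();
    _mm_fmadd_sch(a, b, c)
}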
4387
4388/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4389/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4390/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4391/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4392/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4393///
4394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
4395#[inline]
4396#[target_feature(enable = "avx512fp16")]
4397#[cfg_attr(test, assert_instr(vfmaddcsh))]
4398#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4399pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4400    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4401}
4402
4403/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4404/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4405/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4406/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4407/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4408///
4409/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
4410#[inline]
4411#[target_feature(enable = "avx512fp16")]
4412#[cfg_attr(test, assert_instr(vfmaddcsh))]
4413#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4414pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4415    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4416}
4417
4418/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4419/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4420/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4421/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4422/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4423///
4424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
4425#[inline]
4426#[target_feature(enable = "avx512fp16")]
4427#[cfg_attr(test, assert_instr(vfmaddcsh))]
4428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4429pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4430    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4431}
4432
4433/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4434/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4435/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4436///
4437/// Rounding is done according to the rounding parameter, which can be one of:
4438///
4439/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4440/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4441/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4442/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4443/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4444///
4445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
4446#[inline]
4447#[target_feature(enable = "avx512fp16")]
4448#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4449#[rustc_legacy_const_generics(3)]
4450#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4451pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4452    unsafe {
4453        static_assert_rounding!(ROUNDING);
4454        transmute(vfmaddcsh_mask(
4455            transmute(a),
4456            transmute(b),
4457            transmute(c),
4458            0xff,
4459            ROUNDING,
4460        ))
4461    }
4462}
4463
4464/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4465/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4466/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4467/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4468/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4469///
4470/// Rounding is done according to the rounding parameter, which can be one of:
4471///
4472/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4473/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4474/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4475/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4476/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4477///
4478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
4479#[inline]
4480#[target_feature(enable = "avx512fp16")]
4481#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4482#[rustc_legacy_const_generics(4)]
4483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4484pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
4485    a: __m128h,
4486    k: __mmask8,
4487    b: __m128h,
4488    c: __m128h,
4489) -> __m128h {
4490    unsafe {
4491        static_assert_rounding!(ROUNDING);
4492        let a = transmute(a);
4493        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
4494        transmute(_mm_mask_move_ss(a, k, a, r))
4495    }
4496}
4497
4498/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4499/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4500/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4501/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4502/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4503///
4504/// Rounding is done according to the rounding parameter, which can be one of:
4505///
4506/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4507/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4508/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4509/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4510/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4511///
4512/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
4513#[inline]
4514#[target_feature(enable = "avx512fp16")]
4515#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4516#[rustc_legacy_const_generics(4)]
4517#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4518pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
4519    a: __m128h,
4520    b: __m128h,
4521    c: __m128h,
4522    k: __mmask8,
4523) -> __m128h {
4524    unsafe {
4525        static_assert_rounding!(ROUNDING);
4526        let c = transmute(c);
4527        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
4528        transmute(_mm_move_ss(c, r))
4529    }
4530}
4531
4532/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4533/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4534/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4535/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4536/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4537///
4538/// Rounding is done according to the rounding parameter, which can be one of:
4539///
4540/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4541/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4542/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4543/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4544/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4545///
4546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
4547#[inline]
4548#[target_feature(enable = "avx512fp16")]
4549#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4550#[rustc_legacy_const_generics(4)]
4551#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4552pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
4553    k: __mmask8,
4554    a: __m128h,
4555    b: __m128h,
4556    c: __m128h,
4557) -> __m128h {
4558    unsafe {
4559        static_assert_rounding!(ROUNDING);
4560        let a = transmute(a);
4561        let r = vfmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
4562        transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimizes `vfmaddcsh_maskz` to output an all-zero vector, which is incorrect
4563    }
4564}
4565
4566/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4567/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4568/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4569/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4570///
4571/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
4572#[inline]
4573#[target_feature(enable = "avx512fp16,avx512vl")]
4574#[cfg_attr(test, assert_instr(vfcmaddcph))]
4575#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4576pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4577    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
4578}
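
// Editorial sketch (assumed helper): because this intrinsic multiplies `a` by the complex
// conjugate of `b`, passing the same vector as both `a` and `b` accumulates the squared
// magnitude |a|^2 of every complex lane into `acc` (the imaginary parts cancel exactly).
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn squared_magnitudes_example(a: __m128h, acc: __m128h) -> __m128h {
    // Each even (real) lane of the result is re^2 + im^2 + acc.re; each odd lane stays acc.im.
    _mm_fcmadd_pch(a, a, acc)
}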
4579
4580/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4581/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4582/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4583/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4584/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4585///
4586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
4587#[inline]
4588#[target_feature(enable = "avx512fp16,avx512vl")]
4589#[cfg_attr(test, assert_instr(vfcmaddcph))]
4590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4591pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4592    unsafe {
4593        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4594        transmute(simd_select_bitmask(k, r, transmute(a)))
4595    }
4596}
4597
4598/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4599/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4600/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4601/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4602/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4603///
4604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
4605#[inline]
4606#[target_feature(enable = "avx512fp16,avx512vl")]
4607#[cfg_attr(test, assert_instr(vfcmaddcph))]
4608#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4609pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4610    unsafe {
4611        transmute(vfcmaddcph_mask3_128(
4612            transmute(a),
4613            transmute(b),
4614            transmute(c),
4615            k,
4616        ))
4617    }
4618}
4619
4620/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4621/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4622/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4623/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4624/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4625///
4626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
4627#[inline]
4628#[target_feature(enable = "avx512fp16,avx512vl")]
4629#[cfg_attr(test, assert_instr(vfcmaddcph))]
4630#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4631pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4632    unsafe {
4633        transmute(vfcmaddcph_maskz_128(
4634            transmute(a),
4635            transmute(b),
4636            transmute(c),
4637            k,
4638        ))
4639    }
4640}
4641
4642/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4643/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4644/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4645/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4646///
4647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
4648#[inline]
4649#[target_feature(enable = "avx512fp16,avx512vl")]
4650#[cfg_attr(test, assert_instr(vfcmaddcph))]
4651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4652pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4653    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
4654}
4655
4656/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4657/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4658/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4659/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4660/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4661///
4662/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4663#[inline]
4664#[target_feature(enable = "avx512fp16,avx512vl")]
4665#[cfg_attr(test, assert_instr(vfcmaddcph))]
4666#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4667pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4668    unsafe {
4669        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4670        transmute(simd_select_bitmask(k, r, transmute(a)))
4671    }
4672}
4673
4674/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4675/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4676/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4677/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4678/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4679///
4680/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4681#[inline]
4682#[target_feature(enable = "avx512fp16,avx512vl")]
4683#[cfg_attr(test, assert_instr(vfcmaddcph))]
4684#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4685pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4686    unsafe {
4687        transmute(vfcmaddcph_mask3_256(
4688            transmute(a),
4689            transmute(b),
4690            transmute(c),
4691            k,
4692        ))
4693    }
4694}
4695
4696/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4697/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4698/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4699/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4700/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4701///
4702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4703#[inline]
4704#[target_feature(enable = "avx512fp16,avx512vl")]
4705#[cfg_attr(test, assert_instr(vfcmaddcph))]
4706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4707pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4708    unsafe {
4709        transmute(vfcmaddcph_maskz_256(
4710            transmute(a),
4711            transmute(b),
4712            transmute(c),
4713            k,
4714        ))
4715    }
4716}
4717
4718/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4719/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4720/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4721/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4722///
4723/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
4724#[inline]
4725#[target_feature(enable = "avx512fp16")]
4726#[cfg_attr(test, assert_instr(vfcmaddcph))]
4727#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4728pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4729    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4730}
4731
4732/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4733/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4734/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4735/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4736/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4737///
4738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
4739#[inline]
4740#[target_feature(enable = "avx512fp16")]
4741#[cfg_attr(test, assert_instr(vfcmaddcph))]
4742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4743pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4744    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4745}
4746
4747/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4748/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4749/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4750/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4751/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4752///
4753/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
4754#[inline]
4755#[target_feature(enable = "avx512fp16")]
4756#[cfg_attr(test, assert_instr(vfcmaddcph))]
4757#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4758pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4759    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4760}
4761
4762/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4763/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4764/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4765/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4766/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4767///
4768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
4769#[inline]
4770#[target_feature(enable = "avx512fp16")]
4771#[cfg_attr(test, assert_instr(vfcmaddcph))]
4772#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4773pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4774    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4775}
4776
4777/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4778/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4779/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4780/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4781///
4782/// Rounding is done according to the rounding parameter, which can be one of:
4783///
4784/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4785/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4786/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4787/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4788/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4789///
4790/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
4791#[inline]
4792#[target_feature(enable = "avx512fp16")]
4793#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4794#[rustc_legacy_const_generics(3)]
4795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4796pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4797    static_assert_rounding!(ROUNDING);
4798    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4799}
4800
4801/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4802/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4803/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4804/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4805/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4806///
4807/// Rounding is done according to the rounding parameter, which can be one of:
4808///
4809/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4810/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4811/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4812/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4813/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4814///
4815/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4816#[inline]
4817#[target_feature(enable = "avx512fp16")]
4818#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4819#[rustc_legacy_const_generics(4)]
4820#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4821pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4822    a: __m512h,
4823    k: __mmask16,
4824    b: __m512h,
4825    c: __m512h,
4826) -> __m512h {
4827    unsafe {
4828        static_assert_rounding!(ROUNDING);
4829        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4830        transmute(simd_select_bitmask(k, r, transmute(a)))
4831    }
4832}
4833
4834/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4835/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4836/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4837/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4838/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4839///
4840/// Rounding is done according to the rounding parameter, which can be one of:
4841///
4842/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4843/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4844/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4845/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4846/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4847///
4848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4849#[inline]
4850#[target_feature(enable = "avx512fp16")]
4851#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4852#[rustc_legacy_const_generics(4)]
4853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4854pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4855    a: __m512h,
4856    b: __m512h,
4857    c: __m512h,
4858    k: __mmask16,
4859) -> __m512h {
4860    unsafe {
4861        static_assert_rounding!(ROUNDING);
4862        transmute(vfcmaddcph_mask3_512(
4863            transmute(a),
4864            transmute(b),
4865            transmute(c),
4866            k,
4867            ROUNDING,
4868        ))
4869    }
4870}
4871
4872/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4873/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4874/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4875/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4876/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4877///
4878/// Rounding is done according to the rounding parameter, which can be one of:
4879///
4880/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4881/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4882/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4883/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4884/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4885///
4886/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
4887#[inline]
4888#[target_feature(enable = "avx512fp16")]
4889#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4890#[rustc_legacy_const_generics(4)]
4891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4892pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
4893    k: __mmask16,
4894    a: __m512h,
4895    b: __m512h,
4896    c: __m512h,
4897) -> __m512h {
4898    unsafe {
4899        static_assert_rounding!(ROUNDING);
4900        transmute(vfcmaddcph_maskz_512(
4901            transmute(a),
4902            transmute(b),
4903            transmute(c),
4904            k,
4905            ROUNDING,
4906        ))
4907    }
4908}
4909
4910/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4911/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4912/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4913/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4914/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4915///
4916/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
4917#[inline]
4918#[target_feature(enable = "avx512fp16")]
4919#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4921pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4922    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4923}
4924
4925/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4926/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4927/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
4928/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4929/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4930/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4931///
4932/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
4933#[inline]
4934#[target_feature(enable = "avx512fp16")]
4935#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4937pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4938    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4939}
4940
4941/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4942/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4943/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
4944/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4945/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4946/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4947///
4948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
4949#[inline]
4950#[target_feature(enable = "avx512fp16")]
4951#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4952#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4953pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4954    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4955}
4956
4957/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4958/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
4959/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
4960/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
4961/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4962/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4963///
4964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
4965#[inline]
4966#[target_feature(enable = "avx512fp16")]
4967#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4968#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4969pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4970    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4971}
4972
4973/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4974/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4975/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4976/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4977/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4978///
4979/// Rounding is done according to the rounding parameter, which can be one of:
4980///
4981/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4982/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4983/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4984/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4985/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4986///
4987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
4988#[inline]
4989#[target_feature(enable = "avx512fp16")]
4990#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
4991#[rustc_legacy_const_generics(3)]
4992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4993pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4994    unsafe {
4995        static_assert_rounding!(ROUNDING);
4996        transmute(vfcmaddcsh_mask(
4997            transmute(a),
4998            transmute(b),
4999            transmute(c),
5000            0xff,
5001            ROUNDING,
5002        ))
5003    }
5004}
5005
5006/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5007/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5008/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5009/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5010/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5011/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5012///
5013/// Rounding is done according to the rounding parameter, which can be one of:
5014///
5015/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5016/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5017/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5018/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5019/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5020///
5021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5022#[inline]
5023#[target_feature(enable = "avx512fp16")]
5024#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5025#[rustc_legacy_const_generics(4)]
5026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5027pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5028    a: __m128h,
5029    k: __mmask8,
5030    b: __m128h,
5031    c: __m128h,
5032) -> __m128h {
5033    unsafe {
5034        static_assert_rounding!(ROUNDING);
5035        let a = transmute(a);
5036        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5037        transmute(_mm_mask_move_ss(a, k, a, r))
5038    }
5039}
5040
5041/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5042/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5043/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5044/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5045/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5046/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5047///
5048/// Rounding is done according to the rounding parameter, which can be one of:
5049///
5050/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5051/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5052/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5053/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5054/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5055///
5056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
5057#[inline]
5058#[target_feature(enable = "avx512fp16")]
5059#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5060#[rustc_legacy_const_generics(4)]
5061#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5062pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
5063    a: __m128h,
5064    b: __m128h,
5065    c: __m128h,
5066    k: __mmask8,
5067) -> __m128h {
5068    unsafe {
5069        static_assert_rounding!(ROUNDING);
5070        let c = transmute(c);
5071        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
5072        transmute(_mm_move_ss(c, r))
5073    }
5074}
5075
5076/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5077/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5078/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
5079/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5080/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5081/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5082///
5083/// Rounding is done according to the rounding parameter, which can be one of:
5084///
5085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5090///
5091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5092#[inline]
5093#[target_feature(enable = "avx512fp16")]
5094#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5095#[rustc_legacy_const_generics(4)]
5096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5097pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5098    k: __mmask8,
5099    a: __m128h,
5100    b: __m128h,
5101    c: __m128h,
5102) -> __m128h {
5103    unsafe {
5104        static_assert_rounding!(ROUNDING);
5105        let a = transmute(a);
5106        let r = vfcmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
5107        transmute(_mm_move_ss(a, r)) // FIXME: If `k == 0`, then LLVM optimizes `vfcmaddcsh_maskz` to output an all-zero vector, which is incorrect
5108    }
5109}
5110
5111/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5112/// result to packed elements in c, and store the results in dst.
5113///
5114/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
5115#[inline]
5116#[target_feature(enable = "avx512fp16,avx512vl")]
5117#[cfg_attr(test, assert_instr(vfmadd))]
5118#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5119pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5120    unsafe { simd_fma(a, b, c) }
5121}
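
// Editorial sketch (assumed helper and values): the plain `_ph` form is a lane-wise fused
// multiply-add over the eight f16 elements, with no complex-number pairing involved.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmadd_ph_example() -> __m128h {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    // Every lane becomes 2.0 * 3.0 + 1.0 = 7.0, computed with a single rounding.
    _mm_fmadd_ph(a, b, c)
}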
5122
5123/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5124/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5125/// from a when the corresponding mask bit is not set).
5126///
5127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
5128#[inline]
5129#[target_feature(enable = "avx512fp16,avx512vl")]
5130#[cfg_attr(test, assert_instr(vfmadd))]
5131#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5132pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5133    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5134}
5135
5136/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5137/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5138/// from c when the corresponding mask bit is not set).
5139///
5140/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5141#[inline]
5142#[target_feature(enable = "avx512fp16,avx512vl")]
5143#[cfg_attr(test, assert_instr(vfmadd))]
5144#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5145pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5146    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5147}
5148
5149/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5150/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5151/// out when the corresponding mask bit is not set).
5152///
5153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
5154#[inline]
5155#[target_feature(enable = "avx512fp16,avx512vl")]
5156#[cfg_attr(test, assert_instr(vfmadd))]
5157#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5158pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5159    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5160}
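
// Editorial sketch (assumed helper): the same lane-selection rules as for the complex variants
// apply to the element-wise forms above; only the fallback value for unselected lanes differs.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn masked_fmadd_ph_example(a: __m128h, b: __m128h, c: __m128h) -> [__m128h; 3] {
    let k: __mmask8 = 0b0000_1111; // select only the low four f16 lanes
    [
        _mm_mask_fmadd_ph(a, k, b, c),  // upper four lanes keep `a`
        _mm_mask3_fmadd_ph(a, b, c, k), // upper four lanes keep `c`
        _mm_maskz_fmadd_ph(k, a, b, c), // upper four lanes are zeroed
    ]
}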
5161
5162/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5163/// result to packed elements in c, and store the results in dst.
5164///
5165/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5166#[inline]
5167#[target_feature(enable = "avx512fp16,avx512vl")]
5168#[cfg_attr(test, assert_instr(vfmadd))]
5169#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5170pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5171    unsafe { simd_fma(a, b, c) }
5172}
5173
5174/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5175/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5176/// from a when the corresponding mask bit is not set).
5177///
5178/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5179#[inline]
5180#[target_feature(enable = "avx512fp16,avx512vl")]
5181#[cfg_attr(test, assert_instr(vfmadd))]
5182#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5183pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5184    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5185}
5186
5187/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5188/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5189/// from c when the corresponding mask bit is not set).
5190///
5191/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5192#[inline]
5193#[target_feature(enable = "avx512fp16,avx512vl")]
5194#[cfg_attr(test, assert_instr(vfmadd))]
5195#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5196pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5197    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5198}
5199
5200/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5201/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5202/// out when the corresponding mask bit is not set).
5203///
5204/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5205#[inline]
5206#[target_feature(enable = "avx512fp16,avx512vl")]
5207#[cfg_attr(test, assert_instr(vfmadd))]
5208#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5209pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5210    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5211}
5212
5213/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5214/// result to packed elements in c, and store the results in dst.
5215///
5216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5217#[inline]
5218#[target_feature(enable = "avx512fp16")]
5219#[cfg_attr(test, assert_instr(vfmadd))]
5220#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5221pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5222    unsafe { simd_fma(a, b, c) }
5223}
5224
5225/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5226/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5227/// from a when the corresponding mask bit is not set).
5228///
5229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5230#[inline]
5231#[target_feature(enable = "avx512fp16")]
5232#[cfg_attr(test, assert_instr(vfmadd))]
5233#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5234pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5235    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5236}
5237
5238/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5239/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5240/// from c when the corresponding mask bit is not set).
5241///
5242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5243#[inline]
5244#[target_feature(enable = "avx512fp16")]
5245#[cfg_attr(test, assert_instr(vfmadd))]
5246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5247pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5248    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5249}
5250
5251/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5252/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5253/// out when the corresponding mask bit is not set).
5254///
5255/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5256#[inline]
5257#[target_feature(enable = "avx512fp16")]
5258#[cfg_attr(test, assert_instr(vfmadd))]
5259#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5260pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5261    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5262}
5263
5264/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5265/// result to packed elements in c, and store the results in dst.
5266///
5267/// Rounding is done according to the rounding parameter, which can be one of:
5268///
5269/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5270/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5271/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5272/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5273/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5274///
5275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
5276#[inline]
5277#[target_feature(enable = "avx512fp16")]
5278#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5279#[rustc_legacy_const_generics(3)]
5280#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5281pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5282    unsafe {
5283        static_assert_rounding!(ROUNDING);
5284        vfmaddph_512(a, b, c, ROUNDING)
5285    }
5286}
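
// Editorial sketch (assumed helper): a directed-rounding call; `_MM_FROUND_TO_ZERO` must be
// combined with `_MM_FROUND_NO_EXC`, mirroring the list of valid modes in the doc comment above.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fmadd_round_ph_truncating(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Each of the 32 f16 lanes is computed as a * b + c and then truncated toward zero.
    _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}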
5287
5288/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5289/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5290/// from a when the corresponding mask bit is not set).
5291///
5292/// Rounding is done according to the rounding parameter, which can be one of:
5293///
5294/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5295/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5296/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5297/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5298/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5299///
5300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5301#[inline]
5302#[target_feature(enable = "avx512fp16")]
5303#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5304#[rustc_legacy_const_generics(4)]
5305#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5306pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5307    a: __m512h,
5308    k: __mmask32,
5309    b: __m512h,
5310    c: __m512h,
5311) -> __m512h {
5312    unsafe {
5313        static_assert_rounding!(ROUNDING);
5314        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5315    }
5316}
5317
5318/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5319/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5320/// from c when the corresponding mask bit is not set).
5321///
5322/// Rounding is done according to the rounding parameter, which can be one of:
5323///
5324/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5325/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5326/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5327/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5328/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5329///
5330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5331#[inline]
5332#[target_feature(enable = "avx512fp16")]
5333#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5334#[rustc_legacy_const_generics(4)]
5335#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5336pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5337    a: __m512h,
5338    b: __m512h,
5339    c: __m512h,
5340    k: __mmask32,
5341) -> __m512h {
5342    unsafe {
5343        static_assert_rounding!(ROUNDING);
5344        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5345    }
5346}
5347
5348/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5349/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5350/// out when the corresponding mask bit is not set).
5351///
5352/// Rounding is done according to the rounding parameter, which can be one of:
5353///
5354/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5355/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5356/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5357/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5358/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5359///
5360/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5361#[inline]
5362#[target_feature(enable = "avx512fp16")]
5363#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5364#[rustc_legacy_const_generics(4)]
5365#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5366pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5367    k: __mmask32,
5368    a: __m512h,
5369    b: __m512h,
5370    c: __m512h,
5371) -> __m512h {
5372    unsafe {
5373        static_assert_rounding!(ROUNDING);
5374        simd_select_bitmask(
5375            k,
5376            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5377            _mm512_setzero_ph(),
5378        )
5379    }
5380}
5381
5382/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5383/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5384/// 7 packed elements from a to the upper elements of dst.
5385///
5386/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
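///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` target feature is
/// available):
///
/// ```ignore
/// let a = _mm_set_ph(9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Lane 0 becomes 2.0 * 3.0 + 1.0 = 7.0; lanes 1..=7 are copied from `a`.
/// let r = _mm_fmadd_sh(a, b, c);
/// ```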
5387#[inline]
5388#[target_feature(enable = "avx512fp16")]
5389#[cfg_attr(test, assert_instr(vfmadd))]
5390#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5391pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5392    unsafe {
5393        let extracta: f16 = simd_extract!(a, 0);
5394        let extractb: f16 = simd_extract!(b, 0);
5395        let extractc: f16 = simd_extract!(c, 0);
5396        let r = fmaf16(extracta, extractb, extractc);
5397        simd_insert!(a, 0, r)
5398    }
5399}
5400
5401/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5402/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5403/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5404/// upper elements of dst.
5405///
5406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
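///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` target feature is
/// available):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Mask bit 0 is clear, so lane 0 keeps the value from `a` (2.0) instead of 2.0 * 3.0 + 1.0.
/// let r = _mm_mask_fmadd_sh(a, 0b0, b, c);
/// ```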
5407#[inline]
5408#[target_feature(enable = "avx512fp16")]
5409#[cfg_attr(test, assert_instr(vfmadd))]
5410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5411pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5412    unsafe {
5413        let mut fmadd: f16 = simd_extract!(a, 0);
5414        if k & 1 != 0 {
5415            let extractb: f16 = simd_extract!(b, 0);
5416            let extractc: f16 = simd_extract!(c, 0);
5417            fmadd = fmaf16(fmadd, extractb, extractc);
5418        }
5419        simd_insert!(a, 0, fmadd)
5420    }
5421}
5422
5423/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5424/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5425/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5426/// upper elements of dst.
5427///
5428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5429#[inline]
5430#[target_feature(enable = "avx512fp16")]
5431#[cfg_attr(test, assert_instr(vfmadd))]
5432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5433pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5434    unsafe {
5435        let mut fmadd: f16 = simd_extract!(c, 0);
5436        if k & 1 != 0 {
5437            let extracta: f16 = simd_extract!(a, 0);
5438            let extractb: f16 = simd_extract!(b, 0);
5439            fmadd = fmaf16(extracta, extractb, fmadd);
5440        }
5441        simd_insert!(c, 0, fmadd)
5442    }
5443}
5444
5445/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5446/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5447/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5448/// upper elements of dst.
5449///
5450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5451#[inline]
5452#[target_feature(enable = "avx512fp16")]
5453#[cfg_attr(test, assert_instr(vfmadd))]
5454#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5455pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5456    unsafe {
5457        let mut fmadd: f16 = 0.0;
5458        if k & 1 != 0 {
5459            let extracta: f16 = simd_extract!(a, 0);
5460            let extractb: f16 = simd_extract!(b, 0);
5461            let extractc: f16 = simd_extract!(c, 0);
5462            fmadd = fmaf16(extracta, extractb, extractc);
5463        }
5464        simd_insert!(a, 0, fmadd)
5465    }
5466}
5467
5468/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5469/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5470/// 7 packed elements from a to the upper elements of dst.
5471///
5472/// Rounding is done according to the rounding parameter, which can be one of:
5473///
5474/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5475/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5476/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5477/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5478/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5479///
5480/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
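///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` target feature is
/// available). The rounding mode is supplied as a const generic argument:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Lane 0 becomes 2.0 * 3.0 + 1.0 = 7.0, truncating (round toward zero) with exceptions suppressed.
/// let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```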
5481#[inline]
5482#[target_feature(enable = "avx512fp16")]
5483#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5484#[rustc_legacy_const_generics(3)]
5485#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5486pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5487    unsafe {
5488        static_assert_rounding!(ROUNDING);
5489        let extracta: f16 = simd_extract!(a, 0);
5490        let extractb: f16 = simd_extract!(b, 0);
5491        let extractc: f16 = simd_extract!(c, 0);
5492        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5493        simd_insert!(a, 0, r)
5494    }
5495}
5496
5497/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5498/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5499/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5500/// upper elements of dst.
5501///
5502/// Rounding is done according to the rounding parameter, which can be one of:
5503///
5504/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5505/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5506/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5507/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5508/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5509///
5510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5511#[inline]
5512#[target_feature(enable = "avx512fp16")]
5513#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5514#[rustc_legacy_const_generics(4)]
5515#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5516pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5517    a: __m128h,
5518    k: __mmask8,
5519    b: __m128h,
5520    c: __m128h,
5521) -> __m128h {
5522    unsafe {
5523        static_assert_rounding!(ROUNDING);
5524        let mut fmadd: f16 = simd_extract!(a, 0);
5525        if k & 1 != 0 {
5526            let extractb: f16 = simd_extract!(b, 0);
5527            let extractc: f16 = simd_extract!(c, 0);
5528            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5529        }
5530        simd_insert!(a, 0, fmadd)
5531    }
5532}
5533
5534/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5535/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5536/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5537/// upper elements of dst.
5538///
5539/// Rounding is done according to the rounding parameter, which can be one of:
5540///
5541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5546///
5547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5548#[inline]
5549#[target_feature(enable = "avx512fp16")]
5550#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5551#[rustc_legacy_const_generics(4)]
5552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5553pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5554    a: __m128h,
5555    b: __m128h,
5556    c: __m128h,
5557    k: __mmask8,
5558) -> __m128h {
5559    unsafe {
5560        static_assert_rounding!(ROUNDING);
5561        let mut fmadd: f16 = simd_extract!(c, 0);
5562        if k & 1 != 0 {
5563            let extracta: f16 = simd_extract!(a, 0);
5564            let extractb: f16 = simd_extract!(b, 0);
5565            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5566        }
5567        simd_insert!(c, 0, fmadd)
5568    }
5569}
5570
5571/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5572/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5573/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5574/// upper elements of dst.
5575///
5576/// Rounding is done according to the rounding parameter, which can be one of:
5577///
5578/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5579/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5580/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5581/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5582/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5583///
5584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5585#[inline]
5586#[target_feature(enable = "avx512fp16")]
5587#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5588#[rustc_legacy_const_generics(4)]
5589#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5590pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5591    k: __mmask8,
5592    a: __m128h,
5593    b: __m128h,
5594    c: __m128h,
5595) -> __m128h {
5596    unsafe {
5597        static_assert_rounding!(ROUNDING);
5598        let mut fmadd: f16 = 0.0;
5599        if k & 1 != 0 {
5600            let extracta: f16 = simd_extract!(a, 0);
5601            let extractb: f16 = simd_extract!(b, 0);
5602            let extractc: f16 = simd_extract!(c, 0);
5603            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5604        }
5605        simd_insert!(a, 0, fmadd)
5606    }
5607}
5608
5609/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5610/// in c from the intermediate result, and store the results in dst.
5612///
5613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
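///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` and `avx512vl` target
/// features are available):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every lane becomes a * b - c = 5.0.
/// let r = _mm_fmsub_ph(a, b, c);
/// ```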
5614#[inline]
5615#[target_feature(enable = "avx512fp16,avx512vl")]
5616#[cfg_attr(test, assert_instr(vfmsub))]
5617#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5618pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5619    unsafe { simd_fma(a, b, simd_neg(c)) }
5620}
5621
5622/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5623/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5624/// from a when the corresponding mask bit is not set).
5625///
5626/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5627#[inline]
5628#[target_feature(enable = "avx512fp16,avx512vl")]
5629#[cfg_attr(test, assert_instr(vfmsub))]
5630#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5631pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5632    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5633}
5634
5635/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5636/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5637/// from c when the corresponding mask bit is not set).
5638///
5639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5640#[inline]
5641#[target_feature(enable = "avx512fp16,avx512vl")]
5642#[cfg_attr(test, assert_instr(vfmsub))]
5643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5644pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5645    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5646}
5647
5648/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5649/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5650/// out when the corresponding mask bit is not set).
5651///
5652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5653#[inline]
5654#[target_feature(enable = "avx512fp16,avx512vl")]
5655#[cfg_attr(test, assert_instr(vfmsub))]
5656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5657pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5658    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5659}
5660
5661/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5662/// in c from the intermediate result, and store the results in dst.
5663///
5664/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5665#[inline]
5666#[target_feature(enable = "avx512fp16,avx512vl")]
5667#[cfg_attr(test, assert_instr(vfmsub))]
5668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5669pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5670    unsafe { simd_fma(a, b, simd_neg(c)) }
5671}
5672
5673/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5674/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5675/// from a when the corresponding mask bit is not set).
5676///
5677/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5678#[inline]
5679#[target_feature(enable = "avx512fp16,avx512vl")]
5680#[cfg_attr(test, assert_instr(vfmsub))]
5681#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5682pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5683    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5684}
5685
5686/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5687/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5688/// from c when the corresponding mask bit is not set).
5689///
5690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5691#[inline]
5692#[target_feature(enable = "avx512fp16,avx512vl")]
5693#[cfg_attr(test, assert_instr(vfmsub))]
5694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5695pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5696    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5697}
5698
5699/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5700/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5701/// out when the corresponding mask bit is not set).
5702///
5703/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
5704#[inline]
5705#[target_feature(enable = "avx512fp16,avx512vl")]
5706#[cfg_attr(test, assert_instr(vfmsub))]
5707#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5708pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5709    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
5710}
5711
5712/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5713/// in c from the intermediate result, and store the results in dst.
5714///
5715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
5716#[inline]
5717#[target_feature(enable = "avx512fp16")]
5718#[cfg_attr(test, assert_instr(vfmsub))]
5719#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5720pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5721    unsafe { simd_fma(a, b, simd_neg(c)) }
5722}
5723
5724/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5725/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5726/// from a when the corresponding mask bit is not set).
5727///
5728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
5729#[inline]
5730#[target_feature(enable = "avx512fp16")]
5731#[cfg_attr(test, assert_instr(vfmsub))]
5732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5733pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5734    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
5735}
5736
5737/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5738/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5739/// from c when the corresponding mask bit is not set).
5740///
5741/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
5742#[inline]
5743#[target_feature(enable = "avx512fp16")]
5744#[cfg_attr(test, assert_instr(vfmsub))]
5745#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5746pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5747    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
5748}
5749
5750/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5751/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5752/// out when the corresponding mask bit is not set).
5753///
5754/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5755#[inline]
5756#[target_feature(enable = "avx512fp16")]
5757#[cfg_attr(test, assert_instr(vfmsub))]
5758#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5759pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5760    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5761}
5762
5763/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5764/// in c from the intermediate result, and store the results in dst.
5765///
5766/// Rounding is done according to the rounding parameter, which can be one of:
5767///
5768/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5769/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5770/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5771/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5772/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5773///
5774/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5775#[inline]
5776#[target_feature(enable = "avx512fp16")]
5777#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5778#[rustc_legacy_const_generics(3)]
5779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5780pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5781    unsafe {
5782        static_assert_rounding!(ROUNDING);
5783        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5784    }
5785}
5786
5787/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5788/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5789/// from a when the corresponding mask bit is not set).
5790///
5791/// Rounding is done according to the rounding parameter, which can be one of:
5792///
5793/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5794/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5795/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5796/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5797/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5798///
5799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5800#[inline]
5801#[target_feature(enable = "avx512fp16")]
5802#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5803#[rustc_legacy_const_generics(4)]
5804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5805pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5806    a: __m512h,
5807    k: __mmask32,
5808    b: __m512h,
5809    c: __m512h,
5810) -> __m512h {
5811    unsafe {
5812        static_assert_rounding!(ROUNDING);
5813        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5814    }
5815}
5816
5817/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5818/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5819/// from c when the corresponding mask bit is not set).
5820///
5821/// Rounding is done according to the rounding parameter, which can be one of:
5822///
5823/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5824/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5825/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5826/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5827/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5828///
5829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5830#[inline]
5831#[target_feature(enable = "avx512fp16")]
5832#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5833#[rustc_legacy_const_generics(4)]
5834#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5835pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5836    a: __m512h,
5837    b: __m512h,
5838    c: __m512h,
5839    k: __mmask32,
5840) -> __m512h {
5841    unsafe {
5842        static_assert_rounding!(ROUNDING);
5843        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
5844    }
5845}
5846
5847/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5848/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5849/// out when the corresponding mask bit is not set).
5850///
5851/// Rounding is done according to the rounding parameter, which can be one of:
5852///
5853/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5854/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5855/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5856/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5857/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5858///
5859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
5860#[inline]
5861#[target_feature(enable = "avx512fp16")]
5862#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5863#[rustc_legacy_const_generics(4)]
5864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5865pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
5866    k: __mmask32,
5867    a: __m512h,
5868    b: __m512h,
5869    c: __m512h,
5870) -> __m512h {
5871    unsafe {
5872        static_assert_rounding!(ROUNDING);
5873        simd_select_bitmask(
5874            k,
5875            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
5876            _mm512_setzero_ph(),
5877        )
5878    }
5879}
5880
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5882/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5883/// 7 packed elements from a to the upper elements of dst.
5884///
5885/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
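///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` target feature is
/// available):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Lane 0 becomes 2.0 * 3.0 - 1.0 = 5.0; lanes 1..=7 are copied from `a`.
/// let r = _mm_fmsub_sh(a, b, c);
/// ```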
5886#[inline]
5887#[target_feature(enable = "avx512fp16")]
5888#[cfg_attr(test, assert_instr(vfmsub))]
5889#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5890pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5891    unsafe {
5892        let extracta: f16 = simd_extract!(a, 0);
5893        let extractb: f16 = simd_extract!(b, 0);
5894        let extractc: f16 = simd_extract!(c, 0);
5895        let r = fmaf16(extracta, extractb, -extractc);
5896        simd_insert!(a, 0, r)
5897    }
5898}
5899
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5901/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5902/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5903/// upper elements of dst.
5904///
5905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
5906#[inline]
5907#[target_feature(enable = "avx512fp16")]
5908#[cfg_attr(test, assert_instr(vfmsub))]
5909#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5910pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5911    unsafe {
5912        let mut fmsub: f16 = simd_extract!(a, 0);
5913        if k & 1 != 0 {
5914            let extractb: f16 = simd_extract!(b, 0);
5915            let extractc: f16 = simd_extract!(c, 0);
5916            fmsub = fmaf16(fmsub, extractb, -extractc);
5917        }
5918        simd_insert!(a, 0, fmsub)
5919    }
5920}
5921
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5923/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5924/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5925/// upper elements of dst.
5926///
5927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
5928#[inline]
5929#[target_feature(enable = "avx512fp16")]
5930#[cfg_attr(test, assert_instr(vfmsub))]
5931#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5932pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5933    unsafe {
5934        let mut fmsub: f16 = simd_extract!(c, 0);
5935        if k & 1 != 0 {
5936            let extracta: f16 = simd_extract!(a, 0);
5937            let extractb: f16 = simd_extract!(b, 0);
5938            fmsub = fmaf16(extracta, extractb, -fmsub);
5939        }
5940        simd_insert!(c, 0, fmsub)
5941    }
5942}
5943
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5945/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
5946/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5947/// upper elements of dst.
5948///
5949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
5950#[inline]
5951#[target_feature(enable = "avx512fp16")]
5952#[cfg_attr(test, assert_instr(vfmsub))]
5953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5954pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5955    unsafe {
5956        let mut fmsub: f16 = 0.0;
5957        if k & 1 != 0 {
5958            let extracta: f16 = simd_extract!(a, 0);
5959            let extractb: f16 = simd_extract!(b, 0);
5960            let extractc: f16 = simd_extract!(c, 0);
5961            fmsub = fmaf16(extracta, extractb, -extractc);
5962        }
5963        simd_insert!(a, 0, fmsub)
5964    }
5965}
5966
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5968/// in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5969/// 7 packed elements from a to the upper elements of dst.
5970///
5971/// Rounding is done according to the rounding parameter, which can be one of:
5972///
5973/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5974/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5975/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5976/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5977/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5978///
5979/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
5980#[inline]
5981#[target_feature(enable = "avx512fp16")]
5982#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5983#[rustc_legacy_const_generics(3)]
5984#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5985pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5986    unsafe {
5987        static_assert_rounding!(ROUNDING);
5988        let extracta: f16 = simd_extract!(a, 0);
5989        let extractb: f16 = simd_extract!(b, 0);
5990        let extractc: f16 = simd_extract!(c, 0);
5991        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
5992        simd_insert!(a, 0, r)
5993    }
5994}
5995
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
5997/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5998/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5999/// upper elements of dst.
6000///
6001/// Rounding is done according to the rounding parameter, which can be one of:
6002///
6003/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6004/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6005/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6006/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6007/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6008///
6009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6010#[inline]
6011#[target_feature(enable = "avx512fp16")]
6012#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6013#[rustc_legacy_const_generics(4)]
6014#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6015pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6016    a: __m128h,
6017    k: __mmask8,
6018    b: __m128h,
6019    c: __m128h,
6020) -> __m128h {
6021    unsafe {
6022        static_assert_rounding!(ROUNDING);
6023        let mut fmsub: f16 = simd_extract!(a, 0);
6024        if k & 1 != 0 {
6025            let extractb: f16 = simd_extract!(b, 0);
6026            let extractc: f16 = simd_extract!(c, 0);
6027            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6028        }
6029        simd_insert!(a, 0, fmsub)
6030    }
6031}
6032
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6034/// in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6035/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6036/// upper elements of dst.
6037///
6038/// Rounding is done according to the rounding parameter, which can be one of:
6039///
6040/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6041/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6042/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6043/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6044/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6045///
6046/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6047#[inline]
6048#[target_feature(enable = "avx512fp16")]
6049#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6050#[rustc_legacy_const_generics(4)]
6051#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6052pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6053    a: __m128h,
6054    b: __m128h,
6055    c: __m128h,
6056    k: __mmask8,
6057) -> __m128h {
6058    unsafe {
6059        static_assert_rounding!(ROUNDING);
6060        let mut fmsub: f16 = simd_extract!(c, 0);
6061        if k & 1 != 0 {
6062            let extracta: f16 = simd_extract!(a, 0);
6063            let extractb: f16 = simd_extract!(b, 0);
6064            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6065        }
6066        simd_insert!(c, 0, fmsub)
6067    }
6068}
6069
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
/// in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
6075/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6076#[inline]
6077#[target_feature(enable = "avx512fp16")]
6078#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6079#[rustc_legacy_const_generics(4)]
6080#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6081pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6082    k: __mmask8,
6083    a: __m128h,
6084    b: __m128h,
6085    c: __m128h,
6086) -> __m128h {
6087    unsafe {
6088        static_assert_rounding!(ROUNDING);
6089        let mut fmsub: f16 = 0.0;
6090        if k & 1 != 0 {
6091            let extracta: f16 = simd_extract!(a, 0);
6092            let extractb: f16 = simd_extract!(b, 0);
6093            let extractc: f16 = simd_extract!(c, 0);
6094            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6095        }
6096        simd_insert!(a, 0, fmsub)
6097    }
6098}
6099
6100/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6101/// result from packed elements in c, and store the results in dst.
6102///
6103/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
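///
/// A minimal usage sketch (hypothetical values; assumes the `avx512fp16` and `avx512vl` target
/// features are available):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(10.0);
/// // Every lane becomes -(a * b) + c = 4.0.
/// let r = _mm_fnmadd_ph(a, b, c);
/// ```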
6104#[inline]
6105#[target_feature(enable = "avx512fp16,avx512vl")]
6106#[cfg_attr(test, assert_instr(vfnmadd))]
6107#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6108pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6109    unsafe { simd_fma(simd_neg(a), b, c) }
6110}
6111
6112/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6113/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6114/// from a when the corresponding mask bit is not set).
6115///
6116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6117#[inline]
6118#[target_feature(enable = "avx512fp16,avx512vl")]
6119#[cfg_attr(test, assert_instr(vfnmadd))]
6120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6121pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6122    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6123}
6124
6125/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6126/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6127/// from c when the corresponding mask bit is not set).
6128///
6129/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6130#[inline]
6131#[target_feature(enable = "avx512fp16,avx512vl")]
6132#[cfg_attr(test, assert_instr(vfnmadd))]
6133#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6134pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6135    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6136}
6137
6138/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6139/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6140/// out when the corresponding mask bit is not set).
6141///
6142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6143#[inline]
6144#[target_feature(enable = "avx512fp16,avx512vl")]
6145#[cfg_attr(test, assert_instr(vfnmadd))]
6146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6147pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6148    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6149}
6150
6151/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6152/// result from packed elements in c, and store the results in dst.
6153///
6154/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6155#[inline]
6156#[target_feature(enable = "avx512fp16,avx512vl")]
6157#[cfg_attr(test, assert_instr(vfnmadd))]
6158#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6159pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6160    unsafe { simd_fma(simd_neg(a), b, c) }
6161}
6162
6163/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6164/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6165/// from a when the corresponding mask bit is not set).
6166///
6167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6168#[inline]
6169#[target_feature(enable = "avx512fp16,avx512vl")]
6170#[cfg_attr(test, assert_instr(vfnmadd))]
6171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6172pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6173    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6174}
6175
6176/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6177/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6178/// from c when the corresponding mask bit is not set).
6179///
6180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6181#[inline]
6182#[target_feature(enable = "avx512fp16,avx512vl")]
6183#[cfg_attr(test, assert_instr(vfnmadd))]
6184#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6185pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6186    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6187}
6188
6189/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6190/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6191/// out when the corresponding mask bit is not set).
6192///
6193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6194#[inline]
6195#[target_feature(enable = "avx512fp16,avx512vl")]
6196#[cfg_attr(test, assert_instr(vfnmadd))]
6197#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6198pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6199    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6200}
6201
6202/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6203/// result from packed elements in c, and store the results in dst.
6204///
6205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6206#[inline]
6207#[target_feature(enable = "avx512fp16")]
6208#[cfg_attr(test, assert_instr(vfnmadd))]
6209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6210pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6211    unsafe { simd_fma(simd_neg(a), b, c) }
6212}
6213
6214/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6215/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6216/// from a when the corresponding mask bit is not set).
6217///
6218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6219#[inline]
6220#[target_feature(enable = "avx512fp16")]
6221#[cfg_attr(test, assert_instr(vfnmadd))]
6222#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6223pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6224    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6225}
6226
6227/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6228/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6229/// from c when the corresponding mask bit is not set).
6230///
6231/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6232#[inline]
6233#[target_feature(enable = "avx512fp16")]
6234#[cfg_attr(test, assert_instr(vfnmadd))]
6235#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6236pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6237    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6238}
6239
6240/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6241/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6242/// out when the corresponding mask bit is not set).
6243///
6244/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6245#[inline]
6246#[target_feature(enable = "avx512fp16")]
6247#[cfg_attr(test, assert_instr(vfnmadd))]
6248#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6249pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6250    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6251}
6252
6253/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6254/// result from packed elements in c, and store the results in dst.
6255///
6256/// Rounding is done according to the rounding parameter, which can be one of:
6257///
6258/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6259/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6260/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6261/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6262/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6263///
6264/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6265#[inline]
6266#[target_feature(enable = "avx512fp16")]
6267#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6268#[rustc_legacy_const_generics(3)]
6269#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6270pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6271    unsafe {
6272        static_assert_rounding!(ROUNDING);
6273        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6274    }
6275}
6276
6277/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6278/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6279/// from a when the corresponding mask bit is not set).
6280///
6281/// Rounding is done according to the rounding parameter, which can be one of:
6282///
6283/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6284/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6285/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6286/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6287/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6288///
6289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6290#[inline]
6291#[target_feature(enable = "avx512fp16")]
6292#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6293#[rustc_legacy_const_generics(4)]
6294#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6295pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6296    a: __m512h,
6297    k: __mmask32,
6298    b: __m512h,
6299    c: __m512h,
6300) -> __m512h {
6301    unsafe {
6302        static_assert_rounding!(ROUNDING);
6303        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6304    }
6305}
6306
6307/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6308/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6309/// from c when the corresponding mask bit is not set).
6310///
6311/// Rounding is done according to the rounding parameter, which can be one of:
6312///
6313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6318///
6319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6320#[inline]
6321#[target_feature(enable = "avx512fp16")]
6322#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6323#[rustc_legacy_const_generics(4)]
6324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6325pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6326    a: __m512h,
6327    b: __m512h,
6328    c: __m512h,
6329    k: __mmask32,
6330) -> __m512h {
6331    unsafe {
6332        static_assert_rounding!(ROUNDING);
6333        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6334    }
6335}
6336
6337/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6338/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6339/// out when the corresponding mask bit is not set).
6340///
6341/// Rounding is done according to the rounding parameter, which can be one of:
6342///
6343/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6344/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6345/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6346/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6347/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6348///
6349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6350#[inline]
6351#[target_feature(enable = "avx512fp16")]
6352#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6353#[rustc_legacy_const_generics(4)]
6354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6355pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6356    k: __mmask32,
6357    a: __m512h,
6358    b: __m512h,
6359    c: __m512h,
6360) -> __m512h {
6361    unsafe {
6362        static_assert_rounding!(ROUNDING);
6363        simd_select_bitmask(
6364            k,
6365            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6366            _mm512_setzero_ph(),
6367        )
6368    }
6369}
6370
6371/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6372/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6373/// elements from a to the upper elements of dst.
6374///
6375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
6376#[inline]
6377#[target_feature(enable = "avx512fp16")]
6378#[cfg_attr(test, assert_instr(vfnmadd))]
6379#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6380pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6381    unsafe {
6382        let extracta: f16 = simd_extract!(a, 0);
6383        let extractb: f16 = simd_extract!(b, 0);
6384        let extractc: f16 = simd_extract!(c, 0);
6385        let r = fmaf16(-extracta, extractb, extractc);
6386        simd_insert!(a, 0, r)
6387    }
6388}
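
// Editorial usage sketch, not part of the original source: the scalar form only touches
// lane 0. The low lane becomes c0 - a0 * b0 and the upper seven lanes are copied from `a`.
#[target_feature(enable = "avx512fp16")]
fn _example_fnmadd_sh() -> __m128h {
    let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(10.0);
    // Lane 0: 10.0 - 2.0 * 3.0 = 4.0; lanes 1..8 keep 1.0, 2.0, ..., 7.0 from `a`.
    _mm_fnmadd_sh(a, b, c)
}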
6389
6390/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6391/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6392/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6393/// elements of dst.
6394///
6395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6396#[inline]
6397#[target_feature(enable = "avx512fp16")]
6398#[cfg_attr(test, assert_instr(vfnmadd))]
6399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6400pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6401    unsafe {
6402        let mut fnmadd: f16 = simd_extract!(a, 0);
6403        if k & 1 != 0 {
6404            let extractb: f16 = simd_extract!(b, 0);
6405            let extractc: f16 = simd_extract!(c, 0);
6406            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6407        }
6408        simd_insert!(a, 0, fnmadd)
6409    }
6410}
6411
6412/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6413/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6414/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6415/// elements of dst.
6416///
6417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6418#[inline]
6419#[target_feature(enable = "avx512fp16")]
6420#[cfg_attr(test, assert_instr(vfnmadd))]
6421#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6422pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6423    unsafe {
6424        let mut fnmadd: f16 = simd_extract!(c, 0);
6425        if k & 1 != 0 {
6426            let extracta: f16 = simd_extract!(a, 0);
6427            let extractb: f16 = simd_extract!(b, 0);
6428            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6429        }
6430        simd_insert!(c, 0, fnmadd)
6431    }
6432}
6433
6434/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6435/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6436/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6437/// elements of dst.
6438///
6439/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6440#[inline]
6441#[target_feature(enable = "avx512fp16")]
6442#[cfg_attr(test, assert_instr(vfnmadd))]
6443#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6444pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6445    unsafe {
6446        let mut fnmadd: f16 = 0.0;
6447        if k & 1 != 0 {
6448            let extracta: f16 = simd_extract!(a, 0);
6449            let extractb: f16 = simd_extract!(b, 0);
6450            let extractc: f16 = simd_extract!(c, 0);
6451            fnmadd = fmaf16(-extracta, extractb, extractc);
6452        }
6453        simd_insert!(a, 0, fnmadd)
6454    }
6455}
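
// Editorial sketch, not part of the original source: with the zeromask form only bit 0 of
// `k` affects the result lane; the upper lanes always come from `a`.
#[target_feature(enable = "avx512fp16")]
fn _example_maskz_fnmadd_sh(k: __mmask8) -> __m128h {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(10.0);
    // If k & 1 != 0, lane 0 is 10.0 - 2.0 * 3.0 = 4.0; otherwise it is zeroed.
    _mm_maskz_fnmadd_sh(k, a, b, c)
}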
6456
6457/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6458/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6459/// elements from a to the upper elements of dst.
6460///
6461/// Rounding is done according to the rounding parameter, which can be one of:
6462///
6463/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6464/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6465/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6466/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6467/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6468///
6469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
6470#[inline]
6471#[target_feature(enable = "avx512fp16")]
6472#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6473#[rustc_legacy_const_generics(3)]
6474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6475pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6476    unsafe {
6477        static_assert_rounding!(ROUNDING);
6478        let extracta: f16 = simd_extract!(a, 0);
6479        let extractb: f16 = simd_extract!(b, 0);
6480        let extractc: f16 = simd_extract!(c, 0);
6481        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6482        simd_insert!(a, 0, r)
6483    }
6484}
6485
6486/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6487/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6488/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6489/// elements of dst.
6490///
6491/// Rounding is done according to the rounding parameter, which can be one of:
6492///
6493/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6494/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6495/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6496/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6497/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6498///
6499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6500#[inline]
6501#[target_feature(enable = "avx512fp16")]
6502#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6503#[rustc_legacy_const_generics(4)]
6504#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6505pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6506    a: __m128h,
6507    k: __mmask8,
6508    b: __m128h,
6509    c: __m128h,
6510) -> __m128h {
6511    unsafe {
6512        static_assert_rounding!(ROUNDING);
6513        let mut fnmadd: f16 = simd_extract!(a, 0);
6514        if k & 1 != 0 {
6515            let extractb: f16 = simd_extract!(b, 0);
6516            let extractc: f16 = simd_extract!(c, 0);
6517            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6518        }
6519        simd_insert!(a, 0, fnmadd)
6520    }
6521}
6522
6523/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6524/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6525/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6526/// elements of dst.
6527///
6528/// Rounding is done according to the rounding parameter, which can be one of:
6529///
6530/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6531/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6532/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6533/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6534/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6535///
6536/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6537#[inline]
6538#[target_feature(enable = "avx512fp16")]
6539#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6540#[rustc_legacy_const_generics(4)]
6541#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6542pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6543    a: __m128h,
6544    b: __m128h,
6545    c: __m128h,
6546    k: __mmask8,
6547) -> __m128h {
6548    unsafe {
6549        static_assert_rounding!(ROUNDING);
6550        let mut fnmadd: f16 = simd_extract!(c, 0);
6551        if k & 1 != 0 {
6552            let extracta: f16 = simd_extract!(a, 0);
6553            let extractb: f16 = simd_extract!(b, 0);
6554            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6555        }
6556        simd_insert!(c, 0, fnmadd)
6557    }
6558}
6559
6560/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6561/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6562/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6563/// elements of dst.
6564///
6565/// Rounding is done according to the rounding parameter, which can be one of:
6566///
6567/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6568/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6569/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6570/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6571/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6572///
6573/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6574#[inline]
6575#[target_feature(enable = "avx512fp16")]
6576#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6577#[rustc_legacy_const_generics(4)]
6578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6579pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6580    k: __mmask8,
6581    a: __m128h,
6582    b: __m128h,
6583    c: __m128h,
6584) -> __m128h {
6585    unsafe {
6586        static_assert_rounding!(ROUNDING);
6587        let mut fnmadd: f16 = 0.0;
6588        if k & 1 != 0 {
6589            let extracta: f16 = simd_extract!(a, 0);
6590            let extractb: f16 = simd_extract!(b, 0);
6591            let extractc: f16 = simd_extract!(c, 0);
6592            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6593        }
6594        simd_insert!(a, 0, fnmadd)
6595    }
6596}
6597
6598/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6599/// in c from the negated intermediate result, and store the results in dst.
6600///
6601/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
6602#[inline]
6603#[target_feature(enable = "avx512fp16,avx512vl")]
6604#[cfg_attr(test, assert_instr(vfnmsub))]
6605#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6606pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6607    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6608}
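
// Editorial sketch, not part of the original source: each lane of the packed fnmsub is
// -(a * b) - c, which is exactly what `simd_fma(simd_neg(a), b, simd_neg(c))` computes.
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _example_fnmsub_ph() -> __m128h {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    // Every lane becomes -(2.0 * 3.0) - 1.0 = -7.0.
    _mm_fnmsub_ph(a, b, c)
}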
6609
6610/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6611/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6612/// copied from a when the corresponding mask bit is not set).
6613///
6614/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6615#[inline]
6616#[target_feature(enable = "avx512fp16,avx512vl")]
6617#[cfg_attr(test, assert_instr(vfnmsub))]
6618#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6619pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6620    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6621}
6622
6623/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6624/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6625/// copied from c when the corresponding mask bit is not set).
6626///
6627/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6628#[inline]
6629#[target_feature(enable = "avx512fp16,avx512vl")]
6630#[cfg_attr(test, assert_instr(vfnmsub))]
6631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6632pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6633    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6634}
6635
6636/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6637/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6638/// zeroed out when the corresponding mask bit is not set).
6639///
6640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6641#[inline]
6642#[target_feature(enable = "avx512fp16,avx512vl")]
6643#[cfg_attr(test, assert_instr(vfnmsub))]
6644#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6645pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6646    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6647}
6648
6649/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6650/// in c from the negated intermediate result, and store the results in dst.
6651///
6652/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6653#[inline]
6654#[target_feature(enable = "avx512fp16,avx512vl")]
6655#[cfg_attr(test, assert_instr(vfnmsub))]
6656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6657pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6658    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6659}
6660
6661/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6662/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6663/// copied from a when the corresponding mask bit is not set).
6664///
6665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6666#[inline]
6667#[target_feature(enable = "avx512fp16,avx512vl")]
6668#[cfg_attr(test, assert_instr(vfnmsub))]
6669#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6670pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6671    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6672}
6673
6674/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6675/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6676/// copied from c when the corresponding mask bit is not set).
6677///
6678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6679#[inline]
6680#[target_feature(enable = "avx512fp16,avx512vl")]
6681#[cfg_attr(test, assert_instr(vfnmsub))]
6682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6683pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6684    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6685}
6686
6687/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6688/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6689/// zeroed out when the corresponding mask bit is not set).
6690///
6691/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6692#[inline]
6693#[target_feature(enable = "avx512fp16,avx512vl")]
6694#[cfg_attr(test, assert_instr(vfnmsub))]
6695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6696pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6697    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6698}
6699
6700/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6701/// in c from the negated intermediate result, and store the results in dst.
6702///
6703/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6704#[inline]
6705#[target_feature(enable = "avx512fp16")]
6706#[cfg_attr(test, assert_instr(vfnmsub))]
6707#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6708pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6709    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6710}
6711
6712/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6713/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6714/// copied from a when the corresponding mask bit is not set).
6715///
6716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6717#[inline]
6718#[target_feature(enable = "avx512fp16")]
6719#[cfg_attr(test, assert_instr(vfnmsub))]
6720#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6721pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6722    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6723}
6724
6725/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6726/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6727/// copied from c when the corresponding mask bit is not set).
6728///
6729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6730#[inline]
6731#[target_feature(enable = "avx512fp16")]
6732#[cfg_attr(test, assert_instr(vfnmsub))]
6733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6734pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6735    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6736}
6737
6738/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6739/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6740/// zeroed out when the corresponding mask bit is not set).
6741///
6742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6743#[inline]
6744#[target_feature(enable = "avx512fp16")]
6745#[cfg_attr(test, assert_instr(vfnmsub))]
6746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6747pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6748    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6749}
6750
6751/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6752/// in c from the negated intermediate result, and store the results in dst.
6753///
6754/// Rounding is done according to the rounding parameter, which can be one of:
6755///
6756/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6757/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6758/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6759/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6760/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6761///
6762/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
6763#[inline]
6764#[target_feature(enable = "avx512fp16")]
6765#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6766#[rustc_legacy_const_generics(3)]
6767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6768pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6769    unsafe {
6770        static_assert_rounding!(ROUNDING);
6771        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
6772    }
6773}
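
// Editorial sketch, not part of the original source: the rounding mode is a const generic,
// so the rounding and exception-suppression constants are combined at compile time.
#[target_feature(enable = "avx512fp16")]
fn _example_fnmsub_round_ph() -> __m512h {
    let a = _mm512_set1_ph(2.0);
    let b = _mm512_set1_ph(3.0);
    let c = _mm512_set1_ph(1.0);
    // Every lane becomes -(2.0 * 3.0) - 1.0 = -7.0, truncated, with exceptions suppressed.
    _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c)
}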
6774
6775/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6776/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6777/// copied from a when the corresponding mask bit is not set).
6778///
6779/// Rounding is done according to the rounding parameter, which can be one of:
6780///
6781/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6782/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6783/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6784/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6785/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6786///
6787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
6788#[inline]
6789#[target_feature(enable = "avx512fp16")]
6790#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6791#[rustc_legacy_const_generics(4)]
6792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6793pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
6794    a: __m512h,
6795    k: __mmask32,
6796    b: __m512h,
6797    c: __m512h,
6798) -> __m512h {
6799    unsafe {
6800        static_assert_rounding!(ROUNDING);
6801        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
6802    }
6803}
6804
6805/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6806/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6807/// copied from c when the corresponding mask bit is not set).
6808///
6809/// Rounding is done according to the rounding parameter, which can be one of:
6810///
6811/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6812/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6813/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6814/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6815/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6816///
6817/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
6818#[inline]
6819#[target_feature(enable = "avx512fp16")]
6820#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6821#[rustc_legacy_const_generics(4)]
6822#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6823pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
6824    a: __m512h,
6825    b: __m512h,
6826    c: __m512h,
6827    k: __mmask32,
6828) -> __m512h {
6829    unsafe {
6830        static_assert_rounding!(ROUNDING);
6831        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
6832    }
6833}
6834
6835/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6836/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6837/// zeroed out when the corresponding mask bit is not set).
6838///
6839/// Rounding is done according to the rounding parameter, which can be one of:
6840///
6841/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6842/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6843/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6844/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6845/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6846///
6847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
6848#[inline]
6849#[target_feature(enable = "avx512fp16")]
6850#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6851#[rustc_legacy_const_generics(4)]
6852#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6853pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
6854    k: __mmask32,
6855    a: __m512h,
6856    b: __m512h,
6857    c: __m512h,
6858) -> __m512h {
6859    unsafe {
6860        static_assert_rounding!(ROUNDING);
6861        simd_select_bitmask(
6862            k,
6863            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
6864            _mm512_setzero_ph(),
6865        )
6866    }
6867}
6868
6869/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6870/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
6871/// the upper 7 packed elements from a to the upper elements of dst.
6872///
6873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
6874#[inline]
6875#[target_feature(enable = "avx512fp16")]
6876#[cfg_attr(test, assert_instr(vfnmsub))]
6877#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6878pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6879    unsafe {
6880        let extracta: f16 = simd_extract!(a, 0);
6881        let extractb: f16 = simd_extract!(b, 0);
6882        let extractc: f16 = simd_extract!(c, 0);
6883        let r = fmaf16(-extracta, extractb, -extractc);
6884        simd_insert!(a, 0, r)
6885    }
6886}
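
// Editorial sketch, not part of the original source: lane 0 becomes -(a0 * b0) - c0 and
// the upper seven lanes are copied from `a`.
#[target_feature(enable = "avx512fp16")]
fn _example_fnmsub_sh() -> __m128h {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(1.0);
    // Lane 0: -(2.0 * 3.0) - 1.0 = -7.0; lanes 1..8 are the (zero) upper lanes of `a`.
    _mm_fnmsub_sh(a, b, c)
}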
6887
6888/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6889/// in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the
6890/// element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6891/// upper elements of dst.
6892///
6893/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
6894#[inline]
6895#[target_feature(enable = "avx512fp16")]
6896#[cfg_attr(test, assert_instr(vfnmsub))]
6897#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6898pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6899    unsafe {
6900        let mut fnmsub: f16 = simd_extract!(a, 0);
6901        if k & 1 != 0 {
6902            let extractb: f16 = simd_extract!(b, 0);
6903            let extractc: f16 = simd_extract!(c, 0);
6904            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
6905        }
6906        simd_insert!(a, 0, fnmsub)
6907    }
6908}
6909
6910/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6911/// in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the
6912/// element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6913/// upper elements of dst.
6914///
6915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
6916#[inline]
6917#[target_feature(enable = "avx512fp16")]
6918#[cfg_attr(test, assert_instr(vfnmsub))]
6919#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6920pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6921    unsafe {
6922        let mut fnmsub: f16 = simd_extract!(c, 0);
6923        if k & 1 != 0 {
6924            let extracta: f16 = simd_extract!(a, 0);
6925            let extractb: f16 = simd_extract!(b, 0);
6926            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
6927        }
6928        simd_insert!(c, 0, fnmsub)
6929    }
6930}
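
// Editorial sketch, not part of the original source: the mask3 form merges into `c`, so
// both the unselected lane 0 and the upper lanes come from `c` rather than `a`.
#[target_feature(enable = "avx512fp16")]
fn _example_mask3_fnmsub_sh(k: __mmask8) -> __m128h {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(1.0);
    // If k & 1 != 0, lane 0 is -(2.0 * 3.0) - 1.0 = -7.0; otherwise it stays 1.0 from `c`.
    _mm_mask3_fnmsub_sh(a, b, c, k)
}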
6931
6932/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6933/// in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the
6934/// element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6935/// upper elements of dst.
6936///
6937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
6938#[inline]
6939#[target_feature(enable = "avx512fp16")]
6940#[cfg_attr(test, assert_instr(vfnmsub))]
6941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6942pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6943    unsafe {
6944        let mut fnmsub: f16 = 0.0;
6945        if k & 1 != 0 {
6946            let extracta: f16 = simd_extract!(a, 0);
6947            let extractb: f16 = simd_extract!(b, 0);
6948            let extractc: f16 = simd_extract!(c, 0);
6949            fnmsub = fmaf16(-extracta, extractb, -extractc);
6950        }
6951        simd_insert!(a, 0, fnmsub)
6952    }
6953}
6954
6955/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6956/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
6957/// the upper 7 packed elements from a to the upper elements of dst.
6958///
6959/// Rounding is done according to the rounding parameter, which can be one of:
6960///
6961/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6962/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6963/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6964/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6965/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6966///
6967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
6968#[inline]
6969#[target_feature(enable = "avx512fp16")]
6970#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6971#[rustc_legacy_const_generics(3)]
6972#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6973pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6974    unsafe {
6975        static_assert_rounding!(ROUNDING);
6976        let extracta: f16 = simd_extract!(a, 0);
6977        let extractb: f16 = simd_extract!(b, 0);
6978        let extractc: f16 = simd_extract!(c, 0);
6979        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
6980        simd_insert!(a, 0, r)
6981    }
6982}
6983
6984/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
6985/// in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the
6986/// element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6987/// upper elements of dst.
6988///
6989/// Rounding is done according to the rounding parameter, which can be one of:
6990///
6991/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6992/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6993/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6994/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6995/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6996///
6997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
6998#[inline]
6999#[target_feature(enable = "avx512fp16")]
7000#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7001#[rustc_legacy_const_generics(4)]
7002#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7003pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7004    a: __m128h,
7005    k: __mmask8,
7006    b: __m128h,
7007    c: __m128h,
7008) -> __m128h {
7009    unsafe {
7010        static_assert_rounding!(ROUNDING);
7011        let mut fnmsub: f16 = simd_extract!(a, 0);
7012        if k & 1 != 0 {
7013            let extractb: f16 = simd_extract!(b, 0);
7014            let extractc: f16 = simd_extract!(c, 0);
7015            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7016        }
7017        simd_insert!(a, 0, fnmsub)
7018    }
7019}
7020
7021/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
7022/// in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the
7023/// element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
7024/// upper elements of dst.
7025///
7026/// Rounding is done according to the rounding parameter, which can be one of:
7027///
7028/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7029/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7030/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7031/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7032/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7033///
7034/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7035#[inline]
7036#[target_feature(enable = "avx512fp16")]
7037#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7038#[rustc_legacy_const_generics(4)]
7039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7040pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7041    a: __m128h,
7042    b: __m128h,
7043    c: __m128h,
7044    k: __mmask8,
7045) -> __m128h {
7046    unsafe {
7047        static_assert_rounding!(ROUNDING);
7048        let mut fnmsub: f16 = simd_extract!(c, 0);
7049        if k & 1 != 0 {
7050            let extracta: f16 = simd_extract!(a, 0);
7051            let extractb: f16 = simd_extract!(b, 0);
7052            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7053        }
7054        simd_insert!(c, 0, fnmsub)
7055    }
7056}
7057
7058/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower element
7059/// in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the
7060/// element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
7061/// upper elements of dst.
7062///
7063/// Rounding is done according to the rounding parameter, which can be one of:
7064///
7065/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7066/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7067/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7068/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7069/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7070///
7071/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7072#[inline]
7073#[target_feature(enable = "avx512fp16")]
7074#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7075#[rustc_legacy_const_generics(4)]
7076#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7077pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7078    k: __mmask8,
7079    a: __m128h,
7080    b: __m128h,
7081    c: __m128h,
7082) -> __m128h {
7083    unsafe {
7084        static_assert_rounding!(ROUNDING);
7085        let mut fnmsub: f16 = 0.0;
7086        if k & 1 != 0 {
7087            let extracta: f16 = simd_extract!(a, 0);
7088            let extractb: f16 = simd_extract!(b, 0);
7089            let extractc: f16 = simd_extract!(c, 0);
7090            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7091        }
7092        simd_insert!(a, 0, fnmsub)
7093    }
7094}
7095
7096/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7097/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7098///
7099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
7100#[inline]
7101#[target_feature(enable = "avx512fp16,avx512vl")]
7102#[cfg_attr(test, assert_instr(vfmaddsub))]
7103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7104pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7105    unsafe { vfmaddsubph_128(a, b, c) }
7106}
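
// Editorial sketch, not part of the original source: fmaddsub subtracts `c` in even lanes
// and adds `c` in odd lanes (lane 0 is a subtract lane).
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _example_fmaddsub_ph() -> __m128h {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    // Even lanes: 2.0 * 3.0 - 1.0 = 5.0; odd lanes: 2.0 * 3.0 + 1.0 = 7.0.
    _mm_fmaddsub_ph(a, b, c)
}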
7107
7108/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7109/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7110/// (the element is copied from a when the corresponding mask bit is not set).
7111///
7112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7113#[inline]
7114#[target_feature(enable = "avx512fp16,avx512vl")]
7115#[cfg_attr(test, assert_instr(vfmaddsub))]
7116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7117pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7118    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7119}
7120
7121/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7122/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7123/// (the element is copied from c when the corresponding mask bit is not set).
7124///
7125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7126#[inline]
7127#[target_feature(enable = "avx512fp16,avx512vl")]
7128#[cfg_attr(test, assert_instr(vfmaddsub))]
7129#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7130pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7131    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7132}
7133
7134/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7135/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7136/// (the element is zeroed out when the corresponding mask bit is not set).
7137///
7138/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7139#[inline]
7140#[target_feature(enable = "avx512fp16,avx512vl")]
7141#[cfg_attr(test, assert_instr(vfmaddsub))]
7142#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7143pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7144    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7145}
7146
7147/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7148/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7149///
7150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7151#[inline]
7152#[target_feature(enable = "avx512fp16,avx512vl")]
7153#[cfg_attr(test, assert_instr(vfmaddsub))]
7154#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7155pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7156    unsafe { vfmaddsubph_256(a, b, c) }
7157}
7158
7159/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7160/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7161/// (the element is copied from a when the corresponding mask bit is not set).
7162///
7163/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7164#[inline]
7165#[target_feature(enable = "avx512fp16,avx512vl")]
7166#[cfg_attr(test, assert_instr(vfmaddsub))]
7167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7168pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7169    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7170}
7171
7172/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7173/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7174/// (the element is copied from c when the corresponding mask bit is not set).
7175///
7176/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7177#[inline]
7178#[target_feature(enable = "avx512fp16,avx512vl")]
7179#[cfg_attr(test, assert_instr(vfmaddsub))]
7180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7181pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7182    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7183}
7184
7185/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7186/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7187/// (the element is zeroed out when the corresponding mask bit is not set).
7188///
7189/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7190#[inline]
7191#[target_feature(enable = "avx512fp16,avx512vl")]
7192#[cfg_attr(test, assert_instr(vfmaddsub))]
7193#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7194pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7195    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7196}
7197
7198/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7199/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7200///
7201/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7202#[inline]
7203#[target_feature(enable = "avx512fp16")]
7204#[cfg_attr(test, assert_instr(vfmaddsub))]
7205#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7206pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7207    _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7208}
7209
7210/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7211/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7212/// (the element is copied from a when the corresponding mask bit is not set).
7213///
7214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7215#[inline]
7216#[target_feature(enable = "avx512fp16")]
7217#[cfg_attr(test, assert_instr(vfmaddsub))]
7218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7219pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7220    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7221}
7222
7223/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7224/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7225/// (the element is copied from c when the corresponding mask bit is not set).
7226///
7227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7228#[inline]
7229#[target_feature(enable = "avx512fp16")]
7230#[cfg_attr(test, assert_instr(vfmaddsub))]
7231#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7232pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7233    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7234}
7235
7236/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7237/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7238/// (the element is zeroed out when the corresponding mask bit is not set).
7239///
7240/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7241#[inline]
7242#[target_feature(enable = "avx512fp16")]
7243#[cfg_attr(test, assert_instr(vfmaddsub))]
7244#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7245pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7246    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7247}
7248
7249/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7250/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7251///
7252/// Rounding is done according to the rounding parameter, which can be one of:
7253///
7254/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7255/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7256/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7257/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7258/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7259///
7260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
7261#[inline]
7262#[target_feature(enable = "avx512fp16")]
7263#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7264#[rustc_legacy_const_generics(3)]
7265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7266pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
7267    a: __m512h,
7268    b: __m512h,
7269    c: __m512h,
7270) -> __m512h {
7271    unsafe {
7272        static_assert_rounding!(ROUNDING);
7273        vfmaddsubph_512(a, b, c, ROUNDING)
7274    }
7275}
7276
7277/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7278/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7279/// (the element is copied from a when the corresponding mask bit is not set).
7280///
7281/// Rounding is done according to the rounding parameter, which can be one of:
7282///
7283/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7284/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7285/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7286/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7287/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7288///
7289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7290#[inline]
7291#[target_feature(enable = "avx512fp16")]
7292#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7293#[rustc_legacy_const_generics(4)]
7294#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7295pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7296    a: __m512h,
7297    k: __mmask32,
7298    b: __m512h,
7299    c: __m512h,
7300) -> __m512h {
7301    unsafe {
7302        static_assert_rounding!(ROUNDING);
7303        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7304    }
7305}
7306
7307/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7308/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7309/// (the element is copied from c when the corresponding mask bit is not set).
7310///
7311/// Rounding is done according to the rounding parameter, which can be one of:
7312///
7313/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7314/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7315/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7316/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7317/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7318///
7319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7320#[inline]
7321#[target_feature(enable = "avx512fp16")]
7322#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7323#[rustc_legacy_const_generics(4)]
7324#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7325pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7326    a: __m512h,
7327    b: __m512h,
7328    c: __m512h,
7329    k: __mmask32,
7330) -> __m512h {
7331    unsafe {
7332        static_assert_rounding!(ROUNDING);
7333        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7334    }
7335}
7336
7337/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7338/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7339/// (the element is zeroed out when the corresponding mask bit is not set).
7340///
7341/// Rounding is done according to the rounding parameter, which can be one of:
7342///
7343/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7344/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7345/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7346/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7347/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7348///
7349/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7350#[inline]
7351#[target_feature(enable = "avx512fp16")]
7352#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7353#[rustc_legacy_const_generics(4)]
7354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7355pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7356    k: __mmask32,
7357    a: __m512h,
7358    b: __m512h,
7359    c: __m512h,
7360) -> __m512h {
7361    unsafe {
7362        static_assert_rounding!(ROUNDING);
7363        simd_select_bitmask(
7364            k,
7365            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7366            _mm512_setzero_ph(),
7367        )
7368    }
7369}
7370
7371/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7372/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7373///
7374/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
7375#[inline]
7376#[target_feature(enable = "avx512fp16,avx512vl")]
7377#[cfg_attr(test, assert_instr(vfmsubadd))]
7378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7379pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7380    unsafe { vfmaddsubph_128(a, b, simd_neg(c)) }
7381}
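
// Editorial sketch, not part of the original source: fmsubadd mirrors fmaddsub. It adds
// `c` in even lanes and subtracts `c` in odd lanes, implemented above by negating `c`.
#[target_feature(enable = "avx512fp16,avx512vl")]
fn _example_fmsubadd_ph() -> __m128h {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    // Even lanes: 2.0 * 3.0 + 1.0 = 7.0; odd lanes: 2.0 * 3.0 - 1.0 = 5.0.
    _mm_fmsubadd_ph(a, b, c)
}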
7382
7383/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7384/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7385/// (the element is copied from a when the corresponding mask bit is not set).
7386///
7387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7388#[inline]
7389#[target_feature(enable = "avx512fp16,avx512vl")]
7390#[cfg_attr(test, assert_instr(vfmsubadd))]
7391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7392pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7393    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7394}
7395
7396/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7397/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7398/// (the element is copied from c when the corresponding mask bit is not set).
7399///
7400/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7401#[inline]
7402#[target_feature(enable = "avx512fp16,avx512vl")]
7403#[cfg_attr(test, assert_instr(vfmsubadd))]
7404#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7405pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7406    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7407}
7408
7409/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7410/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7411/// (the element is zeroed out when the corresponding mask bit is not set).
7412///
7413/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7414#[inline]
7415#[target_feature(enable = "avx512fp16,avx512vl")]
7416#[cfg_attr(test, assert_instr(vfmsubadd))]
7417#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7418pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7419    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7420}
7421
7422/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7423/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7424///
7425/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7426#[inline]
7427#[target_feature(enable = "avx512fp16,avx512vl")]
7428#[cfg_attr(test, assert_instr(vfmsubadd))]
7429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7430pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7431    unsafe { vfmaddsubph_256(a, b, simd_neg(c)) }
7432}
7433
7434/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7435/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7436/// (the element is copied from a when the corresponding mask bit is not set).
7437///
7438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7439#[inline]
7440#[target_feature(enable = "avx512fp16,avx512vl")]
7441#[cfg_attr(test, assert_instr(vfmsubadd))]
7442#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7443pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7444    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7445}
7446
7447/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7448/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7449/// (the element is copied from c when the corresponding mask bit is not set).
7450///
7451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7452#[inline]
7453#[target_feature(enable = "avx512fp16,avx512vl")]
7454#[cfg_attr(test, assert_instr(vfmsubadd))]
7455#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7456pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7457    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7458}
7459
7460/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7461/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7462/// (the element is zeroed out when the corresponding mask bit is not set).
7463///
7464/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7465#[inline]
7466#[target_feature(enable = "avx512fp16,avx512vl")]
7467#[cfg_attr(test, assert_instr(vfmsubadd))]
7468#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7469pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7470    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7471}
7472
7473/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7474/// and add packed elements in c from/to the intermediate result, and store the results in dst.
7475///
7476/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7477#[inline]
7478#[target_feature(enable = "avx512fp16")]
7479#[cfg_attr(test, assert_instr(vfmsubadd))]
7480#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7481pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7482    _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7483}
7484
7485/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7486/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7487/// (the element is copied from a when the corresponding mask bit is not set).
7488///
7489/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7490#[inline]
7491#[target_feature(enable = "avx512fp16")]
7492#[cfg_attr(test, assert_instr(vfmsubadd))]
7493#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7494pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7495    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7496}
7497
7498/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7499/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7500/// (the element is copied from c when the corresponding mask bit is not set).
7501///
7502/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7503#[inline]
7504#[target_feature(enable = "avx512fp16")]
7505#[cfg_attr(test, assert_instr(vfmsubadd))]
7506#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7507pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7508    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7509}
7510
7511/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7512/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7513/// (the element is zeroed out when the corresponding mask bit is not set).
7514///
7515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7516#[inline]
7517#[target_feature(enable = "avx512fp16")]
7518#[cfg_attr(test, assert_instr(vfmsubadd))]
7519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7520pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7521    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7522}
7523
7524/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7525/// and add packed elements in c from/to the intermediate result, and store the results in dst.
7526///
7527/// Rounding is done according to the rounding parameter, which can be one of:
7528///
7529/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7530/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7531/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7532/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7533/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7534///
7535/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
7536#[inline]
7537#[target_feature(enable = "avx512fp16")]
7538#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7539#[rustc_legacy_const_generics(3)]
7540#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7541pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7542    a: __m512h,
7543    b: __m512h,
7544    c: __m512h,
7545) -> __m512h {
7546    unsafe {
7547        static_assert_rounding!(ROUNDING);
7548        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7549    }
7550}
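
// Editorial usage sketch (not part of the upstream crate): shows how a rounding mode is
// supplied through the ROUNDING const generic, here round-to-nearest with floating-point
// exceptions suppressed. The function name is illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fmsubadd_round_ph_usage_sketch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c)
}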
7551
7552/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7553/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7554/// (the element is copied from a when the corresponding mask bit is not set).
7555///
7556/// Rounding is done according to the rounding parameter, which can be one of:
7557///
7558/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7559/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7560/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7561/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7562/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7563///
7564/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7565#[inline]
7566#[target_feature(enable = "avx512fp16")]
7567#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7568#[rustc_legacy_const_generics(4)]
7569#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7570pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7571    a: __m512h,
7572    k: __mmask32,
7573    b: __m512h,
7574    c: __m512h,
7575) -> __m512h {
7576    unsafe {
7577        static_assert_rounding!(ROUNDING);
7578        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7579    }
7580}
7581
7582/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7583/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
7584/// (the element is copied from c when the corresponding mask bit is not set).
7585///
7586/// Rounding is done according to the rounding parameter, which can be one of:
7587///
7588/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7589/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7590/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7591/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7592/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7593///
7594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7595#[inline]
7596#[target_feature(enable = "avx512fp16")]
7597#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7598#[rustc_legacy_const_generics(4)]
7599#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7600pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7601    a: __m512h,
7602    b: __m512h,
7603    c: __m512h,
7604    k: __mmask32,
7605) -> __m512h {
7606    unsafe {
7607        static_assert_rounding!(ROUNDING);
7608        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7609    }
7610}
7611
7612/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7613/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
7614/// (the element is zeroed out when the corresponding mask bit is not set).
7615///
7616/// Rounding is done according to the rounding parameter, which can be one of:
7617///
7618/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7619/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7620/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7621/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7622/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7623///
7624/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7625#[inline]
7626#[target_feature(enable = "avx512fp16")]
7627#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7628#[rustc_legacy_const_generics(4)]
7629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7630pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7631    k: __mmask32,
7632    a: __m512h,
7633    b: __m512h,
7634    c: __m512h,
7635) -> __m512h {
7636    unsafe {
7637        static_assert_rounding!(ROUNDING);
7638        simd_select_bitmask(
7639            k,
7640            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7641            _mm512_setzero_ph(),
7642        )
7643    }
7644}
7645
7646/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7647/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7648///
7649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
7650#[inline]
7651#[target_feature(enable = "avx512fp16,avx512vl")]
7652#[cfg_attr(test, assert_instr(vrcpph))]
7653#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7654pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7655    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7656}
7657
7658/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7659/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
7660/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7661///
7662/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
7663#[inline]
7664#[target_feature(enable = "avx512fp16,avx512vl")]
7665#[cfg_attr(test, assert_instr(vrcpph))]
7666#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7667pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7668    unsafe { vrcpph_128(a, src, k) }
7669}
7670
7671/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7672/// using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7673/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7674///
7675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
7676#[inline]
7677#[target_feature(enable = "avx512fp16,avx512vl")]
7678#[cfg_attr(test, assert_instr(vrcpph))]
7679#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7680pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7681    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7682}
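
// Editorial usage sketch (not part of the upstream crate): approximates 1/x per lane, keeping
// only the low four lanes; the upper four are zeroed by the zeromask. Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn rcp_ph_usage_sketch(x: __m128h) -> __m128h {
    // Each selected lane holds an approximation of 1/x with relative error below 1.5*2^-12.
    _mm_maskz_rcp_ph(0b0000_1111, x)
}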
7683
7684/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7685/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7686///
7687/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
7688#[inline]
7689#[target_feature(enable = "avx512fp16,avx512vl")]
7690#[cfg_attr(test, assert_instr(vrcpph))]
7691#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7692pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
7693    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
7694}
7695
7696/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7697/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
7698/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7699///
7700/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
7701#[inline]
7702#[target_feature(enable = "avx512fp16,avx512vl")]
7703#[cfg_attr(test, assert_instr(vrcpph))]
7704#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7705pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7706    unsafe { vrcpph_256(a, src, k) }
7707}
7708
7709/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7710/// using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7711/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7712///
7713/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
7714#[inline]
7715#[target_feature(enable = "avx512fp16,avx512vl")]
7716#[cfg_attr(test, assert_instr(vrcpph))]
7717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7718pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
7719    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
7720}
7721
7722/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7723/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7724///
7725/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
7726#[inline]
7727#[target_feature(enable = "avx512fp16")]
7728#[cfg_attr(test, assert_instr(vrcpph))]
7729#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7730pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
7731    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
7732}
7733
7734/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7735/// using writemask k (elements are copied from src when the corresponding mask bit is not set).
7736/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7737///
7738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
7739#[inline]
7740#[target_feature(enable = "avx512fp16")]
7741#[cfg_attr(test, assert_instr(vrcpph))]
7742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7743pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7744    unsafe { vrcpph_512(a, src, k) }
7745}
7746
7747/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst
7748/// using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7749/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7750///
7751/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
7752#[inline]
7753#[target_feature(enable = "avx512fp16")]
7754#[cfg_attr(test, assert_instr(vrcpph))]
7755#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7756pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
7757    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
7758}
7759
7760/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7761/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
7762/// upper elements of dst.
7763/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7764///
7765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
7766#[inline]
7767#[target_feature(enable = "avx512fp16")]
7768#[cfg_attr(test, assert_instr(vrcpsh))]
7769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7770pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
7771    _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b)
7772}
7773
7774/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7775/// store the result in the lower element of dst using writemask k (the element is copied from src when
7776/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7777/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7778///
7779/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
7780#[inline]
7781#[target_feature(enable = "avx512fp16")]
7782#[cfg_attr(test, assert_instr(vrcpsh))]
7783#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7784pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7785    unsafe { vrcpsh(a, b, src, k) }
7786}
7787
7788/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7789/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
7790/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7791/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7792///
7793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
7794#[inline]
7795#[target_feature(enable = "avx512fp16")]
7796#[cfg_attr(test, assert_instr(vrcpsh))]
7797#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7798pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7799    _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b)
7800}
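
// Editorial usage sketch (not part of the upstream crate): approximates 1/x in the lowest lane
// only, while lanes 1..=7 of the result are carried over from a. Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn rcp_sh_usage_sketch() -> __m128h {
    let a = _mm_set1_ph(1.0); // supplies the upper seven lanes of the result
    let b = _mm_set_sh(4.0); // only the lowest lane of b is read
    // Lane 0 approximates 1.0 / 4.0 = 0.25; lanes 1..=7 are copied from a.
    _mm_rcp_sh(a, b)
}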
7801
7802/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7803/// elements in a, and store the results in dst.
7804/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7805///
7806/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
7807#[inline]
7808#[target_feature(enable = "avx512fp16,avx512vl")]
7809#[cfg_attr(test, assert_instr(vrsqrtph))]
7810#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7811pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
7812    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
7813}
7814
7815/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7816/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7817/// the corresponding mask bit is not set).
7818/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7819///
7820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
7821#[inline]
7822#[target_feature(enable = "avx512fp16,avx512vl")]
7823#[cfg_attr(test, assert_instr(vrsqrtph))]
7824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7825pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7826    unsafe { vrsqrtph_128(a, src, k) }
7827}
7828
7829/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7830/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7831/// corresponding mask bit is not set).
7832/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7833///
7834/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
7835#[inline]
7836#[target_feature(enable = "avx512fp16,avx512vl")]
7837#[cfg_attr(test, assert_instr(vrsqrtph))]
7838#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7839pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
7840    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
7841}
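
// Editorial usage sketch (not part of the upstream crate): merges an approximate 1/sqrt(x)
// into src for the lanes selected by the writemask; unselected lanes keep the value from src.
// Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn rsqrt_ph_usage_sketch(src: __m128h, x: __m128h) -> __m128h {
    _mm_mask_rsqrt_ph(src, 0b1010_1010, x)
}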
7842
7843/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7844/// elements in a, and store the results in dst.
7845/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7846///
7847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
7848#[inline]
7849#[target_feature(enable = "avx512fp16,avx512vl")]
7850#[cfg_attr(test, assert_instr(vrsqrtph))]
7851#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7852pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
7853    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
7854}
7855
7856/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7857/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7858/// the corresponding mask bit is not set).
7859/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7860///
7861/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
7862#[inline]
7863#[target_feature(enable = "avx512fp16,avx512vl")]
7864#[cfg_attr(test, assert_instr(vrsqrtph))]
7865#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7866pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7867    unsafe { vrsqrtph_256(a, src, k) }
7868}
7869
7870/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7871/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7872/// corresponding mask bit is not set).
7873/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7874///
7875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
7876#[inline]
7877#[target_feature(enable = "avx512fp16,avx512vl")]
7878#[cfg_attr(test, assert_instr(vrsqrtph))]
7879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7880pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
7881    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
7882}
7883
7884/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7885/// elements in a, and store the results in dst.
7886/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7887///
7888/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
7889#[inline]
7890#[target_feature(enable = "avx512fp16")]
7891#[cfg_attr(test, assert_instr(vrsqrtph))]
7892#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7893pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
7894    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
7895}
7896
7897/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7898/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7899/// the corresponding mask bit is not set).
7900/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7901///
7902/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
7903#[inline]
7904#[target_feature(enable = "avx512fp16")]
7905#[cfg_attr(test, assert_instr(vrsqrtph))]
7906#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7907pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7908    unsafe { vrsqrtph_512(a, src, k) }
7909}
7910
7911/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7912/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7913/// corresponding mask bit is not set).
7914/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7915///
7916/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
7917#[inline]
7918#[target_feature(enable = "avx512fp16")]
7919#[cfg_attr(test, assert_instr(vrsqrtph))]
7920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7921pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
7922    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
7923}
7924
7925/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7926/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
7927/// to the upper elements of dst.
7928/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7929///
7930/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
7931#[inline]
7932#[target_feature(enable = "avx512fp16")]
7933#[cfg_attr(test, assert_instr(vrsqrtsh))]
7934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7935pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
7936    _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b)
7937}
7938
7939/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7940/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
7941/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7942/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7943///
7944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
7945#[inline]
7946#[target_feature(enable = "avx512fp16")]
7947#[cfg_attr(test, assert_instr(vrsqrtsh))]
7948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7949pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7950    unsafe { vrsqrtsh(a, b, src, k) }
7951}
7952
7953/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
7954/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
7955/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7956/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7957///
7958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
7959#[inline]
7960#[target_feature(enable = "avx512fp16")]
7961#[cfg_attr(test, assert_instr(vrsqrtsh))]
7962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7963pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7964    _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b)
7965}
7966
7967/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7968/// results in dst.
7969///
7970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
7971#[inline]
7972#[target_feature(enable = "avx512fp16,avx512vl")]
7973#[cfg_attr(test, assert_instr(vsqrtph))]
7974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7975pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
7976    unsafe { simd_fsqrt(a) }
7977}
7978
7979/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7980/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7981///
7982/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
7983#[inline]
7984#[target_feature(enable = "avx512fp16,avx512vl")]
7985#[cfg_attr(test, assert_instr(vsqrtph))]
7986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7987pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7988    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
7989}
7990
7991/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
7992/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7993///
7994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
7995#[inline]
7996#[target_feature(enable = "avx512fp16,avx512vl")]
7997#[cfg_attr(test, assert_instr(vsqrtph))]
7998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7999pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8000    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8001}
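
// Editorial usage sketch (not part of the upstream crate): a correctly rounded per-lane square
// root (unlike the rcp/rsqrt approximations above), with unselected lanes zeroed by the mask.
// Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn sqrt_ph_usage_sketch() -> __m128h {
    let x = _mm_set1_ph(9.0);
    // Lanes 0..=3 hold 3.0; lanes 4..=7 are zeroed.
    _mm_maskz_sqrt_ph(0b0000_1111, x)
}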
8002
8003/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8004/// results in dst.
8005///
8006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
8007#[inline]
8008#[target_feature(enable = "avx512fp16,avx512vl")]
8009#[cfg_attr(test, assert_instr(vsqrtph))]
8010#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8011pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
8012    unsafe { simd_fsqrt(a) }
8013}
8014
8015/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8016/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8017///
8018/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8019#[inline]
8020#[target_feature(enable = "avx512fp16,avx512vl")]
8021#[cfg_attr(test, assert_instr(vsqrtph))]
8022#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8023pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8024    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8025}
8026
8027/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8028/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8029///
8030/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8031#[inline]
8032#[target_feature(enable = "avx512fp16,avx512vl")]
8033#[cfg_attr(test, assert_instr(vsqrtph))]
8034#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8035pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8036    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8037}
8038
8039/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8040/// results in dst.
8041///
8042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
8043#[inline]
8044#[target_feature(enable = "avx512fp16")]
8045#[cfg_attr(test, assert_instr(vsqrtph))]
8046#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8047pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
8048    unsafe { simd_fsqrt(a) }
8049}
8050
8051/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8052/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8053///
8054/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8055#[inline]
8056#[target_feature(enable = "avx512fp16")]
8057#[cfg_attr(test, assert_instr(vsqrtph))]
8058#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8059pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8060    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8061}
8062
8063/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8064/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8065///
8066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8067#[inline]
8068#[target_feature(enable = "avx512fp16")]
8069#[cfg_attr(test, assert_instr(vsqrtph))]
8070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8071pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8072    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8073}
8074
8075/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8076/// results in dst.
8077/// Rounding is done according to the rounding parameter, which can be one of:
8078///
8079/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8080/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8081/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8082/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8083/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8084///
8085/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
8086#[inline]
8087#[target_feature(enable = "avx512fp16")]
8088#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8089#[rustc_legacy_const_generics(1)]
8090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8091pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8092    unsafe {
8093        static_assert_rounding!(ROUNDING);
8094        vsqrtph_512(a, ROUNDING)
8095    }
8096}
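
// Editorial usage sketch (not part of the upstream crate): selects round-toward-zero with
// exceptions suppressed through the ROUNDING const generic. The function name is illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn sqrt_round_ph_usage_sketch(x: __m512h) -> __m512h {
    _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(x)
}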
8097
8098/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8099/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8100/// Rounding is done according to the rounding parameter, which can be one of:
8101///
8102/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8103/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8104/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8105/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8106/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8107///
8108/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8109#[inline]
8110#[target_feature(enable = "avx512fp16")]
8111#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8112#[rustc_legacy_const_generics(3)]
8113#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8114pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8115    src: __m512h,
8116    k: __mmask32,
8117    a: __m512h,
8118) -> __m512h {
8119    unsafe {
8120        static_assert_rounding!(ROUNDING);
8121        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8122    }
8123}
8124
8125/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8126/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8127/// Rounding is done according to the rounding parameter, which can be one of:
8128///
8129/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8130/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8131/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8132/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8133/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8134///
8135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8136#[inline]
8137#[target_feature(enable = "avx512fp16")]
8138#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8139#[rustc_legacy_const_generics(2)]
8140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8141pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8142    unsafe {
8143        static_assert_rounding!(ROUNDING);
8144        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8145    }
8146}
8147
8148/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8149/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8150/// elements of dst.
8151///
8152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
8153#[inline]
8154#[target_feature(enable = "avx512fp16")]
8155#[cfg_attr(test, assert_instr(vsqrtsh))]
8156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8157pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8158    _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b)
8159}
8160
8161/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8162/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8163/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8164///
8165/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
8166#[inline]
8167#[target_feature(enable = "avx512fp16")]
8168#[cfg_attr(test, assert_instr(vsqrtsh))]
8169#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8170pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8171    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8172}
8173
8174/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8175/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8176/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8177///
8178/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8179#[inline]
8180#[target_feature(enable = "avx512fp16")]
8181#[cfg_attr(test, assert_instr(vsqrtsh))]
8182#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8183pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8184    _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b)
8185}
8186
8187/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8188/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8189/// elements of dst.
8190/// Rounding is done according to the rounding parameter, which can be one of:
8191///
8192/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8193/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8194/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8195/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8196/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8197///
8198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
8199#[inline]
8200#[target_feature(enable = "avx512fp16")]
8201#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8202#[rustc_legacy_const_generics(2)]
8203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8204pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8205    static_assert_rounding!(ROUNDING);
8206    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
8207}
8208
8209/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8210/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8211/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8212/// Rounding is done according to the rounding parameter, which can be one of:
8213///
8214/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8215/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8216/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8217/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8218/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8219///
8220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8221#[inline]
8222#[target_feature(enable = "avx512fp16")]
8223#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8224#[rustc_legacy_const_generics(4)]
8225#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8226pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8227    src: __m128h,
8228    k: __mmask8,
8229    a: __m128h,
8230    b: __m128h,
8231) -> __m128h {
8232    unsafe {
8233        static_assert_rounding!(ROUNDING);
8234        vsqrtsh(a, b, src, k, ROUNDING)
8235    }
8236}
8237
8238/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8239/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8240/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8241/// Rounding is done according to the rounding parameter, which can be one of:
8242///
8243/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8244/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8245/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8246/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8247/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8248///
8249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8250#[inline]
8251#[target_feature(enable = "avx512fp16")]
8252#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8253#[rustc_legacy_const_generics(3)]
8254#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8255pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8256    k: __mmask8,
8257    a: __m128h,
8258    b: __m128h,
8259) -> __m128h {
8260    static_assert_rounding!(ROUNDING);
8261    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
8262}
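
// Editorial usage sketch (not part of the upstream crate): square root of the lowest lane of b
// under an explicit rounding mode; mask bit 0 decides whether that lane is kept or zeroed, and
// lanes 1..=7 come from a. Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn sqrt_round_sh_usage_sketch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(k, a, b)
}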
8263
8264/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8265/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8266/// value when inputs are NaN or signed-zero values.
8267///
8268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
8269#[inline]
8270#[target_feature(enable = "avx512fp16,avx512vl")]
8271#[cfg_attr(test, assert_instr(vmaxph))]
8272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8273pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
8274    unsafe { vmaxph_128(a, b) }
8275}
8276
8277/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8278/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8279/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8280/// NaN or signed-zero values.
8281///
8282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8283#[inline]
8284#[target_feature(enable = "avx512fp16,avx512vl")]
8285#[cfg_attr(test, assert_instr(vmaxph))]
8286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8287pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8288    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8289}
8290
8291/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8292/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8293/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8294/// NaN or signed-zero values.
8295///
8296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8297#[inline]
8298#[target_feature(enable = "avx512fp16,avx512vl")]
8299#[cfg_attr(test, assert_instr(vmaxph))]
8300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8301pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8302    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8303}
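
// Editorial usage sketch (not part of the upstream crate): per-lane maximum. As the doc
// comments above note, this does not follow IEEE 754 maximum semantics: when a source lane is
// NaN, or the lanes are equal-valued signed zeros, the lane from the second operand b is
// returned. Names are illustrative.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn max_ph_usage_sketch(a: __m128h, b: __m128h) -> __m128h {
    _mm_max_ph(a, b)
}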
8304
8305/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8306/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8307/// value when inputs are NaN or signed-zero values.
8308///
8309/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
8310#[inline]
8311#[target_feature(enable = "avx512fp16,avx512vl")]
8312#[cfg_attr(test, assert_instr(vmaxph))]
8313#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8314pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
8315    unsafe { vmaxph_256(a, b) }
8316}
8317
8318/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8319/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8320/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8321/// NaN or signed-zero values.
8322///
8323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8324#[inline]
8325#[target_feature(enable = "avx512fp16,avx512vl")]
8326#[cfg_attr(test, assert_instr(vmaxph))]
8327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8328pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8329    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8330}
8331
8332/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8333/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8334/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8335/// NaN or signed-zero values.
8336///
8337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8338#[inline]
8339#[target_feature(enable = "avx512fp16,avx512vl")]
8340#[cfg_attr(test, assert_instr(vmaxph))]
8341#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8342pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8343    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8344}
8345
8346/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8347/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8348/// value when inputs are NaN or signed-zero values.
8349///
8350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
8351#[inline]
8352#[target_feature(enable = "avx512fp16")]
8353#[cfg_attr(test, assert_instr(vmaxph))]
8354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8355pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
8356    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8357}
8358
8359/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8360/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8361/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8362/// NaN or signed-zero values.
8363///
8364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8365#[inline]
8366#[target_feature(enable = "avx512fp16")]
8367#[cfg_attr(test, assert_instr(vmaxph))]
8368#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8369pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8370    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8371}
8372
8373/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8374/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8375/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8376/// NaN or signed-zero values.
8377///
8378/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8379#[inline]
8380#[target_feature(enable = "avx512fp16")]
8381#[cfg_attr(test, assert_instr(vmaxph))]
8382#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8383pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8384    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8385}
8386
8387/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8388/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8389/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8390/// NaN or signed-zero values.
8391///
8392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
8393#[inline]
8394#[target_feature(enable = "avx512fp16")]
8395#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8396#[rustc_legacy_const_generics(2)]
8397#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8398pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8399    unsafe {
8400        static_assert_sae!(SAE);
8401        vmaxph_512(a, b, SAE)
8402    }
8403}
8404
8405/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8406/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8407/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8408/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8409///
8410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8411#[inline]
8412#[target_feature(enable = "avx512fp16")]
8413#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8414#[rustc_legacy_const_generics(4)]
8415#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8416pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8417    src: __m512h,
8418    k: __mmask32,
8419    a: __m512h,
8420    b: __m512h,
8421) -> __m512h {
8422    unsafe {
8423        static_assert_sae!(SAE);
8424        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8425    }
8426}
8427
8428/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8429/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8430/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8431/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8432///
8433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8434#[inline]
8435#[target_feature(enable = "avx512fp16")]
8436#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8437#[rustc_legacy_const_generics(3)]
8438#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8439pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8440    unsafe {
8441        static_assert_sae!(SAE);
8442        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8443    }
8444}
8445
8446/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8447/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8448/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8449/// when inputs are NaN or signed-zero values.
8450///
8451/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
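///
/// # Examples
///
/// An illustrative sketch, not a compiled doctest: it assumes a target with the
/// `avx512fp16` feature and the unstable `stdarch_x86_avx512_f16` API, and uses the
/// `_mm_set_ph`/`_mm_set_sh` constructors from this module.
///
/// ```ignore
/// let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, -1.0);
/// let b = _mm_set_sh(0.5);
/// // Lane 0 of `r` is max(-1.0, 0.5) = 0.5; lanes 1..=7 are copied from `a`.
/// let r = _mm_max_sh(a, b);
/// ```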
8452#[inline]
8453#[target_feature(enable = "avx512fp16,avx512vl")]
8454#[cfg_attr(test, assert_instr(vmaxsh))]
8455#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8456pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8457    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8458}
8459
8460/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8461/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8462/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8463/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8464///
8465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
8466#[inline]
8467#[target_feature(enable = "avx512fp16,avx512vl")]
8468#[cfg_attr(test, assert_instr(vmaxsh))]
8469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8470pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8471    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8472}
8473
8474/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8475/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8476/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8477/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8478///
8479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8480#[inline]
8481#[target_feature(enable = "avx512fp16,avx512vl")]
8482#[cfg_attr(test, assert_instr(vmaxsh))]
8483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8484pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8485    _mm_mask_max_sh(_mm_setzero_ph(), k, a, b)
8486}
8487
8488/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8489/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8490/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8491/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8492///
8493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8494#[inline]
8495#[target_feature(enable = "avx512fp16,avx512vl")]
8496#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8497#[rustc_legacy_const_generics(2)]
8498#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8499pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8500    static_assert_sae!(SAE);
8501    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8502}
8503
8504/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8505/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8506/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8507/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8508/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8509///
8510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
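///
/// # Examples
///
/// A sketch of the writemask behaviour, not a compiled doctest: it assumes a target
/// with the `avx512fp16` feature and the unstable `stdarch_x86_avx512_f16` API.
///
/// ```ignore
/// let src = _mm_set1_ph(9.0);
/// let a = _mm_set_sh(1.0);
/// let b = _mm_set_sh(2.0);
/// // Mask bit 0 is clear, so lane 0 of `r` is taken from `src` (9.0);
/// // lanes 1..=7 are still copied from `a`.
/// let r = _mm_mask_max_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
/// ```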
8511#[inline]
8512#[target_feature(enable = "avx512fp16,avx512vl")]
8513#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8514#[rustc_legacy_const_generics(4)]
8515#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8516pub fn _mm_mask_max_round_sh<const SAE: i32>(
8517    src: __m128h,
8518    k: __mmask8,
8519    a: __m128h,
8520    b: __m128h,
8521) -> __m128h {
8522    unsafe {
8523        static_assert_sae!(SAE);
8524        vmaxsh(a, b, src, k, SAE)
8525    }
8526}
8527
8528/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8529/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8530/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8531/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8532/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8533///
8534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
8535#[inline]
8536#[target_feature(enable = "avx512fp16,avx512vl")]
8537#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8538#[rustc_legacy_const_generics(3)]
8539#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8540pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8541    static_assert_sae!(SAE);
8542    _mm_mask_max_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
8543}
8544
8545/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8546/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8547/// when inputs are NaN or signed-zero values.
8548///
8549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
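///
/// # Examples
///
/// A small sketch, not a compiled doctest: it assumes a target with the `avx512fp16`
/// and `avx512vl` features and the unstable `stdarch_x86_avx512_f16` API.
///
/// ```ignore
/// let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
/// let b = _mm_set1_ph(4.5);
/// // Element-wise minimum, lowest lane first:
/// // r = [1.0, 2.0, 3.0, 4.0, 4.5, 4.5, 4.5, 4.5]
/// let r = _mm_min_ph(a, b);
/// ```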
8550#[inline]
8551#[target_feature(enable = "avx512fp16,avx512vl")]
8552#[cfg_attr(test, assert_instr(vminph))]
8553#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8554pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
8555    unsafe { vminph_128(a, b) }
8556}
8557
8558/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8559/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8560/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8561/// NaN or signed-zero values.
8562///
8563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8564#[inline]
8565#[target_feature(enable = "avx512fp16,avx512vl")]
8566#[cfg_attr(test, assert_instr(vminph))]
8567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8568pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8569    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8570}
8571
8572/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8573/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8574/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8575/// NaN or signed-zero values.
8576///
8577/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
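///
/// # Examples
///
/// A sketch of the zeromask behaviour, not a compiled doctest: it assumes a target
/// with the `avx512fp16` and `avx512vl` features and the unstable
/// `stdarch_x86_avx512_f16` API.
///
/// ```ignore
/// let a = _mm_set1_ph(3.0);
/// let b = _mm_set1_ph(1.0);
/// // Only the even lanes (mask bits 0, 2, 4, 6) are computed; the odd lanes are zeroed:
/// // r = [1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]
/// let r = _mm_maskz_min_ph(0b0101_0101, a, b);
/// ```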
8578#[inline]
8579#[target_feature(enable = "avx512fp16,avx512vl")]
8580#[cfg_attr(test, assert_instr(vminph))]
8581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8582pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8583    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8584}
8585
8586/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8587/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8588/// when inputs are NaN or signed-zero values.
8589///
8590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
8591#[inline]
8592#[target_feature(enable = "avx512fp16,avx512vl")]
8593#[cfg_attr(test, assert_instr(vminph))]
8594#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8595pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
8596    unsafe { vminph_256(a, b) }
8597}
8598
8599/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8600/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8601/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8602/// NaN or signed-zero values.
8603///
8604/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8605#[inline]
8606#[target_feature(enable = "avx512fp16,avx512vl")]
8607#[cfg_attr(test, assert_instr(vminph))]
8608#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8609pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8610    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8611}
8612
8613/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8614/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8615/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8616/// NaN or signed-zero values.
8617///
8618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8619#[inline]
8620#[target_feature(enable = "avx512fp16,avx512vl")]
8621#[cfg_attr(test, assert_instr(vminph))]
8622#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8623pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8624    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8625}
8626
8627/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8628/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8629/// when inputs are NaN or signed-zero values.
8630///
8631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8632#[inline]
8633#[target_feature(enable = "avx512fp16")]
8634#[cfg_attr(test, assert_instr(vminph))]
8635#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8636pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8637    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8638}
8639
8640/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8641/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8642/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8643/// NaN or signed-zero values.
8644///
8645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8646#[inline]
8647#[target_feature(enable = "avx512fp16")]
8648#[cfg_attr(test, assert_instr(vminph))]
8649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8650pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8651    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8652}
8653
8654/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8655/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8656/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8657/// NaN or signed-zero values.
8658///
8659/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8660#[inline]
8661#[target_feature(enable = "avx512fp16")]
8662#[cfg_attr(test, assert_instr(vminph))]
8663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8664pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8665    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8666}
8667
8668/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8669/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8670/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8671///
8672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8673#[inline]
8674#[target_feature(enable = "avx512fp16")]
8675#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8676#[rustc_legacy_const_generics(2)]
8677#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8678pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8679    unsafe {
8680        static_assert_sae!(SAE);
8681        vminph_512(a, b, SAE)
8682    }
8683}
8684
8685/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8686/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8687/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8688/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8689///
8690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8691#[inline]
8692#[target_feature(enable = "avx512fp16")]
8693#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8694#[rustc_legacy_const_generics(4)]
8695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8696pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8697    src: __m512h,
8698    k: __mmask32,
8699    a: __m512h,
8700    b: __m512h,
8701) -> __m512h {
8702    unsafe {
8703        static_assert_sae!(SAE);
8704        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8705    }
8706}
8707
8708/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8709/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8710/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8711/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8712///
8713/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
8714#[inline]
8715#[target_feature(enable = "avx512fp16")]
8716#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8717#[rustc_legacy_const_generics(3)]
8718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8719pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8720    unsafe {
8721        static_assert_sae!(SAE);
8722        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8723    }
8724}
8725
8726/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8727/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8728/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8729/// inputs are NaN or signed-zero values.
8730///
8731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8732#[inline]
8733#[target_feature(enable = "avx512fp16,avx512vl")]
8734#[cfg_attr(test, assert_instr(vminsh))]
8735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8736pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8737    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8738}
8739
8740/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8741/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8742/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8743/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8744///
8745/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8746#[inline]
8747#[target_feature(enable = "avx512fp16,avx512vl")]
8748#[cfg_attr(test, assert_instr(vminsh))]
8749#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8750pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8751    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8752}
8753
8754/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8755/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8756/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8757/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8758///
8759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
8760#[inline]
8761#[target_feature(enable = "avx512fp16,avx512vl")]
8762#[cfg_attr(test, assert_instr(vminsh))]
8763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8764pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8765    _mm_mask_min_sh(_mm_setzero_ph(), k, a, b)
8766}
8767
8768/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8769/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8770/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8771/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8772///
8773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
8774#[inline]
8775#[target_feature(enable = "avx512fp16,avx512vl")]
8776#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8777#[rustc_legacy_const_generics(2)]
8778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8779pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8780    static_assert_sae!(SAE);
8781    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8782}
8783
8784/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8785/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8786/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8787/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8788/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8789///
8790/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
8791#[inline]
8792#[target_feature(enable = "avx512fp16,avx512vl")]
8793#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8794#[rustc_legacy_const_generics(4)]
8795#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8796pub fn _mm_mask_min_round_sh<const SAE: i32>(
8797    src: __m128h,
8798    k: __mmask8,
8799    a: __m128h,
8800    b: __m128h,
8801) -> __m128h {
8802    unsafe {
8803        static_assert_sae!(SAE);
8804        vminsh(a, b, src, k, SAE)
8805    }
8806}
8807
8808/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8809/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8810/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8811/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8812/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8813///
8814/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
8815#[inline]
8816#[target_feature(enable = "avx512fp16,avx512vl")]
8817#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8818#[rustc_legacy_const_generics(3)]
8819#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8820pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8821    static_assert_sae!(SAE);
8822    _mm_mask_min_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
8823}
8824
8825/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8826/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8827/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8828///
8829/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
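///
/// # Examples
///
/// A sketch of the `floor(log2(x))` behaviour, not a compiled doctest: it assumes a
/// target with the `avx512fp16` and `avx512vl` features and the unstable
/// `stdarch_x86_avx512_f16` API.
///
/// ```ignore
/// let a = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 0.25, 3.0, 8.0, 1.5);
/// // Lane 0: floor(log2(1.5)) = 0.0, lane 1: floor(log2(8.0)) = 3.0,
/// // lane 2: floor(log2(3.0)) = 1.0, lane 3: floor(log2(0.25)) = -2.0.
/// let exps = _mm_getexp_ph(a);
/// ```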
8830#[inline]
8831#[target_feature(enable = "avx512fp16,avx512vl")]
8832#[cfg_attr(test, assert_instr(vgetexpph))]
8833#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8834pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
8835    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
8836}
8837
8838/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8839/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8840/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8841/// `floor(log2(x))` for each element.
8842///
8843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
8844#[inline]
8845#[target_feature(enable = "avx512fp16,avx512vl")]
8846#[cfg_attr(test, assert_instr(vgetexpph))]
8847#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8848pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8849    unsafe { vgetexpph_128(a, src, k) }
8850}
8851
8852/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8853/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8854/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8855/// `floor(log2(x))` for each element.
8856///
8857/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
8858#[inline]
8859#[target_feature(enable = "avx512fp16,avx512vl")]
8860#[cfg_attr(test, assert_instr(vgetexpph))]
8861#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8862pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
8863    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
8864}
8865
8866/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8867/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8868/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8869///
8870/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
8871#[inline]
8872#[target_feature(enable = "avx512fp16,avx512vl")]
8873#[cfg_attr(test, assert_instr(vgetexpph))]
8874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8875pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
8876    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
8877}
8878
8879/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8880/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8881/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8882/// `floor(log2(x))` for each element.
8883///
8884/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
8885#[inline]
8886#[target_feature(enable = "avx512fp16,avx512vl")]
8887#[cfg_attr(test, assert_instr(vgetexpph))]
8888#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8889pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8890    unsafe { vgetexpph_256(a, src, k) }
8891}
8892
8893/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8894/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8895/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8896/// `floor(log2(x))` for each element.
8897///
8898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
8899#[inline]
8900#[target_feature(enable = "avx512fp16,avx512vl")]
8901#[cfg_attr(test, assert_instr(vgetexpph))]
8902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8903pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
8904    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
8905}
8906
8907/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8908/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8909/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8910///
8911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
8912#[inline]
8913#[target_feature(enable = "avx512fp16")]
8914#[cfg_attr(test, assert_instr(vgetexpph))]
8915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8916pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
8917    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
8918}
8919
8920/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8921/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8922/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8923/// `floor(log2(x))` for each element.
8924///
8925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
8926#[inline]
8927#[target_feature(enable = "avx512fp16")]
8928#[cfg_attr(test, assert_instr(vgetexpph))]
8929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8930pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8931    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
8932}
8933
8934/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8935/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8936/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8937/// `floor(log2(x))` for each element.
8938///
8939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
8940#[inline]
8941#[target_feature(enable = "avx512fp16")]
8942#[cfg_attr(test, assert_instr(vgetexpph))]
8943#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8944pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
8945    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
8946}
8947
8948/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8949/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8950/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
/// by passing _MM_FROUND_NO_EXC in the sae parameter.
8952///
8953/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
8954#[inline]
8955#[target_feature(enable = "avx512fp16")]
8956#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8957#[rustc_legacy_const_generics(1)]
8958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8959pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
8960    static_assert_sae!(SAE);
8961    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
8962}
8963
8964/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8965/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8966/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8968///
8969/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
8970#[inline]
8971#[target_feature(enable = "avx512fp16")]
8972#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8973#[rustc_legacy_const_generics(3)]
8974#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8975pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
8976    src: __m512h,
8977    k: __mmask32,
8978    a: __m512h,
8979) -> __m512h {
8980    unsafe {
8981        static_assert_sae!(SAE);
8982        vgetexpph_512(a, src, k, SAE)
8983    }
8984}
8985
8986/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8987/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8988/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8990///
8991/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
8992#[inline]
8993#[target_feature(enable = "avx512fp16")]
8994#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
8995#[rustc_legacy_const_generics(2)]
8996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8997pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
8998    static_assert_sae!(SAE);
8999    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9000}
9001
9002/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9003/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9004/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9005/// calculates `floor(log2(x))` for the lower element.
9006///
9007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
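///
/// # Examples
///
/// An illustrative sketch, not a compiled doctest: it assumes a target with the
/// `avx512fp16` feature and the unstable `stdarch_x86_avx512_f16` API.
///
/// ```ignore
/// let a = _mm_set1_ph(5.0);
/// let b = _mm_set_sh(12.0);
/// // Lane 0 of `r` is floor(log2(12.0)) = 3.0; lanes 1..=7 are copied from `a` (5.0).
/// let r = _mm_getexp_sh(a, b);
/// ```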
9008#[inline]
9009#[target_feature(enable = "avx512fp16")]
9010#[cfg_attr(test, assert_instr(vgetexpsh))]
9011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9012pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9013    _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b)
9014}
9015
9016/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9017/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9018/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9019/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9020/// for the lower element.
9021///
9022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9023#[inline]
9024#[target_feature(enable = "avx512fp16")]
9025#[cfg_attr(test, assert_instr(vgetexpsh))]
9026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9027pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9028    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9029}
9030
9031/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9032/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9033/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9034/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9035/// lower element.
9036///
9037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9038#[inline]
9039#[target_feature(enable = "avx512fp16")]
9040#[cfg_attr(test, assert_instr(vgetexpsh))]
9041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9042pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9043    _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b)
9044}
9045
9046/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9047/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9048/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9049/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter.
9051///
9052/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9053#[inline]
9054#[target_feature(enable = "avx512fp16")]
9055#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9056#[rustc_legacy_const_generics(2)]
9057#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9058pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9059    static_assert_sae!(SAE);
9060    _mm_mask_getexp_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
9061}
9062
9063/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9064/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9065/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9066/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9068///
9069/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9070#[inline]
9071#[target_feature(enable = "avx512fp16")]
9072#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9073#[rustc_legacy_const_generics(4)]
9074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9075pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9076    src: __m128h,
9077    k: __mmask8,
9078    a: __m128h,
9079    b: __m128h,
9080) -> __m128h {
9081    unsafe {
9082        static_assert_sae!(SAE);
9083        vgetexpsh(a, b, src, k, SAE)
9084    }
9085}
9086
9087/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9088/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9089/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9090/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9092///
9093/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9094#[inline]
9095#[target_feature(enable = "avx512fp16")]
9096#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9097#[rustc_legacy_const_generics(3)]
9098#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9099pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9100    static_assert_sae!(SAE);
9101    _mm_mask_getexp_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
9102}
9103
9104/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9105/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9106/// on the interval range defined by norm and the sign depends on sign and the source sign.
9107///
9108/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9109///
9110///     _MM_MANT_NORM_1_2     // interval [1, 2)
9111///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9112///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9113///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9114///
9115/// The sign is determined by sc which can take the following values:
9116///
9117///     _MM_MANT_SIGN_src     // sign = sign(src)
9118///     _MM_MANT_SIGN_zero    // sign = 0
9119///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9120///
9121/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
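///
/// # Examples
///
/// A sketch of how the two const generics select the normalization interval and the
/// sign, not a compiled doctest: it assumes a target with the `avx512fp16` and
/// `avx512vl` features, the unstable `stdarch_x86_avx512_f16` API, and the constant
/// names listed above.
///
/// ```ignore
/// let a = _mm_set1_ph(-12.0);
/// // -12.0 = -1.5 * 2^3; normalizing into [1, 2) and forcing the sign to zero
/// // makes every lane of `r` equal to 1.5.
/// let r = _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero>(a);
/// ```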
9122#[inline]
9123#[target_feature(enable = "avx512fp16,avx512vl")]
9124#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9125#[rustc_legacy_const_generics(1, 2)]
9126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9127pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9128    a: __m128h,
9129) -> __m128h {
9130    static_assert_uimm_bits!(NORM, 4);
9131    static_assert_uimm_bits!(SIGN, 2);
9132    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9133}
9134
9135/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9136/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9137/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9138/// by norm and the sign depends on sign and the source sign.
9139///
9140/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9141///
9142///     _MM_MANT_NORM_1_2     // interval [1, 2)
9143///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9144///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9145///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9146///
9147/// The sign is determined by sc which can take the following values:
9148///
9149///     _MM_MANT_SIGN_src     // sign = sign(src)
9150///     _MM_MANT_SIGN_zero    // sign = 0
9151///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9152///
9153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9154#[inline]
9155#[target_feature(enable = "avx512fp16,avx512vl")]
9156#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9157#[rustc_legacy_const_generics(3, 4)]
9158#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9159pub fn _mm_mask_getmant_ph<
9160    const NORM: _MM_MANTISSA_NORM_ENUM,
9161    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9162>(
9163    src: __m128h,
9164    k: __mmask8,
9165    a: __m128h,
9166) -> __m128h {
9167    unsafe {
9168        static_assert_uimm_bits!(NORM, 4);
9169        static_assert_uimm_bits!(SIGN, 2);
9170        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9171    }
9172}
9173
9174/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9175/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9176/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9177/// by norm and the sign depends on sign and the source sign.
9178///
9179/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9180///
9181///     _MM_MANT_NORM_1_2     // interval [1, 2)
9182///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9183///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9184///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9185///
9186/// The sign is determined by sc which can take the following values:
9187///
9188///     _MM_MANT_SIGN_src     // sign = sign(src)
9189///     _MM_MANT_SIGN_zero    // sign = 0
9190///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9191///
9192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9193#[inline]
9194#[target_feature(enable = "avx512fp16,avx512vl")]
9195#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9196#[rustc_legacy_const_generics(2, 3)]
9197#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9198pub fn _mm_maskz_getmant_ph<
9199    const NORM: _MM_MANTISSA_NORM_ENUM,
9200    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9201>(
9202    k: __mmask8,
9203    a: __m128h,
9204) -> __m128h {
9205    static_assert_uimm_bits!(NORM, 4);
9206    static_assert_uimm_bits!(SIGN, 2);
9207    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9208}
9209
9210/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9211/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9212/// on the interval range defined by norm and the sign depends on sign and the source sign.
9213///
9214/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9215///
9216///     _MM_MANT_NORM_1_2     // interval [1, 2)
9217///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9218///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9219///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9220///
9221/// The sign is determined by sc which can take the following values:
9222///
9223///     _MM_MANT_SIGN_src     // sign = sign(src)
9224///     _MM_MANT_SIGN_zero    // sign = 0
9225///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9226///
9227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9228#[inline]
9229#[target_feature(enable = "avx512fp16,avx512vl")]
9230#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9231#[rustc_legacy_const_generics(1, 2)]
9232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9233pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9234    a: __m256h,
9235) -> __m256h {
9236    static_assert_uimm_bits!(NORM, 4);
9237    static_assert_uimm_bits!(SIGN, 2);
9238    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9239}
9240
9241/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9242/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9243/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9244/// by norm and the sign depends on sign and the source sign.
9245///
9246/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9247///
9248///     _MM_MANT_NORM_1_2     // interval [1, 2)
9249///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9250///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9251///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9252///
9253/// The sign is determined by sc which can take the following values:
9254///
9255///     _MM_MANT_SIGN_src     // sign = sign(src)
9256///     _MM_MANT_SIGN_zero    // sign = 0
9257///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9258///
9259/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9260#[inline]
9261#[target_feature(enable = "avx512fp16,avx512vl")]
9262#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9263#[rustc_legacy_const_generics(3, 4)]
9264#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9265pub fn _mm256_mask_getmant_ph<
9266    const NORM: _MM_MANTISSA_NORM_ENUM,
9267    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9268>(
9269    src: __m256h,
9270    k: __mmask16,
9271    a: __m256h,
9272) -> __m256h {
9273    unsafe {
9274        static_assert_uimm_bits!(NORM, 4);
9275        static_assert_uimm_bits!(SIGN, 2);
9276        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9277    }
9278}
9279
9280/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9281/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9282/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9283/// by norm and the sign depends on sign and the source sign.
9284///
9285/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9286///
9287///     _MM_MANT_NORM_1_2     // interval [1, 2)
9288///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9289///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9290///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9291///
9292/// The sign is determined by sc which can take the following values:
9293///
9294///     _MM_MANT_SIGN_src     // sign = sign(src)
9295///     _MM_MANT_SIGN_zero    // sign = 0
9296///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9297///
9298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9299#[inline]
9300#[target_feature(enable = "avx512fp16,avx512vl")]
9301#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9302#[rustc_legacy_const_generics(2, 3)]
9303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9304pub fn _mm256_maskz_getmant_ph<
9305    const NORM: _MM_MANTISSA_NORM_ENUM,
9306    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9307>(
9308    k: __mmask16,
9309    a: __m256h,
9310) -> __m256h {
9311    static_assert_uimm_bits!(NORM, 4);
9312    static_assert_uimm_bits!(SIGN, 2);
9313    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9314}
9315
9316/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9317/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9318/// on the interval range defined by norm and the sign depends on sign and the source sign.
9319///
9320/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9321///
9322///     _MM_MANT_NORM_1_2     // interval [1, 2)
9323///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9324///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9325///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9326///
9327/// The sign is determined by sc which can take the following values:
9328///
9329///     _MM_MANT_SIGN_src     // sign = sign(src)
9330///     _MM_MANT_SIGN_zero    // sign = 0
9331///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9332///
9333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9334#[inline]
9335#[target_feature(enable = "avx512fp16")]
9336#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9337#[rustc_legacy_const_generics(1, 2)]
9338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9339pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9340    a: __m512h,
9341) -> __m512h {
9342    static_assert_uimm_bits!(NORM, 4);
9343    static_assert_uimm_bits!(SIGN, 2);
9344    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9345}
9346
9347/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9348/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9349/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9350/// by norm and the sign depends on sign and the source sign.
9351///
9352/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9353///
9354///     _MM_MANT_NORM_1_2     // interval [1, 2)
9355///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9356///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9357///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9358///
9359/// The sign is determined by sc which can take the following values:
9360///
9361///     _MM_MANT_SIGN_src     // sign = sign(src)
9362///     _MM_MANT_SIGN_zero    // sign = 0
9363///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9364///
9365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9366#[inline]
9367#[target_feature(enable = "avx512fp16")]
9368#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9369#[rustc_legacy_const_generics(3, 4)]
9370#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9371pub fn _mm512_mask_getmant_ph<
9372    const NORM: _MM_MANTISSA_NORM_ENUM,
9373    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9374>(
9375    src: __m512h,
9376    k: __mmask32,
9377    a: __m512h,
9378) -> __m512h {
9379    static_assert_uimm_bits!(NORM, 4);
9380    static_assert_uimm_bits!(SIGN, 2);
9381    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9382}
9383
9384/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9385/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9386/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9387/// by norm and the sign depends on sign and the source sign.
9388///
9389/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9390///
9391///     _MM_MANT_NORM_1_2     // interval [1, 2)
9392///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9393///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9394///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9395///
9396/// The sign is determined by sc which can take the following values:
9397///
9398///     _MM_MANT_SIGN_src     // sign = sign(src)
9399///     _MM_MANT_SIGN_zero    // sign = 0
9400///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9401///
9402/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9403#[inline]
9404#[target_feature(enable = "avx512fp16")]
9405#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9406#[rustc_legacy_const_generics(2, 3)]
9407#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9408pub fn _mm512_maskz_getmant_ph<
9409    const NORM: _MM_MANTISSA_NORM_ENUM,
9410    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9411>(
9412    k: __mmask32,
9413    a: __m512h,
9414) -> __m512h {
9415    static_assert_uimm_bits!(NORM, 4);
9416    static_assert_uimm_bits!(SIGN, 2);
9417    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9418}
9419
9420/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9421/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9422/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9424///
9425/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9426///
9427///     _MM_MANT_NORM_1_2     // interval [1, 2)
9428///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9429///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9430///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9431///
9432/// The sign is determined by sc which can take the following values:
9433///
9434///     _MM_MANT_SIGN_src     // sign = sign(src)
9435///     _MM_MANT_SIGN_zero    // sign = 0
9436///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9437///
9440/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
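///
/// # Examples
///
/// A sketch combining mantissa normalization with exception suppression, not a
/// compiled doctest: it assumes a target with the `avx512fp16` feature, the unstable
/// `stdarch_x86_avx512_f16` API, the constant names listed above, and the
/// `_mm512_set1_ph` constructor from this module.
///
/// ```ignore
/// let a = _mm512_set1_ph(6.0);
/// // 6.0 = 1.5 * 2^2, so with the [1, 2) interval every lane of `r` is 1.5;
/// // _MM_FROUND_NO_EXC suppresses any floating-point exceptions.
/// let r = _mm512_getmant_round_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, _MM_FROUND_NO_EXC>(a);
/// ```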
9441#[inline]
9442#[target_feature(enable = "avx512fp16")]
9443#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9444#[rustc_legacy_const_generics(1, 2, 3)]
9445#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9446pub fn _mm512_getmant_round_ph<
9447    const NORM: _MM_MANTISSA_NORM_ENUM,
9448    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9449    const SAE: i32,
9450>(
9451    a: __m512h,
9452) -> __m512h {
9453    static_assert_uimm_bits!(NORM, 4);
9454    static_assert_uimm_bits!(SIGN, 2);
9455    static_assert_sae!(SAE);
9456    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9457}
9458
9459/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9460/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9461/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9462/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter.
9464///
9465/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9466///
9467///     _MM_MANT_NORM_1_2     // interval [1, 2)
9468///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9469///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9470///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9471///
9472/// The sign is determined by sc which can take the following values:
9473///
9474///     _MM_MANT_SIGN_src     // sign = sign(src)
9475///     _MM_MANT_SIGN_zero    // sign = 0
9476///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9477///
9480/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9481#[inline]
9482#[target_feature(enable = "avx512fp16")]
9483#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9484#[rustc_legacy_const_generics(3, 4, 5)]
9485#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9486pub fn _mm512_mask_getmant_round_ph<
9487    const NORM: _MM_MANTISSA_NORM_ENUM,
9488    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9489    const SAE: i32,
9490>(
9491    src: __m512h,
9492    k: __mmask32,
9493    a: __m512h,
9494) -> __m512h {
9495    unsafe {
9496        static_assert_uimm_bits!(NORM, 4);
9497        static_assert_uimm_bits!(SIGN, 2);
9498        static_assert_sae!(SAE);
9499        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9500    }
9501}
9502
9503/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9504/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9505/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9506/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9507/// in the sae parameter.
9508///
9509/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9510///
9511///     _MM_MANT_NORM_1_2     // interval [1, 2)
9512///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9513///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9514///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9515///
9516/// The sign is determined by the sign parameter, which can take the following values:
9517///
9518///     _MM_MANT_SIGN_src     // sign = sign(src)
9519///     _MM_MANT_SIGN_zero    // sign = 0
9520///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9523///
9524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9525#[inline]
9526#[target_feature(enable = "avx512fp16")]
9527#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9528#[rustc_legacy_const_generics(2, 3, 4)]
9529#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9530pub fn _mm512_maskz_getmant_round_ph<
9531    const NORM: _MM_MANTISSA_NORM_ENUM,
9532    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9533    const SAE: i32,
9534>(
9535    k: __mmask32,
9536    a: __m512h,
9537) -> __m512h {
9538    static_assert_uimm_bits!(NORM, 4);
9539    static_assert_uimm_bits!(SIGN, 2);
9540    static_assert_sae!(SAE);
9541    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9542}
9543
9544/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9545/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9546/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9547/// on the interval range defined by norm and the sign depends on sign and the source sign.
9548///
9549/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9550///
9551///     _MM_MANT_NORM_1_2     // interval [1, 2)
9552///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9553///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9554///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9555///
9556/// The sign is determined by the sign parameter, which can take the following values:
9557///
9558///     _MM_MANT_SIGN_src     // sign = sign(src)
9559///     _MM_MANT_SIGN_zero    // sign = 0
9560///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9561///
9562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
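///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set_sh(24.0);
/// // Lower lane: the mantissa of 24.0 normalized into [1, 2) with a zeroed sign, i.e. 1.5;
/// // the upper 7 lanes are copied from `a`.
/// let r = _mm_getmant_sh::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero>(a, b);
/// ```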
9563#[inline]
9564#[target_feature(enable = "avx512fp16")]
9565#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9566#[rustc_legacy_const_generics(2, 3)]
9567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9568pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9569    a: __m128h,
9570    b: __m128h,
9571) -> __m128h {
9572    static_assert_uimm_bits!(NORM, 4);
9573    static_assert_uimm_bits!(SIGN, 2);
9574    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a, b)
9575}
9576
9577/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9578/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9579/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9580/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9581/// the source sign.
9582///
9583/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9584///
9585///     _MM_MANT_NORM_1_2     // interval [1, 2)
9586///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9587///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9588///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9589///
9590/// The sign is determined by the sign parameter, which can take the following values:
9591///
9592///     _MM_MANT_SIGN_src     // sign = sign(src)
9593///     _MM_MANT_SIGN_zero    // sign = 0
9594///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9595///
9596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9597#[inline]
9598#[target_feature(enable = "avx512fp16")]
9599#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9600#[rustc_legacy_const_generics(4, 5)]
9601#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9602pub fn _mm_mask_getmant_sh<
9603    const NORM: _MM_MANTISSA_NORM_ENUM,
9604    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9605>(
9606    src: __m128h,
9607    k: __mmask8,
9608    a: __m128h,
9609    b: __m128h,
9610) -> __m128h {
9611    static_assert_uimm_bits!(NORM, 4);
9612    static_assert_uimm_bits!(SIGN, 2);
9613    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9614}
9615
9616/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9617/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9618/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9619/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9620/// the source sign.
9621///
9622/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9623///
9624///     _MM_MANT_NORM_1_2     // interval [1, 2)
9625///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9626///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9627///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9628///
9629/// The sign is determined by the sign parameter, which can take the following values:
9630///
9631///     _MM_MANT_SIGN_src     // sign = sign(src)
9632///     _MM_MANT_SIGN_zero    // sign = 0
9633///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9634///
9635/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9636#[inline]
9637#[target_feature(enable = "avx512fp16")]
9638#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9639#[rustc_legacy_const_generics(3, 4)]
9640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9641pub fn _mm_maskz_getmant_sh<
9642    const NORM: _MM_MANTISSA_NORM_ENUM,
9643    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9644>(
9645    k: __mmask8,
9646    a: __m128h,
9647    b: __m128h,
9648) -> __m128h {
9649    static_assert_uimm_bits!(NORM, 4);
9650    static_assert_uimm_bits!(SIGN, 2);
9651    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_setzero_ph(), k, a, b)
9652}
9653
9654/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9655/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9656/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9657/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9658/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9659///
9660/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9661///
9662///     _MM_MANT_NORM_1_2     // interval [1, 2)
9663///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9664///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9665///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9666///
9667/// The sign is determined by the sign parameter, which can take the following values:
9668///
9669///     _MM_MANT_SIGN_src     // sign = sign(src)
9670///     _MM_MANT_SIGN_zero    // sign = 0
9671///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9674///
9675/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9676#[inline]
9677#[target_feature(enable = "avx512fp16")]
9678#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9679#[rustc_legacy_const_generics(2, 3, 4)]
9680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9681pub fn _mm_getmant_round_sh<
9682    const NORM: _MM_MANTISSA_NORM_ENUM,
9683    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9684    const SAE: i32,
9685>(
9686    a: __m128h,
9687    b: __m128h,
9688) -> __m128h {
9689    static_assert_uimm_bits!(NORM, 4);
9690    static_assert_uimm_bits!(SIGN, 2);
9691    static_assert_sae!(SAE);
9692    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_undefined_ph(), 0xff, a, b)
9693}
9694
9695/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9696/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9697/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9698/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9699/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9700///
9701/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9702///
9703///     _MM_MANT_NORM_1_2     // interval [1, 2)
9704///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9705///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9706///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9707///
9708/// The sign is determined by the sign parameter, which can take the following values:
9709///
9710///     _MM_MANT_SIGN_src     // sign = sign(src)
9711///     _MM_MANT_SIGN_zero    // sign = 0
9712///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9715///
9716/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9717#[inline]
9718#[target_feature(enable = "avx512fp16")]
9719#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9720#[rustc_legacy_const_generics(4, 5, 6)]
9721#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9722pub fn _mm_mask_getmant_round_sh<
9723    const NORM: _MM_MANTISSA_NORM_ENUM,
9724    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9725    const SAE: i32,
9726>(
9727    src: __m128h,
9728    k: __mmask8,
9729    a: __m128h,
9730    b: __m128h,
9731) -> __m128h {
9732    unsafe {
9733        static_assert_uimm_bits!(NORM, 4);
9734        static_assert_uimm_bits!(SIGN, 2);
9735        static_assert_sae!(SAE);
9736        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9737    }
9738}
9739
9740/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9741/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9742/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9743/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9744/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9745///
9746/// The mantissa is normalized to the interval specified by the norm parameter, which can take the following values:
9747///
9748///     _MM_MANT_NORM_1_2     // interval [1, 2)
9749///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9750///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9751///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9752///
9753/// The sign is determined by the sign parameter, which can take the following values:
9754///
9755///     _MM_MANT_SIGN_src     // sign = sign(src)
9756///     _MM_MANT_SIGN_zero    // sign = 0
9757///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9760///
9761/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9762#[inline]
9763#[target_feature(enable = "avx512fp16")]
9764#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9765#[rustc_legacy_const_generics(3, 4, 5)]
9766#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9767pub fn _mm_maskz_getmant_round_sh<
9768    const NORM: _MM_MANTISSA_NORM_ENUM,
9769    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9770    const SAE: i32,
9771>(
9772    k: __mmask8,
9773    a: __m128h,
9774    b: __m128h,
9775) -> __m128h {
9776    static_assert_uimm_bits!(NORM, 4);
9777    static_assert_uimm_bits!(SIGN, 2);
9778    static_assert_sae!(SAE);
9779    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_setzero_ph(), k, a, b)
9780}
9781
9782/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9783/// specified by imm8, and store the results in dst.
9784///
9785/// Rounding is done according to the imm8 parameter, which can be one of:
9786///
9787/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9788/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9789/// * [`_MM_FROUND_TO_POS_INF`] : round up
9790/// * [`_MM_FROUND_TO_ZERO`] : truncate
9791/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9792///
9793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
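///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature; the upper four bits of `IMM8` select how
/// many fraction bits to keep, the lower bits select the rounding mode):
///
/// ```ignore
/// let a = _mm_set1_ph(2.75);
/// // Keep no fraction bits and round to nearest: every lane becomes 3.0.
/// let ints = _mm_roundscale_ph::<_MM_FROUND_TO_NEAREST_INT>(a);
/// // Keep one fraction bit and truncate: every lane becomes 2.5.
/// let halves = _mm_roundscale_ph::<{ (1 << 4) | _MM_FROUND_TO_ZERO }>(a);
/// ```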
9794#[inline]
9795#[target_feature(enable = "avx512fp16,avx512vl")]
9796#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9797#[rustc_legacy_const_generics(1)]
9798#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9799pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9800    static_assert_uimm_bits!(IMM8, 8);
9801    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9802}
9803
9804/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9805/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9806/// the corresponding mask bit is not set).
9807///
9808/// Rounding is done according to the imm8 parameter, which can be one of:
9809///
9810/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9811/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9812/// * [`_MM_FROUND_TO_POS_INF`] : round up
9813/// * [`_MM_FROUND_TO_ZERO`] : truncate
9814/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9815///
9816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
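///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let src = _mm_set1_ph(0.0);
/// let a = _mm_set1_ph(1.25);
/// // Only lanes 0 and 1 are rounded (to 1.0); the remaining lanes are copied from `src`.
/// let r = _mm_mask_roundscale_ph::<_MM_FROUND_TO_NEAREST_INT>(src, 0b0000_0011, a);
/// ```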
9817#[inline]
9818#[target_feature(enable = "avx512fp16,avx512vl")]
9819#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9820#[rustc_legacy_const_generics(3)]
9821#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9822pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
9823    unsafe {
9824        static_assert_uimm_bits!(IMM8, 8);
9825        vrndscaleph_128(a, IMM8, src, k)
9826    }
9827}
9828
9829/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9830/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9831/// mask bit is not set).
9832///
9833/// Rounding is done according to the imm8 parameter, which can be one of:
9834///
9835/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9836/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9837/// * [`_MM_FROUND_TO_POS_INF`] : round up
9838/// * [`_MM_FROUND_TO_ZERO`] : truncate
9839/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9840///
9841/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9842#[inline]
9843#[target_feature(enable = "avx512fp16,avx512vl")]
9844#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9845#[rustc_legacy_const_generics(2)]
9846#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9847pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9848    static_assert_uimm_bits!(IMM8, 8);
9849    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9850}
9851
9852/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9853/// specified by imm8, and store the results in dst.
9854///
9855/// Rounding is done according to the imm8 parameter, which can be one of:
9856///
9857/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9858/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9859/// * [`_MM_FROUND_TO_POS_INF`] : round up
9860/// * [`_MM_FROUND_TO_ZERO`] : truncate
9861/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9862///
9863/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9864#[inline]
9865#[target_feature(enable = "avx512fp16,avx512vl")]
9866#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9867#[rustc_legacy_const_generics(1)]
9868#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9869pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9870    static_assert_uimm_bits!(IMM8, 8);
9871    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9872}
9873
9874/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9875/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9876/// the corresponding mask bit is not set).
9877///
9878/// Rounding is done according to the imm8 parameter, which can be one of:
9879///
9880/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9881/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9882/// * [`_MM_FROUND_TO_POS_INF`] : round up
9883/// * [`_MM_FROUND_TO_ZERO`] : truncate
9884/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9885///
9886/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9887#[inline]
9888#[target_feature(enable = "avx512fp16,avx512vl")]
9889#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9890#[rustc_legacy_const_generics(3)]
9891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9892pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9893    src: __m256h,
9894    k: __mmask16,
9895    a: __m256h,
9896) -> __m256h {
9897    unsafe {
9898        static_assert_uimm_bits!(IMM8, 8);
9899        vrndscaleph_256(a, IMM8, src, k)
9900    }
9901}
9902
9903/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9904/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9905/// mask bit is not set).
9906///
9907/// Rounding is done according to the imm8 parameter, which can be one of:
9908///
9909/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9910/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9911/// * [`_MM_FROUND_TO_POS_INF`] : round up
9912/// * [`_MM_FROUND_TO_ZERO`] : truncate
9913/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9914///
9915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
9916#[inline]
9917#[target_feature(enable = "avx512fp16,avx512vl")]
9918#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9919#[rustc_legacy_const_generics(2)]
9920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9921pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
9922    static_assert_uimm_bits!(IMM8, 8);
9923    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
9924}
9925
9926/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9927/// specified by imm8, and store the results in dst.
9928///
9929/// Rounding is done according to the imm8 parameter, which can be one of:
9930///
9931/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9932/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9933/// * [`_MM_FROUND_TO_POS_INF`] : round up
9934/// * [`_MM_FROUND_TO_ZERO`] : truncate
9935/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9936///
9937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
9938#[inline]
9939#[target_feature(enable = "avx512fp16")]
9940#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9941#[rustc_legacy_const_generics(1)]
9942#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9943pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
9944    static_assert_uimm_bits!(IMM8, 8);
9945    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
9946}
9947
9948/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9949/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9950/// the corresponding mask bit is not set).
9951///
9952/// Rounding is done according to the imm8 parameter, which can be one of:
9953///
9954/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9955/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9956/// * [`_MM_FROUND_TO_POS_INF`] : round up
9957/// * [`_MM_FROUND_TO_ZERO`] : truncate
9958/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9959///
9960/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
9961#[inline]
9962#[target_feature(enable = "avx512fp16")]
9963#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9964#[rustc_legacy_const_generics(3)]
9965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9966pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
9967    src: __m512h,
9968    k: __mmask32,
9969    a: __m512h,
9970) -> __m512h {
9971    static_assert_uimm_bits!(IMM8, 8);
9972    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9973}
9974
9975/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9976/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9977/// mask bit is not set).
9978///
9979/// Rounding is done according to the imm8 parameter, which can be one of:
9980///
9981/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9982/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9983/// * [`_MM_FROUND_TO_POS_INF`] : round up
9984/// * [`_MM_FROUND_TO_ZERO`] : truncate
9985/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9986///
9987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
9988#[inline]
9989#[target_feature(enable = "avx512fp16")]
9990#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9991#[rustc_legacy_const_generics(2)]
9992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9993pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
9994    static_assert_uimm_bits!(IMM8, 8);
9995    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
9996}
9997
9998/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9999/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10000/// in the sae parameter.
10001///
10002/// Rounding is done according to the imm8 parameter, which can be one of:
10003///
10004/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10005/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10006/// * [`_MM_FROUND_TO_POS_INF`] : round up
10007/// * [`_MM_FROUND_TO_ZERO`] : truncate
10008/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10009///
10010/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
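///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm512_set1_ph(-1.75);
/// // Round every lane down to the next integer and suppress exceptions via SAE.
/// let r = _mm512_roundscale_round_ph::<_MM_FROUND_TO_NEG_INF, _MM_FROUND_NO_EXC>(a);
/// // Every lane of `r` is -2.0.
/// ```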
10011#[inline]
10012#[target_feature(enable = "avx512fp16")]
10013#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10014#[rustc_legacy_const_generics(1, 2)]
10015#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10016pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10017    static_assert_uimm_bits!(IMM8, 8);
10018    static_assert_sae!(SAE);
10019    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10020}
10021
10022/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10023/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10024/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10025/// in the sae parameter.
10026///
10027/// Rounding is done according to the imm8 parameter, which can be one of:
10028///
10029/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10030/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10031/// * [`_MM_FROUND_TO_POS_INF`] : round up
10032/// * [`_MM_FROUND_TO_ZERO`] : truncate
10033/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10034///
10035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10036#[inline]
10037#[target_feature(enable = "avx512fp16")]
10038#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10039#[rustc_legacy_const_generics(3, 4)]
10040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10041pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10042    src: __m512h,
10043    k: __mmask32,
10044    a: __m512h,
10045) -> __m512h {
10046    unsafe {
10047        static_assert_uimm_bits!(IMM8, 8);
10048        static_assert_sae!(SAE);
10049        vrndscaleph_512(a, IMM8, src, k, SAE)
10050    }
10051}
10052
10053/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10054/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10055/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10056///
10057/// Rounding is done according to the imm8 parameter, which can be one of:
10058///
10059/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10060/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10061/// * [`_MM_FROUND_TO_POS_INF`] : round up
10062/// * [`_MM_FROUND_TO_ZERO`] : truncate
10063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10064///
10065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
10066#[inline]
10067#[target_feature(enable = "avx512fp16")]
10068#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10069#[rustc_legacy_const_generics(2, 3)]
10070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10071pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10072    k: __mmask32,
10073    a: __m512h,
10074) -> __m512h {
10075    static_assert_uimm_bits!(IMM8, 8);
10076    static_assert_sae!(SAE);
10077    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10078}
10079
10080/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10081/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10082/// from a to the upper elements of dst.
10083///
10084/// Rounding is done according to the imm8 parameter, which can be one of:
10085///
10086/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10087/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10088/// * [`_MM_FROUND_TO_POS_INF`] : round up
10089/// * [`_MM_FROUND_TO_ZERO`] : truncate
10090/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10091///
10092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
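///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(9.0);
/// let b = _mm_set_sh(1.6);
/// // Lower lane: 1.6 rounded up to 2.0; the upper 7 lanes are copied from `a`.
/// let r = _mm_roundscale_sh::<_MM_FROUND_TO_POS_INF>(a, b);
/// ```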
10093#[inline]
10094#[target_feature(enable = "avx512fp16")]
10095#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10096#[rustc_legacy_const_generics(2)]
10097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10098pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10099    static_assert_uimm_bits!(IMM8, 8);
10100    _mm_mask_roundscale_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
10101}
10102
10103/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10104/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10105/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10106///
10107/// Rounding is done according to the imm8 parameter, which can be one of:
10108///
10109/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10110/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10111/// * [`_MM_FROUND_TO_POS_INF`] : round up
10112/// * [`_MM_FROUND_TO_ZERO`] : truncate
10113/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10114///
10115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
10116#[inline]
10117#[target_feature(enable = "avx512fp16")]
10118#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10119#[rustc_legacy_const_generics(4)]
10120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10121pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
10122    src: __m128h,
10123    k: __mmask8,
10124    a: __m128h,
10125    b: __m128h,
10126) -> __m128h {
10127    static_assert_uimm_bits!(IMM8, 8);
10128    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10129}
10130
10131/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10132/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10133/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10134///
10135/// Rounding is done according to the imm8 parameter, which can be one of:
10136///
10137/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10138/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10139/// * [`_MM_FROUND_TO_POS_INF`] : round up
10140/// * [`_MM_FROUND_TO_ZERO`] : truncate
10141/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10142///
10143/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
10144#[inline]
10145#[target_feature(enable = "avx512fp16")]
10146#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10147#[rustc_legacy_const_generics(3)]
10148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10149pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10150    static_assert_uimm_bits!(IMM8, 8);
10151    _mm_mask_roundscale_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
10152}
10153
10154/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10155/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10156/// from a to the upper elements of dst.
10157///
10158/// Rounding is done according to the imm8 parameter, which can be one of:
10159///
10160/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10161/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10162/// * [`_MM_FROUND_TO_POS_INF`] : round up
10163/// * [`_MM_FROUND_TO_ZERO`] : truncate
10164/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10165///
10166/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10167///
10168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
10169#[inline]
10170#[target_feature(enable = "avx512fp16")]
10171#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10172#[rustc_legacy_const_generics(2, 3)]
10173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10174pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10175    static_assert_uimm_bits!(IMM8, 8);
10176    static_assert_sae!(SAE);
10177    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
10178}
10179
10180/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10181/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10182/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10183///
10184/// Rounding is done according to the imm8 parameter, which can be one of:
10185///
10186/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10187/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10188/// * [`_MM_FROUND_TO_POS_INF`] : round up
10189/// * [`_MM_FROUND_TO_ZERO`] : truncate
10190/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10191///
10192/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10193///
10194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10195#[inline]
10196#[target_feature(enable = "avx512fp16")]
10197#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10198#[rustc_legacy_const_generics(4, 5)]
10199#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10200pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10201    src: __m128h,
10202    k: __mmask8,
10203    a: __m128h,
10204    b: __m128h,
10205) -> __m128h {
10206    unsafe {
10207        static_assert_uimm_bits!(IMM8, 8);
10208        static_assert_sae!(SAE);
10209        vrndscalesh(a, b, src, k, IMM8, SAE)
10210    }
10211}
10212
10213/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10214/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10215/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10216///
10217/// Rounding is done according to the imm8 parameter, which can be one of:
10218///
10219/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10220/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10221/// * [`_MM_FROUND_TO_POS_INF`] : round up
10222/// * [`_MM_FROUND_TO_ZERO`] : truncate
10223/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10224///
10225/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10226///
10227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10228#[inline]
10229#[target_feature(enable = "avx512fp16")]
10230#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10231#[rustc_legacy_const_generics(3, 4)]
10232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10233pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10234    k: __mmask8,
10235    a: __m128h,
10236    b: __m128h,
10237) -> __m128h {
10238    static_assert_uimm_bits!(IMM8, 8);
10239    static_assert_sae!(SAE);
10240    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
10241}
10242
10243/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10244/// the results in dst.
10245///
10246/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
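///
/// Each result element is `a * 2^floor(b)` for the corresponding lanes. A minimal usage sketch
/// with hypothetical values (assumes a nightly toolchain with the unstable `f16` type and the
/// `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(3.0);
/// let b = _mm_set1_ph(2.0);
/// // Every lane of `r` is 3.0 * 2^2 = 12.0.
/// let r = _mm_scalef_ph(a, b);
/// ```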
10247#[inline]
10248#[target_feature(enable = "avx512fp16,avx512vl")]
10249#[cfg_attr(test, assert_instr(vscalefph))]
10250#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10251pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10252    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10253}
10254
10255/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10256/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10257///
10258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10259#[inline]
10260#[target_feature(enable = "avx512fp16,avx512vl")]
10261#[cfg_attr(test, assert_instr(vscalefph))]
10262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10263pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10264    unsafe { vscalefph_128(a, b, src, k) }
10265}
10266
10267/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10268/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10269///
10270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
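///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(3.0);
/// let b = _mm_set1_ph(1.0);
/// // Lane 0 is 3.0 * 2^1 = 6.0; lanes whose mask bit is clear are zeroed.
/// let r = _mm_maskz_scalef_ph(0b0000_0001, a, b);
/// ```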
10271#[inline]
10272#[target_feature(enable = "avx512fp16,avx512vl")]
10273#[cfg_attr(test, assert_instr(vscalefph))]
10274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10275pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10276    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10277}
10278
10279/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10280/// the results in dst.
10281///
10282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10283#[inline]
10284#[target_feature(enable = "avx512fp16,avx512vl")]
10285#[cfg_attr(test, assert_instr(vscalefph))]
10286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10287pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10288    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10289}
10290
10291/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10292/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10293///
10294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10295#[inline]
10296#[target_feature(enable = "avx512fp16,avx512vl")]
10297#[cfg_attr(test, assert_instr(vscalefph))]
10298#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10299pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10300    unsafe { vscalefph_256(a, b, src, k) }
10301}
10302
10303/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10304/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10305///
10306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10307#[inline]
10308#[target_feature(enable = "avx512fp16,avx512vl")]
10309#[cfg_attr(test, assert_instr(vscalefph))]
10310#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10311pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10312    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10313}
10314
10315/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10316/// the results in dst.
10317///
10318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10319#[inline]
10320#[target_feature(enable = "avx512fp16")]
10321#[cfg_attr(test, assert_instr(vscalefph))]
10322#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10323pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10324    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10325}
10326
10327/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10328/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10329///
10330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10331#[inline]
10332#[target_feature(enable = "avx512fp16")]
10333#[cfg_attr(test, assert_instr(vscalefph))]
10334#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10335pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10336    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10337}
10338
10339/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10340/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10341///
10342/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10343#[inline]
10344#[target_feature(enable = "avx512fp16")]
10345#[cfg_attr(test, assert_instr(vscalefph))]
10346#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10347pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10348    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10349}
10350
10351/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10352/// the results in dst.
10353///
10354/// Rounding is done according to the rounding parameter, which can be one of:
10355///
10356/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10357/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10358/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10359/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10360/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10361///
10362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
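///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.5);
/// let b = _mm512_set1_ph(-3.0);
/// // Every lane of `r` is 1.5 * 2^-3 = 0.1875, computed with round-to-nearest and
/// // exceptions suppressed.
/// let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
/// ```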
10363#[inline]
10364#[target_feature(enable = "avx512fp16")]
10365#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10366#[rustc_legacy_const_generics(2)]
10367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10368pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10369    static_assert_rounding!(ROUNDING);
10370    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10371}
10372
10373/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10374/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10375///
10376/// Rounding is done according to the rounding parameter, which can be one of:
10377///
10378/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10379/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10380/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10381/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10382/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10383///
10384/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10385#[inline]
10386#[target_feature(enable = "avx512fp16")]
10387#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10388#[rustc_legacy_const_generics(4)]
10389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10390pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10391    src: __m512h,
10392    k: __mmask32,
10393    a: __m512h,
10394    b: __m512h,
10395) -> __m512h {
10396    unsafe {
10397        static_assert_rounding!(ROUNDING);
10398        vscalefph_512(a, b, src, k, ROUNDING)
10399    }
10400}
10401
10402/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10403/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10404///
10405/// Rounding is done according to the rounding parameter, which can be one of:
10406///
10407/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10408/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10409/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10410/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10411/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10412///
10413/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10414#[inline]
10415#[target_feature(enable = "avx512fp16")]
10416#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10417#[rustc_legacy_const_generics(3)]
10418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10419pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10420    k: __mmask32,
10421    a: __m512h,
10422    b: __m512h,
10423) -> __m512h {
10424    static_assert_rounding!(ROUNDING);
10425    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10426}
10427
10428/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10429/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10430/// elements of dst.
10431///
10432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
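///
/// A minimal usage sketch with hypothetical values (assumes a nightly toolchain with the unstable
/// `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(5.0);
/// let b = _mm_set_sh(3.0);
/// // Lower lane: 5.0 * 2^3 = 40.0; the upper 7 lanes are copied from `a`.
/// let r = _mm_scalef_sh(a, b);
/// ```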
10433#[inline]
10434#[target_feature(enable = "avx512fp16")]
10435#[cfg_attr(test, assert_instr(vscalefsh))]
10436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10437pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10438    _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b)
10439}
10440
10441/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10442/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10443/// and copy the upper 7 packed elements from a to the upper elements of dst.
10444///
10445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10446#[inline]
10447#[target_feature(enable = "avx512fp16")]
10448#[cfg_attr(test, assert_instr(vscalefsh))]
10449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10450pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10451    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10452}
10453
10454/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10455/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10456/// and copy the upper 7 packed elements from a to the upper elements of dst.
10457///
10458/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10459#[inline]
10460#[target_feature(enable = "avx512fp16")]
10461#[cfg_attr(test, assert_instr(vscalefsh))]
10462#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10463pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10464    _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b)
10465}
10466
10467/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10468/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10469/// elements of dst.
10470///
10471/// Rounding is done according to the rounding parameter, which can be one of:
10472///
10473/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10474/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10475/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10476/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10477/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10478///
10479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10480#[inline]
10481#[target_feature(enable = "avx512fp16")]
10482#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10483#[rustc_legacy_const_generics(2)]
10484#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10485pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10486    static_assert_rounding!(ROUNDING);
10487    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
10488}
10489
10490/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10491/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10492/// and copy the upper 7 packed elements from a to the upper elements of dst.
10493///
10494/// Rounding is done according to the rounding parameter, which can be one of:
10495///
10496/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10497/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10498/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10499/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10500/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10501///
10502/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10503#[inline]
10504#[target_feature(enable = "avx512fp16")]
10505#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10506#[rustc_legacy_const_generics(4)]
10507#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10508pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10509    src: __m128h,
10510    k: __mmask8,
10511    a: __m128h,
10512    b: __m128h,
10513) -> __m128h {
10514    unsafe {
10515        static_assert_rounding!(ROUNDING);
10516        vscalefsh(a, b, src, k, ROUNDING)
10517    }
10518}
10519
10520/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10521/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10522/// and copy the upper 7 packed elements from a to the upper elements of dst.
10523///
10524/// Rounding is done according to the rounding parameter, which can be one of:
10525///
10526/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10527/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10528/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10529/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10530/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10531///
10532/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10533#[inline]
10534#[target_feature(enable = "avx512fp16")]
10535#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10536#[rustc_legacy_const_generics(3)]
10537#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10538pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10539    k: __mmask8,
10540    a: __m128h,
10541    b: __m128h,
10542) -> __m128h {
10543    static_assert_rounding!(ROUNDING);
10544    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
10545}
10546
10547/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10548/// number of bits specified by imm8, and store the results in dst.
10549///
10550/// Rounding is done according to the imm8 parameter, which can be one of:
10551///
10552/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10553/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10554/// * [`_MM_FROUND_TO_POS_INF`] : round up
10555/// * [`_MM_FROUND_TO_ZERO`] : truncate
10556/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10557///
10558/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
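///
/// The reduced argument is the difference between each element and that element rounded to the
/// precision selected by imm8. A minimal usage sketch with hypothetical values (assumes a nightly
/// toolchain with the unstable `f16` type and the `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// let a = _mm_set1_ph(1.75);
/// // Truncate each lane to an integer (1.0) and subtract: every lane of `r` is 0.75.
/// let r = _mm_reduce_ph::<_MM_FROUND_TO_ZERO>(a);
/// ```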
10559#[inline]
10560#[target_feature(enable = "avx512fp16,avx512vl")]
10561#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10562#[rustc_legacy_const_generics(1)]
10563#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10564pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10565    static_assert_uimm_bits!(IMM8, 8);
10566    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10567}
10568
10569/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10570/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10571/// from src when the corresponding mask bit is not set).
10572///
10573/// Rounding is done according to the imm8 parameter, which can be one of:
10574///
10575/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10576/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10577/// * [`_MM_FROUND_TO_POS_INF`] : round up
10578/// * [`_MM_FROUND_TO_ZERO`] : truncate
10579/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10580///
10581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10582#[inline]
10583#[target_feature(enable = "avx512fp16,avx512vl")]
10584#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10585#[rustc_legacy_const_generics(3)]
10586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10587pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10588    unsafe {
10589        static_assert_uimm_bits!(IMM8, 8);
10590        vreduceph_128(a, IMM8, src, k)
10591    }
10592}
10593
10594/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10595/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10596/// out when the corresponding mask bit is not set).
10597///
10598/// Rounding is done according to the imm8 parameter, which can be one of:
10599///
10600/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10601/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10602/// * [`_MM_FROUND_TO_POS_INF`] : round up
10603/// * [`_MM_FROUND_TO_ZERO`] : truncate
10604/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10605///
10606/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10607#[inline]
10608#[target_feature(enable = "avx512fp16,avx512vl")]
10609#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10610#[rustc_legacy_const_generics(2)]
10611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10612pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10613    static_assert_uimm_bits!(IMM8, 8);
10614    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10615}
10616
10617/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10618/// number of bits specified by imm8, and store the results in dst.
10619///
10620/// Rounding is done according to the imm8 parameter, which can be one of:
10621///
10622/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10623/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10624/// * [`_MM_FROUND_TO_POS_INF`] : round up
10625/// * [`_MM_FROUND_TO_ZERO`] : truncate
10626/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10627///
10628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10629#[inline]
10630#[target_feature(enable = "avx512fp16,avx512vl")]
10631#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10632#[rustc_legacy_const_generics(1)]
10633#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10634pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10635    static_assert_uimm_bits!(IMM8, 8);
10636    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10637}
10638
10639/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10640/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10641/// from src when the corresponding mask bit is not set).
10642///
10643/// Rounding is done according to the imm8 parameter, which can be one of:
10644///
10645/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10646/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10647/// * [`_MM_FROUND_TO_POS_INF`] : round up
10648/// * [`_MM_FROUND_TO_ZERO`] : truncate
10649/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10650///
10651/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10652#[inline]
10653#[target_feature(enable = "avx512fp16,avx512vl")]
10654#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10655#[rustc_legacy_const_generics(3)]
10656#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10657pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10658    unsafe {
10659        static_assert_uimm_bits!(IMM8, 8);
10660        vreduceph_256(a, IMM8, src, k)
10661    }
10662}
10663
10664/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10665/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10666/// out when the corresponding mask bit is not set).
10667///
10668/// Rounding is done according to the imm8 parameter, which can be one of:
10669///
10670/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10671/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10672/// * [`_MM_FROUND_TO_POS_INF`] : round up
10673/// * [`_MM_FROUND_TO_ZERO`] : truncate
10674/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10675///
10676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10677#[inline]
10678#[target_feature(enable = "avx512fp16,avx512vl")]
10679#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10680#[rustc_legacy_const_generics(2)]
10681#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10682pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10683    static_assert_uimm_bits!(IMM8, 8);
10684    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10685}
10686
10687/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10688/// number of bits specified by imm8, and store the results in dst.
10689///
10690/// Rounding is done according to the imm8 parameter, which can be one of:
10691///
10692/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10693/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10694/// * [`_MM_FROUND_TO_POS_INF`] : round up
10695/// * [`_MM_FROUND_TO_ZERO`] : truncate
10696/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10697///
10698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10699#[inline]
10700#[target_feature(enable = "avx512fp16")]
10701#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10702#[rustc_legacy_const_generics(1)]
10703#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10704pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10705    static_assert_uimm_bits!(IMM8, 8);
10706    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10707}
10708
10709/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10710/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10711/// from src when the corresponding mask bit is not set).
10712///
10713/// Rounding is done according to the imm8 parameter, which can be one of:
10714///
10715/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10716/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10717/// * [`_MM_FROUND_TO_POS_INF`] : round up
10718/// * [`_MM_FROUND_TO_ZERO`] : truncate
10719/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10720///
10721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10722#[inline]
10723#[target_feature(enable = "avx512fp16")]
10724#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10725#[rustc_legacy_const_generics(3)]
10726#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10727pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
10728    static_assert_uimm_bits!(IMM8, 8);
10729    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10730}
10731
10732/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10733/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10734/// out when the corresponding mask bit is not set).
10735///
10736/// Rounding is done according to the imm8 parameter, which can be one of:
10737///
10738/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10739/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10740/// * [`_MM_FROUND_TO_POS_INF`] : round up
10741/// * [`_MM_FROUND_TO_ZERO`] : truncate
10742/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10743///
10744/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10745#[inline]
10746#[target_feature(enable = "avx512fp16")]
10747#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10748#[rustc_legacy_const_generics(2)]
10749#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10750pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10751    static_assert_uimm_bits!(IMM8, 8);
10752    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10753}
10754
10755/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10756/// number of bits specified by imm8, and store the results in dst.
10757///
10758/// Rounding is done according to the imm8 parameter, which can be one of:
10759///
10760/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10761/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10762/// * [`_MM_FROUND_TO_POS_INF`] : round up
10763/// * [`_MM_FROUND_TO_ZERO`] : truncate
10764/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10765///
10766/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10767///
10768/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
10769#[inline]
10770#[target_feature(enable = "avx512fp16")]
10771#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10772#[rustc_legacy_const_generics(1, 2)]
10773#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10774pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10775    static_assert_uimm_bits!(IMM8, 8);
10776    static_assert_sae!(SAE);
10777    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10778}
10779
10780/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10781/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10782/// from src when the corresponding mask bit is not set).
10783///
10784/// Rounding is done according to the imm8 parameter, which can be one of:
10785///
10786/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10787/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10788/// * [`_MM_FROUND_TO_POS_INF`] : round up
10789/// * [`_MM_FROUND_TO_ZERO`] : truncate
10790/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10791///
10792/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10793///
10794/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10795#[inline]
10796#[target_feature(enable = "avx512fp16")]
10797#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10798#[rustc_legacy_const_generics(3, 4)]
10799#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10800pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10801    src: __m512h,
10802    k: __mmask32,
10803    a: __m512h,
10804) -> __m512h {
10805    unsafe {
10806        static_assert_uimm_bits!(IMM8, 8);
10807        static_assert_sae!(SAE);
10808        vreduceph_512(a, IMM8, src, k, SAE)
10809    }
10810}
10811
10812/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10813/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10814/// out when the corresponding mask bit is not set).
10815///
10816/// Rounding is done according to the imm8 parameter, which can be one of:
10817///
10818/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10819/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10820/// * [`_MM_FROUND_TO_POS_INF`] : round up
10821/// * [`_MM_FROUND_TO_ZERO`] : truncate
10822/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10823///
10824/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10825///
10826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10827#[inline]
10828#[target_feature(enable = "avx512fp16")]
10829#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10830#[rustc_legacy_const_generics(2, 3)]
10831#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10832pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10833    k: __mmask32,
10834    a: __m512h,
10835) -> __m512h {
10836    static_assert_uimm_bits!(IMM8, 8);
10837    static_assert_sae!(SAE);
10838    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10839}
10840
10841/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10842/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10843/// upper 7 packed elements from a to the upper elements of dst.
10844///
10845/// Rounding is done according to the imm8 parameter, which can be one of:
10846///
10847/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10848/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10849/// * [`_MM_FROUND_TO_POS_INF`] : round up
10850/// * [`_MM_FROUND_TO_ZERO`] : truncate
10851/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10852///
10853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10854#[inline]
10855#[target_feature(enable = "avx512fp16")]
10856#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10857#[rustc_legacy_const_generics(2)]
10858#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10859pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10860    static_assert_uimm_bits!(IMM8, 8);
10861    _mm_mask_reduce_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
10862}
10863
10864/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10865/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10866/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10867/// a to the upper elements of dst.
10868///
10869/// Rounding is done according to the imm8 parameter, which can be one of:
10870///
10871/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10872/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10873/// * [`_MM_FROUND_TO_POS_INF`] : round up
10874/// * [`_MM_FROUND_TO_ZERO`] : truncate
10875/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10876///
10877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10878#[inline]
10879#[target_feature(enable = "avx512fp16")]
10880#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10881#[rustc_legacy_const_generics(4)]
10882#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10883pub fn _mm_mask_reduce_sh<const IMM8: i32>(
10884    src: __m128h,
10885    k: __mmask8,
10886    a: __m128h,
10887    b: __m128h,
10888) -> __m128h {
10889    static_assert_uimm_bits!(IMM8, 8);
10890    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10891}
10892
10893/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10894/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10895/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10896/// to the upper elements of dst.
10897///
10898/// Rounding is done according to the imm8 parameter, which can be one of:
10899///
10900/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10901/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10902/// * [`_MM_FROUND_TO_POS_INF`] : round up
10903/// * [`_MM_FROUND_TO_ZERO`] : truncate
10904/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10905///
10906/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
10907#[inline]
10908#[target_feature(enable = "avx512fp16")]
10909#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10910#[rustc_legacy_const_generics(3)]
10911#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10912pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10913    static_assert_uimm_bits!(IMM8, 8);
10914    _mm_mask_reduce_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
10915}
10916
10917/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10918/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
10919/// 7 packed elements from a to the upper elements of dst.
10920///
10921/// Rounding is done according to the imm8 parameter, which can be one of:
10922///
10923/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10924/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10925/// * [`_MM_FROUND_TO_POS_INF`] : round up
10926/// * [`_MM_FROUND_TO_ZERO`] : truncate
10927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10928///
10929/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10930///
10931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
10932#[inline]
10933#[target_feature(enable = "avx512fp16")]
10934#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10935#[rustc_legacy_const_generics(2, 3)]
10936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10937pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10938    static_assert_uimm_bits!(IMM8, 8);
10939    static_assert_sae!(SAE);
10940    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
10941}
10942
10943/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10944/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10945/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
10946/// to the upper elements of dst.
10947///
10948/// Rounding is done according to the imm8 parameter, which can be one of:
10949///
10950/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10951/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10952/// * [`_MM_FROUND_TO_POS_INF`] : round up
10953/// * [`_MM_FROUND_TO_ZERO`] : truncate
10954/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10955///
10956/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10957///
10958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
10959#[inline]
10960#[target_feature(enable = "avx512fp16")]
10961#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10962#[rustc_legacy_const_generics(4, 5)]
10963#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10964pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10965    src: __m128h,
10966    k: __mmask8,
10967    a: __m128h,
10968    b: __m128h,
10969) -> __m128h {
10970    unsafe {
10971        static_assert_uimm_bits!(IMM8, 8);
10972        static_assert_sae!(SAE);
10973        vreducesh(a, b, src, k, IMM8, SAE)
10974    }
10975}
10976
10977/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10978/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10979/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10980/// to the upper elements of dst.
10981///
10982/// Rounding is done according to the imm8 parameter, which can be one of:
10983///
10984/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10985/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10986/// * [`_MM_FROUND_TO_POS_INF`] : round up
10987/// * [`_MM_FROUND_TO_ZERO`] : truncate
10988/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10989///
10990/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10991///
10992/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
10993#[inline]
10994#[target_feature(enable = "avx512fp16")]
10995#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10996#[rustc_legacy_const_generics(3, 4)]
10997#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10998pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10999    k: __mmask8,
11000    a: __m128h,
11001    b: __m128h,
11002) -> __m128h {
11003    static_assert_uimm_bits!(IMM8, 8);
11004    static_assert_sae!(SAE);
11005    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
11006}
11007
11008/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11009/// sum of all elements in a.
11010///
11011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
11012#[inline]
11013#[target_feature(enable = "avx512fp16,avx512vl")]
11014#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11015pub fn _mm_reduce_add_ph(a: __m128h) -> f16 {
11016    unsafe {
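        // Horizontal tree reduction: fold the upper half onto the lower half, then the upper
        // pair onto the lower pair, and finally add the remaining two lanes as scalars.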
11017        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11018        let a = _mm_add_ph(a, b);
11019        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11020        let a = _mm_add_ph(a, b);
11021        simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1)
11022    }
11023}
11024
11025/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11026/// sum of all elements in a.
11027///
11028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
11029#[inline]
11030#[target_feature(enable = "avx512fp16,avx512vl")]
11031#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11032pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
11033    unsafe {
11034        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11035        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11036        _mm_reduce_add_ph(_mm_add_ph(p, q))
11037    }
11038}
11039
11040/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11041/// sum of all elements in a.
11042///
11043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
11044#[inline]
11045#[target_feature(enable = "avx512fp16")]
11046#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11047pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
11048    unsafe {
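        // Split the 512-bit vector into its two 256-bit halves, add them, and reduce the sum.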
11049        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11050        let q = simd_shuffle!(
11051            a,
11052            a,
11053            [
11054                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11055            ]
11056        );
11057        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
11058    }
11059}
11060
11061/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11062/// the product of all elements in a.
11063///
11064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
11065#[inline]
11066#[target_feature(enable = "avx512fp16,avx512vl")]
11067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11068pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
11069    unsafe {
11070        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11071        let a = _mm_mul_ph(a, b);
11072        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11073        let a = _mm_mul_ph(a, b);
11074        simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1)
11075    }
11076}
11077
11078/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11079/// the product of all elements in a.
11080///
11081/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
11082#[inline]
11083#[target_feature(enable = "avx512fp16,avx512vl")]
11084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11085pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
11086    unsafe {
11087        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11088        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11089        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
11090    }
11091}
11092
11093/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11094/// the product of all elements in a.
11095///
11096/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
11097#[inline]
11098#[target_feature(enable = "avx512fp16")]
11099#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11100pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
11101    unsafe {
11102        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11103        let q = simd_shuffle!(
11104            a,
11105            a,
11106            [
11107                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11108            ]
11109        );
11110        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
11111    }
11112}
11113
11114/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11115/// minimum of all elements in a.
11116///
11117/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
11118#[inline]
11119#[target_feature(enable = "avx512fp16,avx512vl")]
11120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11121pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
11122    unsafe {
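        // Same tree-reduction shape as the additive reduce, finishing with a scalar `min`
        // of the last two lanes.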
11123        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11124        let a = _mm_min_ph(a, b);
11125        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11126        let a = _mm_min_ph(a, b);
11127        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11128        simd_extract!(_mm_min_sh(a, b), 0)
11129    }
11130}
11131
11132/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11133/// minimum of all elements in a.
11134///
11135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
11136#[inline]
11137#[target_feature(enable = "avx512fp16,avx512vl")]
11138#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11139pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
11140    unsafe {
11141        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11142        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11143        _mm_reduce_min_ph(_mm_min_ph(p, q))
11144    }
11145}
11146
11147/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11148/// minimum of all elements in a.
11149///
11150/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
11151#[inline]
11152#[target_feature(enable = "avx512fp16")]
11153#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11154pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
11155    unsafe {
11156        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11157        let q = simd_shuffle!(
11158            a,
11159            a,
11160            [
11161                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11162            ]
11163        );
11164        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
11165    }
11166}
11167
11168/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11169/// maximum of all elements in a.
11170///
11171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
11172#[inline]
11173#[target_feature(enable = "avx512fp16,avx512vl")]
11174#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11175pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
11176    unsafe {
11177        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11178        let a = _mm_max_ph(a, b);
11179        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11180        let a = _mm_max_ph(a, b);
11181        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11182        simd_extract!(_mm_max_sh(a, b), 0)
11183    }
11184}
11185
11186/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11187/// maximum of all elements in a.
11188///
11189/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
11190#[inline]
11191#[target_feature(enable = "avx512fp16,avx512vl")]
11192#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11193pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
11194    unsafe {
11195        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11196        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11197        _mm_reduce_max_ph(_mm_max_ph(p, q))
11198    }
11199}
11200
11201/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11202/// maximum of all elements in a.
11203///
11204/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
11205#[inline]
11206#[target_feature(enable = "avx512fp16")]
11207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11208pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11209    unsafe {
11210        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11211        let q = simd_shuffle!(
11212            a,
11213            a,
11214            [
11215                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11216            ]
11217        );
11218        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11219    }
11220}
11221
11222macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
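    // Unmasked form: classify every lane of `$a` and write the resulting bitmask to a mask register.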
11223    ($mask_type: ty, $reg: ident, $a: expr) => {{
11224        let dst: $mask_type;
11225        asm!(
11226            "vfpclassph {k}, {src}, {imm8}",
11227            k = lateout(kreg) dst,
11228            src = in($reg) $a,
11229            imm8 = const IMM8,
11230            options(pure, nomem, nostack)
11231        );
11232        dst
11233    }};
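    // Masked form: same classification, but only lanes whose bit is set in `$mask` can set a result bit.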
11234    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11235        let dst: $mask_type;
11236        asm!(
11237            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11238            k = lateout(kreg) dst,
11239            mask = in(kreg) $mask,
11240            src = in($reg) $a,
11241            imm8 = const IMM8,
11242            options(pure, nomem, nostack)
11243        );
11244        dst
11245    }};
11246}
11247
11248/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11249/// by imm8, and store the results in mask vector k.
11250/// imm can be a combination of:
11251///
11252///     0x01 // QNaN
11253///     0x02 // Positive Zero
11254///     0x04 // Negative Zero
11255///     0x08 // Positive Infinity
11256///     0x10 // Negative Infinity
11257///     0x20 // Denormal
11258///     0x40 // Negative
11259///     0x80 // SNaN
11260///
11261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
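///
/// For example, `IMM8 = 0x81` (QNaN | SNaN) tests each lane for being any kind of NaN. A minimal
/// usage sketch, assuming nightly `f16`/AVX512-FP16 support and hypothetical values (not compiled
/// here):
///
/// ```ignore
/// let a = _mm_set_ph(1.0, f16::NAN, 0.5, -0.0, 3.5, f16::INFINITY, -1.0, 2.0);
/// // Only lane 6 (the NaN) matches, so the mask is expected to be 0b0100_0000.
/// let nan_lanes = _mm_fpclass_ph_mask::<0x81>(a);
/// ```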
11262#[inline]
11263#[target_feature(enable = "avx512fp16,avx512vl")]
11264#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11265#[rustc_legacy_const_generics(1)]
11266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11267pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11268    unsafe {
11269        static_assert_uimm_bits!(IMM8, 8);
11270        fpclass_asm!(__mmask8, xmm_reg, a)
11271    }
11272}
11273
11274/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11275/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11276/// corresponding mask bit is not set).
11277/// imm can be a combination of:
11278///
11279///     0x01 // QNaN
11280///     0x02 // Positive Zero
11281///     0x04 // Negative Zero
11282///     0x08 // Positive Infinity
11283///     0x10 // Negative Infinity
11284///     0x20 // Denormal
11285///     0x40 // Negative
11286///     0x80 // SNaN
11287///
11288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11289#[inline]
11290#[target_feature(enable = "avx512fp16,avx512vl")]
11291#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11292#[rustc_legacy_const_generics(2)]
11293#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11294pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11295    unsafe {
11296        static_assert_uimm_bits!(IMM8, 8);
11297        fpclass_asm!(__mmask8, k1, xmm_reg, a)
11298    }
11299}
11300
11301/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11302/// by imm8, and store the results in mask vector k.
11303/// imm can be a combination of:
11304///
11305///     0x01 // QNaN
11306///     0x02 // Positive Zero
11307///     0x04 // Negative Zero
11308///     0x08 // Positive Infinity
11309///     0x10 // Negative Infinity
11310///     0x20 // Denormal
11311///     0x40 // Negative
11312///     0x80 // SNaN
11313///
11314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11315#[inline]
11316#[target_feature(enable = "avx512fp16,avx512vl")]
11317#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11318#[rustc_legacy_const_generics(1)]
11319#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11320pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11321    unsafe {
11322        static_assert_uimm_bits!(IMM8, 8);
11323        fpclass_asm!(__mmask16, ymm_reg, a)
11324    }
11325}
11326
11327/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11328/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11329/// corresponding mask bit is not set).
11330/// imm can be a combination of:
11331///
11332///     0x01 // QNaN
11333///     0x02 // Positive Zero
11334///     0x04 // Negative Zero
11335///     0x08 // Positive Infinity
11336///     0x10 // Negative Infinity
11337///     0x20 // Denormal
11338///     0x40 // Negative
11339///     0x80 // SNaN
11340///
11341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11342#[inline]
11343#[target_feature(enable = "avx512fp16,avx512vl")]
11344#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11345#[rustc_legacy_const_generics(2)]
11346#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11347pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11348    unsafe {
11349        static_assert_uimm_bits!(IMM8, 8);
11350        fpclass_asm!(__mmask16, k1, ymm_reg, a)
11351    }
11352}
11353
11354/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11355/// by imm8, and store the results in mask vector k.
11356/// imm can be a combination of:
11357///
11358///     0x01 // QNaN
11359///     0x02 // Positive Zero
11360///     0x04 // Negative Zero
11361///     0x08 // Positive Infinity
11362///     0x10 // Negative Infinity
11363///     0x20 // Denormal
11364///     0x40 // Negative
11365///     0x80 // SNaN
11366///
11367/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11368#[inline]
11369#[target_feature(enable = "avx512fp16")]
11370#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11371#[rustc_legacy_const_generics(1)]
11372#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11373pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11374    unsafe {
11375        static_assert_uimm_bits!(IMM8, 8);
11376        fpclass_asm!(__mmask32, zmm_reg, a)
11377    }
11378}
11379
11380/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11381/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11382/// corresponding mask bit is not set).
11383/// imm can be a combination of:
11384///
11385///     0x01 // QNaN
11386///     0x02 // Positive Zero
11387///     0x04 // Negative Zero
11388///     0x08 // Positive Infinity
11389///     0x10 // Negative Infinity
11390///     0x20 // Denormal
11391///     0x40 // Negative
11392///     0x80 // SNaN
11393///
11394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11395#[inline]
11396#[target_feature(enable = "avx512fp16")]
11397#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11398#[rustc_legacy_const_generics(2)]
11399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11400pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11401    unsafe {
11402        static_assert_uimm_bits!(IMM8, 8);
11403        fpclass_asm!(__mmask32, k1, zmm_reg, a)
11404    }
11405}
11406
11407/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11408/// by imm8, and store the result in mask vector k.
11409/// imm can be a combination of:
11410///
11411///     0x01 // QNaN
11412///     0x02 // Positive Zero
11413///     0x04 // Negative Zero
11414///     0x08 // Positive Infinity
11415///     0x10 // Negative Infinity
11416///     0x20 // Denormal
11417///     0x40 // Negative
11418///     0x80 // SNaN
11419///
11420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11421#[inline]
11422#[target_feature(enable = "avx512fp16")]
11423#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11424#[rustc_legacy_const_generics(1)]
11425#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11426pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11427    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11428}
11429
11430/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11431/// by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when
11432/// mask bit 0 is not set).
11433/// imm can be a combination of:
11434///
11435///     0x01 // QNaN
11436///     0x02 // Positive Zero
11437///     0x04 // Negative Zero
11438///     0x08 // Positive Infinity
11439///     0x10 // Negative Infinity
11440///     0x20 // Denormal
11441///     0x40 // Negative
11442///     0x80 // SNaN
11443///
11444/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11445#[inline]
11446#[target_feature(enable = "avx512fp16")]
11447#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11448#[rustc_legacy_const_generics(2)]
11449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11450pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11451    unsafe {
11452        static_assert_uimm_bits!(IMM8, 8);
11453        vfpclasssh(a, IMM8, k1)
11454    }
11455}
11456
11457/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11458/// and store the results in dst.
11459///
11460/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
11461#[inline]
11462#[target_feature(enable = "avx512fp16,avx512vl")]
11463#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11464pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
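    // Bit i of `k` selects lane i of `b` when set and lane i of `a` when clear.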
11465    unsafe { simd_select_bitmask(k, b, a) }
11466}
11467
11468/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11469/// and store the results in dst.
11470///
11471/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11472#[inline]
11473#[target_feature(enable = "avx512fp16,avx512vl")]
11474#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11475pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11476    unsafe { simd_select_bitmask(k, b, a) }
11477}
11478
11479/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11480/// and store the results in dst.
11481///
11482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11483#[inline]
11484#[target_feature(enable = "avx512fp16")]
11485#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11486pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11487    unsafe { simd_select_bitmask(k, b, a) }
11488}
11489
11490/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11491/// and index in idx, and store the results in dst.
11492///
11493/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
11494#[inline]
11495#[target_feature(enable = "avx512fp16,avx512vl")]
11496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11497pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
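    // Lane permutation never inspects lane contents, so bit-cast to 16-bit integers, reuse the
    // epi16 implementation, and bit-cast the result back to half-precision lanes.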
11498    _mm_castsi128_ph(_mm_permutex2var_epi16(
11499        _mm_castph_si128(a),
11500        idx,
11501        _mm_castph_si128(b),
11502    ))
11503}
11504
11505/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11506/// and index in idx, and store the results in dst.
11507///
11508/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11509#[inline]
11510#[target_feature(enable = "avx512fp16,avx512vl")]
11511#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11512pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11513    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11514        _mm256_castph_si256(a),
11515        idx,
11516        _mm256_castph_si256(b),
11517    ))
11518}
11519
11520/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11521/// and index in idx, and store the results in dst.
11522///
11523/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11524#[inline]
11525#[target_feature(enable = "avx512fp16")]
11526#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11527pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11528    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11529        _mm512_castph_si512(a),
11530        idx,
11531        _mm512_castph_si512(b),
11532    ))
11533}
11534
11535/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11536/// and store the results in dst.
11537///
11538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
11539#[inline]
11540#[target_feature(enable = "avx512fp16,avx512vl")]
11541#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11542pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11543    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11544}
11545
11546/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11547/// and store the results in dst.
11548///
11549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11550#[inline]
11551#[target_feature(enable = "avx512fp16,avx512vl")]
11552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11553pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11554    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11555}
11556
11557/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11558/// and store the results in dst.
11559///
11560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11561#[inline]
11562#[target_feature(enable = "avx512fp16")]
11563#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11564pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11565    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11566}
11567
11568/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11569/// and store the results in dst.
11570///
11571/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11572#[inline]
11573#[target_feature(enable = "avx512fp16,avx512vl")]
11574#[cfg_attr(test, assert_instr(vcvtw2ph))]
11575#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11576pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11577    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
11578}
11579
11580/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11581/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11582/// mask bit is not set).
11583///
11584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11585#[inline]
11586#[target_feature(enable = "avx512fp16,avx512vl")]
11587#[cfg_attr(test, assert_instr(vcvtw2ph))]
11588#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11589pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
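    // Keep converted lanes where the mask bit is set; take the corresponding lane of `src` otherwise.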
11590    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
11591}
11592
11593/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11594/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11595///
11596/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11597#[inline]
11598#[target_feature(enable = "avx512fp16,avx512vl")]
11599#[cfg_attr(test, assert_instr(vcvtw2ph))]
11600#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11601pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11602    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11603}
11604
11605/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11606/// and store the results in dst.
11607///
11608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11609#[inline]
11610#[target_feature(enable = "avx512fp16,avx512vl")]
11611#[cfg_attr(test, assert_instr(vcvtw2ph))]
11612#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11613pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11614    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
11615}
11616
11617/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11618/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11619/// mask bit is not set).
11620///
11621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11622#[inline]
11623#[target_feature(enable = "avx512fp16,avx512vl")]
11624#[cfg_attr(test, assert_instr(vcvtw2ph))]
11625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11626pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11627    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
11628}
11629
11630/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11631/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11632///
11633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11634#[inline]
11635#[target_feature(enable = "avx512fp16,avx512vl")]
11636#[cfg_attr(test, assert_instr(vcvtw2ph))]
11637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11638pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11639    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11640}
11641
11642/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11643/// and store the results in dst.
11644///
11645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11646#[inline]
11647#[target_feature(enable = "avx512fp16")]
11648#[cfg_attr(test, assert_instr(vcvtw2ph))]
11649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11650pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11651    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
11652}
11653
11654/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11655/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11656/// mask bit is not set).
11657///
11658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11659#[inline]
11660#[target_feature(enable = "avx512fp16")]
11661#[cfg_attr(test, assert_instr(vcvtw2ph))]
11662#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11663pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11664    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
11665}
11666
11667/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11668/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11669///
11670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11671#[inline]
11672#[target_feature(enable = "avx512fp16")]
11673#[cfg_attr(test, assert_instr(vcvtw2ph))]
11674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11675pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11676    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11677}
11678
11679/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11680/// and store the results in dst.
11681///
11682/// Rounding is done according to the rounding parameter, which can be one of:
11683///
11684/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11685/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11686/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11687/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11688/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11689///
11690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
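///
/// A minimal usage sketch, assuming nightly AVX512-FP16 support and a hypothetical `__m512i` of
/// signed 16-bit integers `a` (not compiled here):
///
/// ```ignore
/// // Convert toward zero and suppress floating-point exceptions.
/// let h = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
/// ```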
11691#[inline]
11692#[target_feature(enable = "avx512fp16")]
11693#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11694#[rustc_legacy_const_generics(1)]
11695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11696pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11697    unsafe {
11698        static_assert_rounding!(ROUNDING);
11699        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11700    }
11701}
11702
11703/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11704/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11705/// mask bit is not set).
11706///
11707/// Rounding is done according to the rounding parameter, which can be one of:
11708///
11709/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11710/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11711/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11712/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11713/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11714///
11715/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11716#[inline]
11717#[target_feature(enable = "avx512fp16")]
11718#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11719#[rustc_legacy_const_generics(3)]
11720#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11721pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11722    src: __m512h,
11723    k: __mmask32,
11724    a: __m512i,
11725) -> __m512h {
11726    unsafe {
11727        static_assert_rounding!(ROUNDING);
11728        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11729    }
11730}
11731
11732/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11733/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11734///
11735/// Rounding is done according to the rounding parameter, which can be one of:
11736///
11737/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11738/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11739/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11740/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11741/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11742///
11743/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11744#[inline]
11745#[target_feature(enable = "avx512fp16")]
11746#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11747#[rustc_legacy_const_generics(2)]
11748#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11749pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11750    static_assert_rounding!(ROUNDING);
11751    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11752}
11753
11754/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11755/// and store the results in dst.
11756///
11757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
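///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` and `avx512vl` are
/// available:
///
/// ```ignore
/// // 8 unsigned 16-bit integers become the 8 f16 lanes of a __m128h.
/// let a = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
/// let h = _mm_cvtepu16_ph(a);
/// // From lane 0 upward, `h` holds [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0].
/// ```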
11758#[inline]
11759#[target_feature(enable = "avx512fp16,avx512vl")]
11760#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11762pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11763    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
11764}
11765
11766/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11767/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11768/// mask bit is not set).
11769///
11770/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
11771#[inline]
11772#[target_feature(enable = "avx512fp16,avx512vl")]
11773#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11774#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11775pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11776    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
11777}
11778
11779/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11780/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11781///
11782/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
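///
/// # Example
///
/// A minimal sketch (not a doctest) of the zeromask behaviour, assuming `avx512fp16`
/// and `avx512vl` are available:
///
/// ```ignore
/// let a = _mm_set1_epi16(4);
/// // Only the low two lanes are converted; the remaining six are zeroed.
/// let h = _mm_maskz_cvtepu16_ph(0b0000_0011, a);
/// // `h` holds [4.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] as f16.
/// ```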
11783#[inline]
11784#[target_feature(enable = "avx512fp16,avx512vl")]
11785#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11786#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11787pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11788    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11789}
11790
11791/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11792/// and store the results in dst.
11793///
11794/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11795#[inline]
11796#[target_feature(enable = "avx512fp16,avx512vl")]
11797#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11798#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11799pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11800    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
11801}
11802
11803/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11804/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11805/// mask bit is not set).
11806///
11807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11808#[inline]
11809#[target_feature(enable = "avx512fp16,avx512vl")]
11810#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11812pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11813    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
11814}
11815
11816/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11817/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11818///
11819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11820#[inline]
11821#[target_feature(enable = "avx512fp16,avx512vl")]
11822#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11823#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11824pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11825    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11826}
11827
11828/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11829/// and store the results in dst.
11830///
11831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
11832#[inline]
11833#[target_feature(enable = "avx512fp16")]
11834#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11836pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11837    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
11838}
11839
11840/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11841/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11842/// mask bit is not set).
11843///
11844/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
11845#[inline]
11846#[target_feature(enable = "avx512fp16")]
11847#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11848#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11849pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11850    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
11851}
11852
11853/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11854/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11855///
11856/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
11857#[inline]
11858#[target_feature(enable = "avx512fp16")]
11859#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11861pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
11862    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
11863}
11864
11865/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11866/// and store the results in dst.
11867///
11868/// Rounding is done according to the rounding parameter, which can be one of:
11869///
11870/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11871/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11872/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11873/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11874/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11875///
11876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
11877#[inline]
11878#[target_feature(enable = "avx512fp16")]
11879#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11880#[rustc_legacy_const_generics(1)]
11881#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11882pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11883    unsafe {
11884        static_assert_rounding!(ROUNDING);
11885        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
11886    }
11887}
11888
11889/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11890/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11891/// mask bit is not set).
11892///
11893/// Rounding is done according to the rounding parameter, which can be one of:
11894///
11895/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11896/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11897/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11898/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11899/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11900///
11901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
11902#[inline]
11903#[target_feature(enable = "avx512fp16")]
11904#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11905#[rustc_legacy_const_generics(3)]
11906#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11907pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
11908    src: __m512h,
11909    k: __mmask32,
11910    a: __m512i,
11911) -> __m512h {
11912    unsafe {
11913        static_assert_rounding!(ROUNDING);
11914        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
11915    }
11916}
11917
11918/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11919/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11920///
11921/// Rounding is done according to the rounding parameter, which can be one of:
11922///
11923/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11924/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11925/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11926/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11928///
11929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
11930#[inline]
11931#[target_feature(enable = "avx512fp16")]
11932#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11933#[rustc_legacy_const_generics(2)]
11934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11935pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11936    static_assert_rounding!(ROUNDING);
11937    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11938}
11939
11940/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11941/// and store the results in dst. The upper 64 bits of dst are zeroed out.
11942///
11943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
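///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` and `avx512vl` are
/// available:
///
/// ```ignore
/// let a = _mm_set_epi32(-4, 3, -2, 1);
/// let h = _mm_cvtepi32_ph(a);
/// // The lower four f16 lanes are [1.0, -2.0, 3.0, -4.0]; the upper four are zero.
/// ```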
11944#[inline]
11945#[target_feature(enable = "avx512fp16,avx512vl")]
11946#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11947#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11948pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
11949    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
11950}
11951
11952/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11953/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11954/// mask bit is not set). The upper 64 bits of dst are zeroed out.
11955///
11956/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
11957#[inline]
11958#[target_feature(enable = "avx512fp16,avx512vl")]
11959#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11960#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11961pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11962    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
11963}
11964
11965/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11966/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11967/// The upper 64 bits of dst are zeroed out.
11968///
11969/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
11970#[inline]
11971#[target_feature(enable = "avx512fp16,avx512vl")]
11972#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11973#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11974pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
11975    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
11976}
11977
11978/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11979/// and store the results in dst.
11980///
11981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
11982#[inline]
11983#[target_feature(enable = "avx512fp16,avx512vl")]
11984#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11985#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11986pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
11987    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
11988}
11989
11990/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
11991/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11992/// mask bit is not set).
11993///
11994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
11995#[inline]
11996#[target_feature(enable = "avx512fp16,avx512vl")]
11997#[cfg_attr(test, assert_instr(vcvtdq2ph))]
11998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11999pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12000    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
12001}
12002
12003/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12004/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12005///
12006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
12007#[inline]
12008#[target_feature(enable = "avx512fp16,avx512vl")]
12009#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12010#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12011pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
12012    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12013}
12014
12015/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12016/// and store the results in dst.
12017///
12018/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
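///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` (plus `avx512f` for the
/// integer splat) is available:
///
/// ```ignore
/// // 16 signed 32-bit integers narrow into the 16 f16 lanes of a __m256h.
/// let a = _mm512_set1_epi32(-100);
/// let h: __m256h = _mm512_cvtepi32_ph(a);
/// // Every lane of `h` is -100.0 as f16.
/// ```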
12019#[inline]
12020#[target_feature(enable = "avx512fp16")]
12021#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12022#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12023pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
12024    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
12025}
12026
12027/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12028/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12029/// mask bit is not set).
12030///
12031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
12032#[inline]
12033#[target_feature(enable = "avx512fp16")]
12034#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12035#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12036pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12037    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
12038}
12039
12040/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12041/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12042///
12043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
12044#[inline]
12045#[target_feature(enable = "avx512fp16")]
12046#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12048pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
12049    _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a)
12050}
12051
12052/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12053/// and store the results in dst.
12054///
12055/// Rounding is done according to the rounding parameter, which can be one of:
12056///
12057/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12058/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12059/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12060/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12061/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12062///
12063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
12064#[inline]
12065#[target_feature(enable = "avx512fp16")]
12066#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12067#[rustc_legacy_const_generics(1)]
12068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12069pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12070    unsafe {
12071        static_assert_rounding!(ROUNDING);
12072        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
12073    }
12074}
12075
12076/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12077/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12078/// mask bit is not set).
12079///
12080/// Rounding is done according to the rounding parameter, which can be one of:
12081///
12082/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12083/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12084/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12085/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12086/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12087///
12088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
12089#[inline]
12090#[target_feature(enable = "avx512fp16")]
12091#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12092#[rustc_legacy_const_generics(3)]
12093#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12094pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
12095    src: __m256h,
12096    k: __mmask16,
12097    a: __m512i,
12098) -> __m256h {
12099    unsafe {
12100        static_assert_rounding!(ROUNDING);
12101        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
12102    }
12103}
12104
12105/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12106/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12107///
12108/// Rounding is done according to the rounding parameter, which can be one of:
12109///
12110/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12111/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12112/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12113/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12114/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12115///
12116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
12117#[inline]
12118#[target_feature(enable = "avx512fp16")]
12119#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12120#[rustc_legacy_const_generics(2)]
12121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12122pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12123    static_assert_rounding!(ROUNDING);
12124    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12125}
12126
12127/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12128/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12129/// of dst.
12130///
12131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
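///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` is available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let r = _mm_cvti32_sh(a, -7);
/// // Lane 0 of `r` is -7.0; lanes 1..=7 are copied from `a` (all 2.0).
/// ```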
12132#[inline]
12133#[target_feature(enable = "avx512fp16")]
12134#[cfg_attr(test, assert_instr(vcvtsi2sh))]
12135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12136pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
12137    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12138}
12139
12140/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12141/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12142/// of dst.
12143///
12144/// Rounding is done according to the rounding parameter, which can be one of:
12145///
12146/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12147/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12148/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12149/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12150/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12151///
12152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
12153#[inline]
12154#[target_feature(enable = "avx512fp16")]
12155#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
12156#[rustc_legacy_const_generics(2)]
12157#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12158pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
12159    unsafe {
12160        static_assert_rounding!(ROUNDING);
12161        vcvtsi2sh(a, b, ROUNDING)
12162    }
12163}
12164
12165/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12166/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12167///
12168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
12169#[inline]
12170#[target_feature(enable = "avx512fp16,avx512vl")]
12171#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12173pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
12174    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
12175}
12176
12177/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12178/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12179/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12180///
12181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
12182#[inline]
12183#[target_feature(enable = "avx512fp16,avx512vl")]
12184#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12186pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12187    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
12188}
12189
12190/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12191/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12192/// The upper 64 bits of dst are zeroed out.
12193///
12194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
12195#[inline]
12196#[target_feature(enable = "avx512fp16,avx512vl")]
12197#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12199pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
12200    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12201}
12202
12203/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12204/// and store the results in dst.
12205///
12206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12207#[inline]
12208#[target_feature(enable = "avx512fp16,avx512vl")]
12209#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12210#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12211pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12212    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12213}
12214
12215/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12216/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12217/// mask bit is not set).
12218///
12219/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12220#[inline]
12221#[target_feature(enable = "avx512fp16,avx512vl")]
12222#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12223#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12224pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12225    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12226}
12227
12228/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12229/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12230///
12231/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12232#[inline]
12233#[target_feature(enable = "avx512fp16,avx512vl")]
12234#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12235#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12236pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12237    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12238}
12239
12240/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12241/// and store the results in dst.
12242///
12243/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
12244#[inline]
12245#[target_feature(enable = "avx512fp16")]
12246#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12247#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12248pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12249    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12250}
12251
12252/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12253/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12254/// mask bit is not set).
12255///
12256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12257#[inline]
12258#[target_feature(enable = "avx512fp16")]
12259#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12261pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12262    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12263}
12264
12265/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12266/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12267///
12268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12269#[inline]
12270#[target_feature(enable = "avx512fp16")]
12271#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12273pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12274    _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a)
12275}
12276
12277/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12278/// and store the results in dst.
12279///
12280/// Rounding is done according to the rounding parameter, which can be one of:
12281///
12282/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12283/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12284/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12285/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12286/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12287///
12288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12289#[inline]
12290#[target_feature(enable = "avx512fp16")]
12291#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12292#[rustc_legacy_const_generics(1)]
12293#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12294pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12295    unsafe {
12296        static_assert_rounding!(ROUNDING);
12297        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12298    }
12299}
12300
12301/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12302/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12303/// mask bit is not set).
12304///
12305/// Rounding is done according to the rounding parameter, which can be one of:
12306///
12307/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12308/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12309/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12310/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12311/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12312///
12313/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12314#[inline]
12315#[target_feature(enable = "avx512fp16")]
12316#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12317#[rustc_legacy_const_generics(3)]
12318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12319pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12320    src: __m256h,
12321    k: __mmask16,
12322    a: __m512i,
12323) -> __m256h {
12324    unsafe {
12325        static_assert_rounding!(ROUNDING);
12326        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12327    }
12328}
12329
12330/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12331/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12332///
12333/// Rounding is done according to the rounding parameter, which can be one of:
12334///
12335/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12336/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12337/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12338/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12339/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12340///
12341/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12342#[inline]
12343#[target_feature(enable = "avx512fp16")]
12344#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12345#[rustc_legacy_const_generics(2)]
12346#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12347pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12348    static_assert_rounding!(ROUNDING);
12349    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12350}
12351
12352/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12353/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12354/// of dst.
12355///
12356/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
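///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` is available:
///
/// ```ignore
/// let a = _mm_set1_ph(0.5);
/// let r = _mm_cvtu32_sh(a, 3);
/// // Lane 0 of `r` is 3.0; lanes 1..=7 are copied from `a` (all 0.5).
/// ```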
12357#[inline]
12358#[target_feature(enable = "avx512fp16")]
12359#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12361pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12362    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12363}
12364
12365/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12366/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12367/// of dst.
12368///
12369/// Rounding is done according to the rounding parameter, which can be one of:
12370///
12371/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12372/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12373/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12374/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12375/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12376///
12377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
12378#[inline]
12379#[target_feature(enable = "avx512fp16")]
12380#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12381#[rustc_legacy_const_generics(2)]
12382#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12383pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12384    unsafe {
12385        static_assert_rounding!(ROUNDING);
12386        vcvtusi2sh(a, b, ROUNDING)
12387    }
12388}
12389
12390/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12391/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12392///
12393/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
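///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` and `avx512vl` are
/// available:
///
/// ```ignore
/// let a = _mm_set_epi64x(-8, 5);
/// let h = _mm_cvtepi64_ph(a);
/// // f16 lanes 0 and 1 are [5.0, -8.0]; lanes 2..=7 are zero.
/// ```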
12394#[inline]
12395#[target_feature(enable = "avx512fp16,avx512vl")]
12396#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12397#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12398pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12399    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12400}
12401
12402/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12403/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12404/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12405///
12406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12407#[inline]
12408#[target_feature(enable = "avx512fp16,avx512vl")]
12409#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12411pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12412    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12413}
12414
12415/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12416/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12417/// The upper 96 bits of dst are zeroed out.
12418///
12419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12420#[inline]
12421#[target_feature(enable = "avx512fp16,avx512vl")]
12422#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12423#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12424pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12425    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12426}
12427
12428/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12429/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12430///
12431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12432#[inline]
12433#[target_feature(enable = "avx512fp16,avx512vl")]
12434#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12436pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12437    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12438}
12439
12440/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12441/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12442/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12443///
12444/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12445#[inline]
12446#[target_feature(enable = "avx512fp16,avx512vl")]
12447#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12448#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12449pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12450    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12451}
12452
12453/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12454/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12455/// The upper 64 bits of dst are zeroed out.
12456///
12457/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12458#[inline]
12459#[target_feature(enable = "avx512fp16,avx512vl")]
12460#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12461#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12462pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12463    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12464}
12465
12466/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12467/// and store the results in dst.
12468///
12469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
12470#[inline]
12471#[target_feature(enable = "avx512fp16")]
12472#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12474pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12475    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12476}
12477
12478/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12479/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12480/// mask bit is not set).
12481///
12482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12483#[inline]
12484#[target_feature(enable = "avx512fp16")]
12485#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12486#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12487pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12488    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12489}
12490
12491/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12492/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12493///
12494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12495#[inline]
12496#[target_feature(enable = "avx512fp16")]
12497#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12498#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12499pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12500    _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12501}
12502
12503/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12504/// and store the results in dst.
12505///
12506/// Rounding is done according to the rounding parameter, which can be one of:
12507///
12508/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12509/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12510/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12511/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12512/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12513///
12514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12515#[inline]
12516#[target_feature(enable = "avx512fp16")]
12517#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12518#[rustc_legacy_const_generics(1)]
12519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12520pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12521    unsafe {
12522        static_assert_rounding!(ROUNDING);
12523        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12524    }
12525}
12526
12527/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12528/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12529/// mask bit is not set).
12530///
12531/// Rounding is done according to the rounding parameter, which can be one of:
12532///
12533/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12534/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12535/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12536/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12537/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12538///
12539/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12540#[inline]
12541#[target_feature(enable = "avx512fp16")]
12542#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12543#[rustc_legacy_const_generics(3)]
12544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12545pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12546    src: __m128h,
12547    k: __mmask8,
12548    a: __m512i,
12549) -> __m128h {
12550    unsafe {
12551        static_assert_rounding!(ROUNDING);
12552        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12553    }
12554}
12555
12556/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12557/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12558///
12559/// Rounding is done according to the rounding parameter, which can be one of:
12560///
12561/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12562/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12563/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12564/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12565/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12566///
12567/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12568#[inline]
12569#[target_feature(enable = "avx512fp16")]
12570#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12571#[rustc_legacy_const_generics(2)]
12572#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12573pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12574    static_assert_rounding!(ROUNDING);
12575    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
12576}
12577
12578/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12579/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12580///
12581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12582#[inline]
12583#[target_feature(enable = "avx512fp16,avx512vl")]
12584#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12586pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12587    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12588}
12589
12590/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12591/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12592/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12593///
12594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12595#[inline]
12596#[target_feature(enable = "avx512fp16,avx512vl")]
12597#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12598#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12599pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12600    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12601}
12602
12603/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12604/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12605/// The upper 96 bits of dst are zeroed out.
12606///
12607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12608#[inline]
12609#[target_feature(enable = "avx512fp16,avx512vl")]
12610#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12612pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12613    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12614}
12615
12616/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12617/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12618///
12619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12620#[inline]
12621#[target_feature(enable = "avx512fp16,avx512vl")]
12622#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12624pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12625    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12626}
12627
12628/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12629/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12630/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12631///
12632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12633#[inline]
12634#[target_feature(enable = "avx512fp16,avx512vl")]
12635#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12637pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12638    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12639}
12640
12641/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12642/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12643/// The upper 64 bits of dst are zeroed out.
12644///
12645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12646#[inline]
12647#[target_feature(enable = "avx512fp16,avx512vl")]
12648#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12650pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12651    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12652}
12653
12654/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12655/// and store the results in dst.
12656///
12657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
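///
/// # Example
///
/// A minimal sketch (not a doctest), assuming `avx512fp16` (plus `avx512f` for the
/// integer splat) is available:
///
/// ```ignore
/// // 8 unsigned 64-bit integers narrow into the 8 f16 lanes of a __m128h.
/// let a = _mm512_set1_epi64(1000);
/// let h: __m128h = _mm512_cvtepu64_ph(a);
/// // Every lane of `h` is 1000.0 as f16.
/// ```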
12658#[inline]
12659#[target_feature(enable = "avx512fp16")]
12660#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12661#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12662pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12663    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12664}
12665
12666/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12667/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12668/// mask bit is not set).
12669///
12670/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12671#[inline]
12672#[target_feature(enable = "avx512fp16")]
12673#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12675pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12676    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12677}
12678
12679/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12680/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12681///
12682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
12683#[inline]
12684#[target_feature(enable = "avx512fp16")]
12685#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12687pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
12688    _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12689}
12690
12691/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12692/// and store the results in dst.
12693///
12694/// Rounding is done according to the rounding parameter, which can be one of:
12695///
12696/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12697/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12698/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12699/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12700/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12701///
12702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
12703#[inline]
12704#[target_feature(enable = "avx512fp16")]
12705#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12706#[rustc_legacy_const_generics(1)]
12707#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12708pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12709    unsafe {
12710        static_assert_rounding!(ROUNDING);
12711        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
12712    }
12713}
12714
12715/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12716/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12717/// mask bit is not set).
12718///
12719/// Rounding is done according to the rounding parameter, which can be one of:
12720///
12721/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12722/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12723/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12724/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12725/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12726///
12727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
12728#[inline]
12729#[target_feature(enable = "avx512fp16")]
12730#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12731#[rustc_legacy_const_generics(3)]
12732#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12733pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
12734    src: __m128h,
12735    k: __mmask8,
12736    a: __m512i,
12737) -> __m128h {
12738    unsafe {
12739        static_assert_rounding!(ROUNDING);
12740        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
12741    }
12742}
12743
12744/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12745/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12746///
12747/// Rounding is done according to the rounding parameter, which can be one of:
12748///
12749/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12750/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12751/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12752/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12753/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12754///
12755/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
12756#[inline]
12757#[target_feature(enable = "avx512fp16")]
12758#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12759#[rustc_legacy_const_generics(2)]
12760#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12761pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12762    static_assert_rounding!(ROUNDING);
12763    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
12764}
12765
12766/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12767/// floating-point elements, and store the results in dst.
12768///
12769/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
12770#[inline]
12771#[target_feature(enable = "avx512fp16,avx512vl")]
12772#[cfg_attr(test, assert_instr(vcvtps2phx))]
12773#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12774pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
12775    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12776}
12777
12778/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12779/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12780/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12781///
12782/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
12783#[inline]
12784#[target_feature(enable = "avx512fp16,avx512vl")]
12785#[cfg_attr(test, assert_instr(vcvtps2phx))]
12786#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12787pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
12788    unsafe { vcvtps2phx_128(a, src, k) }
12789}
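// Illustrative usage sketch (not from the original source): converting only the even
// lanes and taking the odd lanes from `src`; all values here are hypothetical.
//
//     let src = _mm_set1_ph(9.0);
//     let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
//     let r = _mm_mask_cvtxps_ph(src, 0b0101, a); // lanes 0 and 2 converted, lanes 1 and 3 copied from src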
12790
12791/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12792/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12793/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12794///
12795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
12796#[inline]
12797#[target_feature(enable = "avx512fp16,avx512vl")]
12798#[cfg_attr(test, assert_instr(vcvtps2phx))]
12799#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12800pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
12801    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12802}
12803
12804/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12805/// floating-point elements, and store the results in dst.
12806///
12807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
12808#[inline]
12809#[target_feature(enable = "avx512fp16,avx512vl")]
12810#[cfg_attr(test, assert_instr(vcvtps2phx))]
12811#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12812pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
12813    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12814}
12815
12816/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12817/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12818/// when the corresponding mask bit is not set).
12819///
12820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
12821#[inline]
12822#[target_feature(enable = "avx512fp16,avx512vl")]
12823#[cfg_attr(test, assert_instr(vcvtps2phx))]
12824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12825pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
12826    unsafe { vcvtps2phx_256(a, src, k) }
12827}
12828
12829/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12830/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12831/// corresponding mask bit is not set).
12832///
12833/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
12834#[inline]
12835#[target_feature(enable = "avx512fp16,avx512vl")]
12836#[cfg_attr(test, assert_instr(vcvtps2phx))]
12837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12838pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
12839    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12840}
12841
12842/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12843/// floating-point elements, and store the results in dst.
12844///
12845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
12846#[inline]
12847#[target_feature(enable = "avx512fp16")]
12848#[cfg_attr(test, assert_instr(vcvtps2phx))]
12849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12850pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12851    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a)
12852}
12853
12854/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12855/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12856/// when the corresponding mask bit is not set).
12857///
12858/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
12859#[inline]
12860#[target_feature(enable = "avx512fp16")]
12861#[cfg_attr(test, assert_instr(vcvtps2phx))]
12862#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12863pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
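    // _MM_FROUND_CUR_DIRECTION: round according to the mode currently set in MXCSR.RC.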
12864    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
12865}
12866
12867/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12868/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12869/// corresponding mask bit is not set).
12870///
12871/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12872#[inline]
12873#[target_feature(enable = "avx512fp16")]
12874#[cfg_attr(test, assert_instr(vcvtps2phx))]
12875#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12876pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12877    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a)
12878}
12879
12880/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12881/// floating-point elements, and store the results in dst.
12882///
12883/// Rounding is done according to the rounding parameter, which can be one of:
12884///
12885/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12886/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12887/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12888/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12889/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12890///
12891/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12892#[inline]
12893#[target_feature(enable = "avx512fp16")]
12894#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12895#[rustc_legacy_const_generics(1)]
12896#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12897pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
12898    static_assert_rounding!(ROUNDING);
12899    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), 0xffff, a)
12900}
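// Illustrative usage sketch (not from the original source): a full-width f32 -> f16
// conversion with round-to-nearest and exceptions suppressed.
//
//     let a = _mm512_set1_ps(1.5);
//     let h = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);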
12901
12902/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12903/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12904/// when the corresponding mask bit is not set).
12905///
12906/// Rounding is done according to the rounding parameter, which can be one of:
12907///
12908/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12909/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12910/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12911/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12912/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12913///
12914/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
12915#[inline]
12916#[target_feature(enable = "avx512fp16")]
12917#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12918#[rustc_legacy_const_generics(3)]
12919#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12920pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
12921    src: __m256h,
12922    k: __mmask16,
12923    a: __m512,
12924) -> __m256h {
12925    unsafe {
12926        static_assert_rounding!(ROUNDING);
12927        vcvtps2phx_512(a, src, k, ROUNDING)
12928    }
12929}
12930
12931/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12932/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12933/// corresponding mask bit is not set).
12934///
12935/// Rounding is done according to the rounding parameter, which can be one of:
12936///
12937/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12938/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12939/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12940/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12941/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12942///
12943/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
12944#[inline]
12945#[target_feature(enable = "avx512fp16")]
12946#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12947#[rustc_legacy_const_generics(2)]
12948#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12949pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
12950    static_assert_rounding!(ROUNDING);
12951    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
12952}
12953
12954/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12955/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
12956/// elements from a to the upper elements of dst.
12957///
12958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
12959#[inline]
12960#[target_feature(enable = "avx512fp16")]
12961#[cfg_attr(test, assert_instr(vcvtss2sh))]
12962#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12963pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
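    // Only mask bit 0 is consulted; with it set, `src` is never read, so an undefined vector is fine.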
12964    _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b)
12965}
12966
12967/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12968/// floating-point element, store the result in the lower element of dst using writemask k (the element
12969/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
12970/// upper elements of dst.
12971///
12972/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
12973#[inline]
12974#[target_feature(enable = "avx512fp16")]
12975#[cfg_attr(test, assert_instr(vcvtss2sh))]
12976#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12977pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12978    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
12979}
12980
12981/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12982/// floating-point element, store the result in the lower element of dst using zeromask k (the element
12983/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
12984/// elements of dst.
12985///
12986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
12987#[inline]
12988#[target_feature(enable = "avx512fp16")]
12989#[cfg_attr(test, assert_instr(vcvtss2sh))]
12990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12991pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
12992    _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b)
12993}
12994
12995/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
12996/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
12997/// elements from a to the upper elements of dst.
12998///
12999/// Rounding is done according to the rounding parameter, which can be one of:
13000///
13001/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13002/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13003/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13004/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13005/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13006///
13007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13008#[inline]
13009#[target_feature(enable = "avx512fp16")]
13010#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13011#[rustc_legacy_const_generics(2)]
13012#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13013pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13014    static_assert_rounding!(ROUNDING);
13015    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
13016}
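// Illustrative usage sketch (not from the original source): converting the low f32
// lane of `b` into the low f16 lane of the result, truncating toward zero.
//
//     let a = _mm_set1_ph(0.0);
//     let b = _mm_set_ss(2.75);
//     let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);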
13017
13018/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13019/// floating-point element, store the result in the lower element of dst using writemask k (the element
13020/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13021/// upper elements of dst.
13022///
13023/// Rounding is done according to the rounding parameter, which can be one of:
13024///
13025/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13026/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13027/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13028/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13029/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13030///
13031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13032#[inline]
13033#[target_feature(enable = "avx512fp16")]
13034#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13035#[rustc_legacy_const_generics(4)]
13036#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13037pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13038    src: __m128h,
13039    k: __mmask8,
13040    a: __m128h,
13041    b: __m128,
13042) -> __m128h {
13043    unsafe {
13044        static_assert_rounding!(ROUNDING);
13045        vcvtss2sh(a, b, src, k, ROUNDING)
13046    }
13047}
13048
13049/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13050/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13051/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13052/// elements of dst.
13053///
13054/// Rounding is done according to the rounding parameter, which can be one of:
13055///
13056/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13057/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13058/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13059/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13060/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13061///
13062/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13063#[inline]
13064#[target_feature(enable = "avx512fp16")]
13065#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13066#[rustc_legacy_const_generics(3)]
13067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13068pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13069    k: __mmask8,
13070    a: __m128h,
13071    b: __m128,
13072) -> __m128h {
13073    static_assert_rounding!(ROUNDING);
13074    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
13075}
13076
13077/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13078/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13079///
13080/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
13081#[inline]
13082#[target_feature(enable = "avx512fp16,avx512vl")]
13083#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13085pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13086    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13087}
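// Illustrative usage sketch (not from the original source): the two f64 lanes become
// the two lowest f16 lanes and the remaining six lanes are zeroed.
//
//     let d = _mm_set_pd(2.5, 1.5);
//     let h = _mm_cvtpd_ph(d); // [1.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] as f16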
13088
13089/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13090/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13091/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13092///
13093/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
13094#[inline]
13095#[target_feature(enable = "avx512fp16,avx512vl")]
13096#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13098pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
13099    unsafe { vcvtpd2ph_128(a, src, k) }
13100}
13101
13102/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13103/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13104/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13105///
13106/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13107#[inline]
13108#[target_feature(enable = "avx512fp16,avx512vl")]
13109#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13110#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13111pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13112    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13113}
13114
13115/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13116/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13117///
13118/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13119#[inline]
13120#[target_feature(enable = "avx512fp16,avx512vl")]
13121#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13122#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13123pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13124    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13125}
13126
13127/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13128/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13129/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13130///
13131/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
13132#[inline]
13133#[target_feature(enable = "avx512fp16,avx512vl")]
13134#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13135#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13136pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
13137    unsafe { vcvtpd2ph_256(a, src, k) }
13138}
13139
13140/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13141/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13142/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13143///
13144/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13145#[inline]
13146#[target_feature(enable = "avx512fp16,avx512vl")]
13147#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13149pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13150    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13151}
13152
13153/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13154/// floating-point elements, and store the results in dst.
13155///
13156/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13157#[inline]
13158#[target_feature(enable = "avx512fp16")]
13159#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13160#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13161pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13162    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13163}
13164
13165/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13166/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13167/// when the corresponding mask bit is not set).
13168///
13169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
13170#[inline]
13171#[target_feature(enable = "avx512fp16")]
13172#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13174pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
13175    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
13176}
13177
13178/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13179/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13180/// corresponding mask bit is not set).
13181///
13182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13183#[inline]
13184#[target_feature(enable = "avx512fp16")]
13185#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13187pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13188    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13189}
13190
13191/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13192/// floating-point elements, and store the results in dst.
13193///
13194/// Rounding is done according to the rounding parameter, which can be one of:
13195///
13196/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13197/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13198/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13199/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13200/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13201///
13202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13203#[inline]
13204#[target_feature(enable = "avx512fp16")]
13205#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13206#[rustc_legacy_const_generics(1)]
13207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13208pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13209    static_assert_rounding!(ROUNDING);
13210    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), 0xff, a)
13211}
13212
13213/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13214/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13215/// when the corresponding mask bit is not set).
13216///
13217/// Rounding is done according to the rounding parameter, which can be one of:
13218///
13219/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13220/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13221/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13222/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13223/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13224///
13225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13226#[inline]
13227#[target_feature(enable = "avx512fp16")]
13228#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13229#[rustc_legacy_const_generics(3)]
13230#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13231pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13232    src: __m128h,
13233    k: __mmask8,
13234    a: __m512d,
13235) -> __m128h {
13236    unsafe {
13237        static_assert_rounding!(ROUNDING);
13238        vcvtpd2ph_512(a, src, k, ROUNDING)
13239    }
13240}
13241
13242/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13243/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13244/// corresponding mask bit is not set).
13245///
13246/// Rounding is done according to the rounding parameter, which can be one of:
13247///
13248/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13249/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13250/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13251/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13252/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13253///
13254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13255#[inline]
13256#[target_feature(enable = "avx512fp16")]
13257#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13258#[rustc_legacy_const_generics(2)]
13259#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13260pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13261    static_assert_rounding!(ROUNDING);
13262    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
13263}
13264
13265/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13266/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13267/// elements from a to the upper elements of dst.
13268///
13269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
13270#[inline]
13271#[target_feature(enable = "avx512fp16")]
13272#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13273#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13274pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13275    _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b)
13276}
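// Illustrative usage sketch (not from the original source): the low f64 lane of `b`
// replaces the low f16 lane, while the upper seven lanes come from `a`.
//
//     let a = _mm_set1_ph(1.0);
//     let b = _mm_set_sd(0.25);
//     let r = _mm_cvtsd_sh(a, b); // lane 0 is 0.25, lanes 1..=7 stay 1.0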
13277
13278/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13279/// floating-point element, store the result in the lower element of dst using writemask k (the element
13280/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13281/// upper elements of dst.
13282///
13283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
13284#[inline]
13285#[target_feature(enable = "avx512fp16")]
13286#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13288pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13289    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13290}
13291
13292/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13293/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13294/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13295/// elements of dst.
13296///
13297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13298#[inline]
13299#[target_feature(enable = "avx512fp16")]
13300#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13301#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13302pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13303    _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b)
13304}
13305
13306/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13307/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13308/// elements from a to the upper elements of dst.
13309///
13310/// Rounding is done according to the rounding parameter, which can be one of:
13311///
13312/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13313/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13314/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13315/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13316/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13317///
13318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13319#[inline]
13320#[target_feature(enable = "avx512fp16")]
13321#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13322#[rustc_legacy_const_generics(2)]
13323#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13324pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13325    static_assert_rounding!(ROUNDING);
13326    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
13327}
13328
13329/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13330/// floating-point element, store the result in the lower element of dst using writemask k (the element
13331/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13332/// upper elements of dst.
13333///
13334/// Rounding is done according to the rounding parameter, which can be one of:
13335///
13336/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13337/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13338/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13339/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13340/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13341///
13342/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
13343#[inline]
13344#[target_feature(enable = "avx512fp16")]
13345#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13346#[rustc_legacy_const_generics(4)]
13347#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13348pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
13349    src: __m128h,
13350    k: __mmask8,
13351    a: __m128h,
13352    b: __m128d,
13353) -> __m128h {
13354    unsafe {
13355        static_assert_rounding!(ROUNDING);
13356        vcvtsd2sh(a, b, src, k, ROUNDING)
13357    }
13358}
13359
13360/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13361/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13362/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13363/// elements of dst.
13364///
13365/// Rounding is done according to the rounding parameter, which can be one of:
13366///
13367/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13368/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13369/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13370/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13371/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13372///
13373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13374#[inline]
13375#[target_feature(enable = "avx512fp16")]
13376#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13377#[rustc_legacy_const_generics(3)]
13378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13379pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13380    k: __mmask8,
13381    a: __m128h,
13382    b: __m128d,
13383) -> __m128h {
13384    static_assert_rounding!(ROUNDING);
13385    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
13386}
13387
13388/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13389/// store the results in dst.
13390///
13391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13392#[inline]
13393#[target_feature(enable = "avx512fp16,avx512vl")]
13394#[cfg_attr(test, assert_instr(vcvtph2w))]
13395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13396pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
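    // All mask bits are set, so every lane is written and the undefined source is never observed.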
13397    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13398}
13399
13400/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13401/// store the results in dst using writemask k (elements are copied from src when the corresponding
13402/// mask bit is not set).
13403///
13404/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
13405#[inline]
13406#[target_feature(enable = "avx512fp16,avx512vl")]
13407#[cfg_attr(test, assert_instr(vcvtph2w))]
13408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13409pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13410    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13411}
13412
13413/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13414/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13415///
13416/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13417#[inline]
13418#[target_feature(enable = "avx512fp16,avx512vl")]
13419#[cfg_attr(test, assert_instr(vcvtph2w))]
13420#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13421pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13422    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13423}
13424
13425/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13426/// store the results in dst.
13427///
13428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13429#[inline]
13430#[target_feature(enable = "avx512fp16,avx512vl")]
13431#[cfg_attr(test, assert_instr(vcvtph2w))]
13432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13433pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13434    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13435}
13436
13437/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13438/// store the results in dst using writemask k (elements are copied from src when the corresponding
13439/// mask bit is not set).
13440///
13441/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13442#[inline]
13443#[target_feature(enable = "avx512fp16,avx512vl")]
13444#[cfg_attr(test, assert_instr(vcvtph2w))]
13445#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13446pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13447    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13448}
13449
13450/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13451/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13452///
13453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13454#[inline]
13455#[target_feature(enable = "avx512fp16,avx512vl")]
13456#[cfg_attr(test, assert_instr(vcvtph2w))]
13457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13458pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13459    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13460}
13461
13462/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13463/// store the results in dst.
13464///
13465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
13466#[inline]
13467#[target_feature(enable = "avx512fp16")]
13468#[cfg_attr(test, assert_instr(vcvtph2w))]
13469#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13470pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13471    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13472}
13473
13474/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13475/// store the results in dst using writemask k (elements are copied from src when the corresponding
13476/// mask bit is not set).
13477///
13478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13479#[inline]
13480#[target_feature(enable = "avx512fp16")]
13481#[cfg_attr(test, assert_instr(vcvtph2w))]
13482#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13483pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13484    unsafe {
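        // `src` is reinterpreted as i16 lanes for the backend call; rounding follows the current MXCSR mode.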
13485        transmute(vcvtph2w_512(
13486            a,
13487            src.as_i16x32(),
13488            k,
13489            _MM_FROUND_CUR_DIRECTION,
13490        ))
13491    }
13492}
13493
13494/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13495/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13496///
13497/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13498#[inline]
13499#[target_feature(enable = "avx512fp16")]
13500#[cfg_attr(test, assert_instr(vcvtph2w))]
13501#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13502pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13503    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13504}
13505
13506/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13507/// store the results in dst.
13508///
13509/// Rounding is done according to the rounding parameter, which can be one of:
13510///
13511/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13512/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13513/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13514/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13515/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13516///
13517/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13518#[inline]
13519#[target_feature(enable = "avx512fp16")]
13520#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13521#[rustc_legacy_const_generics(1)]
13522#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13523pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13524    static_assert_rounding!(ROUNDING);
13525    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13526}
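// Illustrative usage sketch (not from the original source): rounding half-precision
// values down to the nearest 16-bit integer.
//
//     let h = _mm512_set1_ph(3.75);
//     let w = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(h); // each lane is 3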
13527
13528/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13529/// store the results in dst using writemask k (elements are copied from src when the corresponding
13530/// mask bit is not set).
13531///
13532/// Rounding is done according to the rounding parameter, which can be one of:
13533///
13534/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13535/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13536/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13537/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13538/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13539///
13540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13541#[inline]
13542#[target_feature(enable = "avx512fp16")]
13543#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13544#[rustc_legacy_const_generics(3)]
13545#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13546pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13547    src: __m512i,
13548    k: __mmask32,
13549    a: __m512h,
13550) -> __m512i {
13551    unsafe {
13552        static_assert_rounding!(ROUNDING);
13553        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13554    }
13555}
13556
13557/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13558/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13559///
13560/// Rounding is done according to the rounding parameter, which can be one of:
13561///
13562/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13563/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13564/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13565/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13566/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13567///
13568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13569#[inline]
13570#[target_feature(enable = "avx512fp16")]
13571#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13572#[rustc_legacy_const_generics(2)]
13573#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13574pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13575    static_assert_rounding!(ROUNDING);
13576    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13577}
13578
13579/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13580/// and store the results in dst.
13581///
13582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13583#[inline]
13584#[target_feature(enable = "avx512fp16,avx512vl")]
13585#[cfg_attr(test, assert_instr(vcvtph2uw))]
13586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13587pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13588    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13589}
13590
13591/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13592/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13593/// mask bit is not set).
13594///
13595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13596#[inline]
13597#[target_feature(enable = "avx512fp16,avx512vl")]
13598#[cfg_attr(test, assert_instr(vcvtph2uw))]
13599#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13600pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13601    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13602}
13603
13604/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13605/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13606///
13607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13608#[inline]
13609#[target_feature(enable = "avx512fp16,avx512vl")]
13610#[cfg_attr(test, assert_instr(vcvtph2uw))]
13611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13612pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13613    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13614}
13615
13616/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13617/// and store the results in dst.
13618///
13619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13620#[inline]
13621#[target_feature(enable = "avx512fp16,avx512vl")]
13622#[cfg_attr(test, assert_instr(vcvtph2uw))]
13623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13624pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13625    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13626}
13627
13628/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13629/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13630/// mask bit is not set).
13631///
13632/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13633#[inline]
13634#[target_feature(enable = "avx512fp16,avx512vl")]
13635#[cfg_attr(test, assert_instr(vcvtph2uw))]
13636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13637pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13638    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13639}
13640
13641/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13642/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13643///
13644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13645#[inline]
13646#[target_feature(enable = "avx512fp16,avx512vl")]
13647#[cfg_attr(test, assert_instr(vcvtph2uw))]
13648#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13649pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13650    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13651}
13652
13653/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13654/// and store the results in dst.
13655///
13656/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13657#[inline]
13658#[target_feature(enable = "avx512fp16")]
13659#[cfg_attr(test, assert_instr(vcvtph2uw))]
13660#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13661pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13662    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13663}
13664
13665/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13666/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13667/// mask bit is not set).
13668///
13669/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13670#[inline]
13671#[target_feature(enable = "avx512fp16")]
13672#[cfg_attr(test, assert_instr(vcvtph2uw))]
13673#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13674pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13675    unsafe {
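        // Same pattern as the signed variant: reinterpret `src` as u16 lanes and defer rounding to MXCSR.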
13676        transmute(vcvtph2uw_512(
13677            a,
13678            src.as_u16x32(),
13679            k,
13680            _MM_FROUND_CUR_DIRECTION,
13681        ))
13682    }
13683}
13684
13685/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13686/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13687///
13688/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13689#[inline]
13690#[target_feature(enable = "avx512fp16")]
13691#[cfg_attr(test, assert_instr(vcvtph2uw))]
13692#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13693pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13694    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13695}
13696
13697/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13698/// and store the results in dst.
13699///
13700/// Rounding is done according to the rounding parameter, which can be one of:
13701///
13702/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13703/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13704/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13705/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13706/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13707///
13708/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
13709#[inline]
13710#[target_feature(enable = "avx512fp16")]
13711#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13712#[rustc_legacy_const_generics(1)]
13713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13714pub fn _mm512_cvt_roundph_epu16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13715    static_assert_rounding!(ROUNDING);
13716    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13717}
13718
13719/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13720/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13721/// mask bit is not set).
13722///
13723/// Rounding is done according to the rounding parameter, which can be one of:
13724///
13725/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13726/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13727/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13728/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13729/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13730///
13731/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13732#[inline]
13733#[target_feature(enable = "avx512fp16")]
13734#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13735#[rustc_legacy_const_generics(3)]
13736#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13737pub fn _mm512_mask_cvt_roundph_epu16<const ROUNDING: i32>(
13738    src: __m512i,
13739    k: __mmask32,
13740    a: __m512h,
13741) -> __m512i {
13742    unsafe {
13743        static_assert_rounding!(ROUNDING);
13744        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING))
13745    }
13746}
13747
13748/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13749/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13750///
13751/// Rounding is done according to the rounding parameter, which can be one of:
13752///
13753/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13754/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13755/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13756/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13757/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13758///
13759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13760#[inline]
13761#[target_feature(enable = "avx512fp16")]
13762#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
13763#[rustc_legacy_const_generics(2)]
13764#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13765pub fn _mm512_maskz_cvt_roundph_epu16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13766    static_assert_rounding!(ROUNDING);
13767    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13768}
13769
13770/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13771/// truncation, and store the results in dst.
13772///
13773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
13774#[inline]
13775#[target_feature(enable = "avx512fp16,avx512vl")]
13776#[cfg_attr(test, assert_instr(vcvttph2w))]
13777#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13778pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13779    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13780}
13781
13782/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13783/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13784/// mask bit is not set).
13785///
13786/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
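///
/// A minimal sketch of the writemask behaviour (not part of Intel's documentation), assuming
/// it runs inside a function that itself enables `avx512fp16,avx512vl`:
///
/// ```ignore
/// let src = _mm_set1_epi16(-1);
/// let a = _mm_set1_ph(3.9);
/// // Even lanes (mask bits set) become 3 (truncated); odd lanes keep -1 from `src`.
/// let r = _mm_mask_cvttph_epi16(src, 0b0101_0101, a);
/// ```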
13787#[inline]
13788#[target_feature(enable = "avx512fp16,avx512vl")]
13789#[cfg_attr(test, assert_instr(vcvttph2w))]
13790#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13791pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13792    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13793}
13794
13795/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13796/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13797/// mask bit is not set).
13798///
13799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13800#[inline]
13801#[target_feature(enable = "avx512fp16,avx512vl")]
13802#[cfg_attr(test, assert_instr(vcvttph2w))]
13803#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13804pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13805    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
13806}
13807
13808/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13809/// truncation, and store the results in dst.
13810///
13811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
13812#[inline]
13813#[target_feature(enable = "avx512fp16,avx512vl")]
13814#[cfg_attr(test, assert_instr(vcvttph2w))]
13815#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13816pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
13817    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
13818}
13819
13820/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13821/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13822/// mask bit is not set).
13823///
13824/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
13825#[inline]
13826#[target_feature(enable = "avx512fp16,avx512vl")]
13827#[cfg_attr(test, assert_instr(vcvttph2w))]
13828#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13829pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13830    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
13831}
13832
13833/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13834/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13835/// mask bit is not set).
13836///
13837/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
13838#[inline]
13839#[target_feature(enable = "avx512fp16,avx512vl")]
13840#[cfg_attr(test, assert_instr(vcvttph2w))]
13841#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13842pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13843    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
13844}
13845
13846/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13847/// truncation, and store the results in dst.
13848///
13849/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
13850#[inline]
13851#[target_feature(enable = "avx512fp16")]
13852#[cfg_attr(test, assert_instr(vcvttph2w))]
13853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13854pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
13855    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13856}
13857
13858/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13859/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13860/// mask bit is not set).
13861///
13862/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
13863#[inline]
13864#[target_feature(enable = "avx512fp16")]
13865#[cfg_attr(test, assert_instr(vcvttph2w))]
13866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13867pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13868    unsafe {
13869        transmute(vcvttph2w_512(
13870            a,
13871            src.as_i16x32(),
13872            k,
13873            _MM_FROUND_CUR_DIRECTION,
13874        ))
13875    }
13876}
13877
13878/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13879/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13880/// mask bit is not set).
13881///
13882/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
13883#[inline]
13884#[target_feature(enable = "avx512fp16")]
13885#[cfg_attr(test, assert_instr(vcvttph2w))]
13886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13887pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13888    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
13889}
13890
13891/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13892/// truncation, and store the results in dst.
13893///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13895///
13896/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
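///
/// A minimal sketch (not part of Intel's documentation), assuming it runs inside a function
/// that itself enables `avx512fp16`:
///
/// ```ignore
/// let a = _mm512_set1_ph(-7.5);
/// // Truncation is toward zero, so every lane becomes -7; `_MM_FROUND_NO_EXC`
/// // suppresses floating-point exceptions.
/// let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
/// ```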
13897#[inline]
13898#[target_feature(enable = "avx512fp16")]
13899#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13900#[rustc_legacy_const_generics(1)]
13901#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13902pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
13903    static_assert_sae!(SAE);
13904    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13905}
13906
13907/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13908/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13909/// mask bit is not set).
13910///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13912///
13913/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
13914#[inline]
13915#[target_feature(enable = "avx512fp16")]
13916#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13917#[rustc_legacy_const_generics(3)]
13918#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13919pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
13920    src: __m512i,
13921    k: __mmask32,
13922    a: __m512h,
13923) -> __m512i {
13924    unsafe {
13925        static_assert_sae!(SAE);
13926        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
13927    }
13928}
13929
13930/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13931/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13932/// mask bit is not set).
13933///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
13935///
13936/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
13937#[inline]
13938#[target_feature(enable = "avx512fp16")]
13939#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13940#[rustc_legacy_const_generics(2)]
13941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13942pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13943    static_assert_sae!(SAE);
13944    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
13945}
13946
13947/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13948/// truncation, and store the results in dst.
13949///
13950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
13951#[inline]
13952#[target_feature(enable = "avx512fp16,avx512vl")]
13953#[cfg_attr(test, assert_instr(vcvttph2uw))]
13954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13955pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
13956    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
13957}
13958
13959/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13960/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13961/// mask bit is not set).
13962///
13963/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
13964#[inline]
13965#[target_feature(enable = "avx512fp16,avx512vl")]
13966#[cfg_attr(test, assert_instr(vcvttph2uw))]
13967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13968pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13969    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
13970}
13971
13972/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13973/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13974/// mask bit is not set).
13975///
13976/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
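///
/// A minimal sketch of the zeromask behaviour (not part of Intel's documentation), assuming it
/// runs inside a function that itself enables `avx512fp16,avx512vl`:
///
/// ```ignore
/// let a = _mm_set1_ph(9.75);
/// // The four low lanes become 9 (truncated); the four high lanes are zeroed by the mask.
/// let r = _mm_maskz_cvttph_epu16(0b0000_1111, a);
/// ```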
13977#[inline]
13978#[target_feature(enable = "avx512fp16,avx512vl")]
13979#[cfg_attr(test, assert_instr(vcvttph2uw))]
13980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13981pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13982    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
13983}
13984
13985/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13986/// truncation, and store the results in dst.
13987///
13988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
13989#[inline]
13990#[target_feature(enable = "avx512fp16,avx512vl")]
13991#[cfg_attr(test, assert_instr(vcvttph2uw))]
13992#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13993pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
13994    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
13995}
13996
13997/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
13998/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13999/// mask bit is not set).
14000///
14001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14002#[inline]
14003#[target_feature(enable = "avx512fp16,avx512vl")]
14004#[cfg_attr(test, assert_instr(vcvttph2uw))]
14005#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14006pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14007    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14008}
14009
14010/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14011/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14012/// mask bit is not set).
14013///
14014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14015#[inline]
14016#[target_feature(enable = "avx512fp16,avx512vl")]
14017#[cfg_attr(test, assert_instr(vcvttph2uw))]
14018#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14019pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14020    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14021}
14022
14023/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14024/// truncation, and store the results in dst.
14025///
14026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14027#[inline]
14028#[target_feature(enable = "avx512fp16")]
14029#[cfg_attr(test, assert_instr(vcvttph2uw))]
14030#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14031pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14032    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14033}
14034
14035/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14036/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14037/// mask bit is not set).
14038///
14039/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14040#[inline]
14041#[target_feature(enable = "avx512fp16")]
14042#[cfg_attr(test, assert_instr(vcvttph2uw))]
14043#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14044pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14045    unsafe {
14046        transmute(vcvttph2uw_512(
14047            a,
14048            src.as_u16x32(),
14049            k,
14050            _MM_FROUND_CUR_DIRECTION,
14051        ))
14052    }
14053}
14054
14055/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14056/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14057/// mask bit is not set).
14058///
14059/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14060#[inline]
14061#[target_feature(enable = "avx512fp16")]
14062#[cfg_attr(test, assert_instr(vcvttph2uw))]
14063#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14064pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14065    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14066}
14067
14068/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14069/// truncation, and store the results in dst.
14070///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14072///
14073/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14074#[inline]
14075#[target_feature(enable = "avx512fp16")]
14076#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14077#[rustc_legacy_const_generics(1)]
14078#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14079pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14080    static_assert_sae!(SAE);
14081    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14082}
14083
14084/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14085/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14086/// mask bit is not set).
14087///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14089///
14090/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14091#[inline]
14092#[target_feature(enable = "avx512fp16")]
14093#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14094#[rustc_legacy_const_generics(3)]
14095#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14096pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14097    src: __m512i,
14098    k: __mmask32,
14099    a: __m512h,
14100) -> __m512i {
14101    unsafe {
14102        static_assert_sae!(SAE);
14103        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14104    }
14105}
14106
14107/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14108/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14109/// mask bit is not set).
14110///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14112///
14113/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14114#[inline]
14115#[target_feature(enable = "avx512fp16")]
14116#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14117#[rustc_legacy_const_generics(2)]
14118#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14119pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14120    static_assert_sae!(SAE);
14121    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14122}
14123
14124/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14125/// results in dst.
14126///
14127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
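///
/// A minimal sketch (not part of Intel's documentation), assuming it runs inside a function
/// that itself enables `avx512fp16,avx512vl`:
///
/// ```ignore
/// // Only the four lower f16 lanes of `a` participate in the conversion.
/// let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let r = _mm_cvtph_epi32(a); // 32-bit lanes: 0, 1, 2, 3
/// ```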
14128#[inline]
14129#[target_feature(enable = "avx512fp16,avx512vl")]
14130#[cfg_attr(test, assert_instr(vcvtph2dq))]
14131#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14132pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14133    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14134}
14135
14136/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14137/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14138///
14139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14140#[inline]
14141#[target_feature(enable = "avx512fp16,avx512vl")]
14142#[cfg_attr(test, assert_instr(vcvtph2dq))]
14143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14144pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14145    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14146}
14147
14148/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14149/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14150///
14151/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14152#[inline]
14153#[target_feature(enable = "avx512fp16,avx512vl")]
14154#[cfg_attr(test, assert_instr(vcvtph2dq))]
14155#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14156pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14157    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14158}
14159
14160/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14161/// results in dst.
14162///
14163/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14164#[inline]
14165#[target_feature(enable = "avx512fp16,avx512vl")]
14166#[cfg_attr(test, assert_instr(vcvtph2dq))]
14167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14168pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14169    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14170}
14171
14172/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14173/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14174///
14175/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14176#[inline]
14177#[target_feature(enable = "avx512fp16,avx512vl")]
14178#[cfg_attr(test, assert_instr(vcvtph2dq))]
14179#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14180pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14181    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14182}
14183
14184/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14185/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14186///
14187/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14188#[inline]
14189#[target_feature(enable = "avx512fp16,avx512vl")]
14190#[cfg_attr(test, assert_instr(vcvtph2dq))]
14191#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14192pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14193    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14194}
14195
14196/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14197/// results in dst.
14198///
14199/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14200#[inline]
14201#[target_feature(enable = "avx512fp16")]
14202#[cfg_attr(test, assert_instr(vcvtph2dq))]
14203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14204pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14205    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14206}
14207
14208/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14209/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14210///
14211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14212#[inline]
14213#[target_feature(enable = "avx512fp16")]
14214#[cfg_attr(test, assert_instr(vcvtph2dq))]
14215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14216pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14217    unsafe {
14218        transmute(vcvtph2dq_512(
14219            a,
14220            src.as_i32x16(),
14221            k,
14222            _MM_FROUND_CUR_DIRECTION,
14223        ))
14224    }
14225}
14226
14227/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14228/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14229///
14230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14231#[inline]
14232#[target_feature(enable = "avx512fp16")]
14233#[cfg_attr(test, assert_instr(vcvtph2dq))]
14234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14235pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14236    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14237}
14238
14239/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14240/// results in dst.
14241///
14242/// Rounding is done according to the rounding parameter, which can be one of:
14243///
14244/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14245/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14246/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14247/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14248/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14249///
14250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
14251#[inline]
14252#[target_feature(enable = "avx512fp16")]
14253#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14254#[rustc_legacy_const_generics(1)]
14255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14256pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14257    static_assert_rounding!(ROUNDING);
14258    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14259}
14260
14261/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14262/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14263///
14264/// Rounding is done according to the rounding parameter, which can be one of:
14265///
14266/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14267/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14268/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14269/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14270/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14271///
14272/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14273#[inline]
14274#[target_feature(enable = "avx512fp16")]
14275#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14276#[rustc_legacy_const_generics(3)]
14277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14278pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14279    src: __m512i,
14280    k: __mmask16,
14281    a: __m256h,
14282) -> __m512i {
14283    unsafe {
14284        static_assert_rounding!(ROUNDING);
14285        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14286    }
14287}
14288
14289/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14290/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14291///
14292/// Rounding is done according to the rounding parameter, which can be one of:
14293///
14294/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14295/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14296/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14297/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14298/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14299///
14300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14301#[inline]
14302#[target_feature(enable = "avx512fp16")]
14303#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14304#[rustc_legacy_const_generics(2)]
14305#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14306pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14307    static_assert_rounding!(ROUNDING);
14308    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14309}
14310
14311/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14312/// the result in dst.
14313///
14314/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
14315#[inline]
14316#[target_feature(enable = "avx512fp16")]
14317#[cfg_attr(test, assert_instr(vcvtsh2si))]
14318#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14319pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14320    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14321}
14322
14323/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14324/// the result in dst.
14325///
14326/// Rounding is done according to the rounding parameter, which can be one of:
14327///
14328/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14329/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14330/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14331/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14332/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14333///
14334/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
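///
/// A minimal sketch (not part of Intel's documentation), assuming it runs inside a function
/// that itself enables `avx512fp16`:
///
/// ```ignore
/// let a = _mm_set_sh(2.5);
/// // Round toward negative infinity with exceptions suppressed: 2.5 -> 2.
/// let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// assert_eq!(r, 2);
/// ```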
14335#[inline]
14336#[target_feature(enable = "avx512fp16")]
14337#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14338#[rustc_legacy_const_generics(1)]
14339#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14340pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14341    unsafe {
14342        static_assert_rounding!(ROUNDING);
14343        vcvtsh2si32(a, ROUNDING)
14344    }
14345}
14346
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
14349///
14350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
14351#[inline]
14352#[target_feature(enable = "avx512fp16,avx512vl")]
14353#[cfg_attr(test, assert_instr(vcvtph2udq))]
14354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14355pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14356    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14357}
14358
14359/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14360/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14361///
14362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
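///
/// A minimal sketch of the writemask behaviour (not part of Intel's documentation), assuming
/// it runs inside a function that itself enables `avx512fp16,avx512vl`:
///
/// ```ignore
/// let src = _mm_set1_epi32(42);
/// let a = _mm_set1_ph(6.0);
/// // Only lane 0 is converted (to 6); lanes 1..4 are copied from `src`.
/// let r = _mm_mask_cvtph_epu32(src, 0b0001, a);
/// ```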
14363#[inline]
14364#[target_feature(enable = "avx512fp16,avx512vl")]
14365#[cfg_attr(test, assert_instr(vcvtph2udq))]
14366#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14367pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14368    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14369}
14370
14371/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14372/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14373///
14374/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14375#[inline]
14376#[target_feature(enable = "avx512fp16,avx512vl")]
14377#[cfg_attr(test, assert_instr(vcvtph2udq))]
14378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14379pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14380    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14381}
14382
14383/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14384/// the results in dst.
14385///
14386/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14387#[inline]
14388#[target_feature(enable = "avx512fp16,avx512vl")]
14389#[cfg_attr(test, assert_instr(vcvtph2udq))]
14390#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14391pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14392    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14393}
14394
14395/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14396/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14397///
14398/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14399#[inline]
14400#[target_feature(enable = "avx512fp16,avx512vl")]
14401#[cfg_attr(test, assert_instr(vcvtph2udq))]
14402#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14403pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14404    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14405}
14406
14407/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14408/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14409///
14410/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14411#[inline]
14412#[target_feature(enable = "avx512fp16,avx512vl")]
14413#[cfg_attr(test, assert_instr(vcvtph2udq))]
14414#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14415pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14416    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14417}
14418
14419/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14420/// the results in dst.
14421///
14422/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14423#[inline]
14424#[target_feature(enable = "avx512fp16")]
14425#[cfg_attr(test, assert_instr(vcvtph2udq))]
14426#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14427pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14428    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14429}
14430
14431/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14432/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14433///
14434/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14435#[inline]
14436#[target_feature(enable = "avx512fp16")]
14437#[cfg_attr(test, assert_instr(vcvtph2udq))]
14438#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14439pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14440    unsafe {
14441        transmute(vcvtph2udq_512(
14442            a,
14443            src.as_u32x16(),
14444            k,
14445            _MM_FROUND_CUR_DIRECTION,
14446        ))
14447    }
14448}
14449
14450/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14451/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14452///
14453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14454#[inline]
14455#[target_feature(enable = "avx512fp16")]
14456#[cfg_attr(test, assert_instr(vcvtph2udq))]
14457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14458pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14459    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14460}
14461
14462/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14463/// the results in dst.
14464///
14465/// Rounding is done according to the rounding parameter, which can be one of:
14466///
14467/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14468/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14469/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14470/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14471/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14472///
14473/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14474#[inline]
14475#[target_feature(enable = "avx512fp16")]
14476#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14477#[rustc_legacy_const_generics(1)]
14478#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14479pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14480    static_assert_rounding!(ROUNDING);
14481    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14482}
14483
14484/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14485/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14486///
14487/// Rounding is done according to the rounding parameter, which can be one of:
14488///
14489/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14490/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14491/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14492/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14493/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14494///
14495/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14496#[inline]
14497#[target_feature(enable = "avx512fp16")]
14498#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14499#[rustc_legacy_const_generics(3)]
14500#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14501pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14502    src: __m512i,
14503    k: __mmask16,
14504    a: __m256h,
14505) -> __m512i {
14506    unsafe {
14507        static_assert_rounding!(ROUNDING);
14508        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14509    }
14510}
14511
14512/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14513/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14514///
14515/// Rounding is done according to the rounding parameter, which can be one of:
14516///
14517/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14518/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14519/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14520/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14521/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14522///
14523/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14524#[inline]
14525#[target_feature(enable = "avx512fp16")]
14526#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14527#[rustc_legacy_const_generics(2)]
14528#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14529pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14530    static_assert_rounding!(ROUNDING);
14531    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14532}
14533
14534/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14535/// the result in dst.
14536///
14537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
14538#[inline]
14539#[target_feature(enable = "avx512fp16")]
14540#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14541#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14542pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14543    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14544}
14545
14546/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14547/// the result in dst.
14548///
14549/// Rounding is done according to the rounding parameter, which can be one of:
14550///
14551/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14552/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14553/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14554/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14555/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14556///
14557/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14558#[inline]
14559#[target_feature(enable = "avx512fp16")]
14560#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
14561#[rustc_legacy_const_generics(1)]
14562#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14563pub fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
14564    unsafe {
14565        static_assert_rounding!(ROUNDING);
14566        vcvtsh2usi32(a, ROUNDING)
14567    }
14568}
14569
14570/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14571/// store the results in dst.
14572///
14573/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
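///
/// A minimal sketch (not part of Intel's documentation), assuming it runs inside a function
/// that itself enables `avx512fp16,avx512vl`:
///
/// ```ignore
/// let a = _mm_set1_ph(-1.75);
/// // Truncation is toward zero, so every converted lane is -1 rather than -2.
/// let r = _mm_cvttph_epi32(a);
/// ```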
14574#[inline]
14575#[target_feature(enable = "avx512fp16,avx512vl")]
14576#[cfg_attr(test, assert_instr(vcvttph2dq))]
14577#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14578pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14579    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14580}
14581
14582/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14583/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14584///
14585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14586#[inline]
14587#[target_feature(enable = "avx512fp16,avx512vl")]
14588#[cfg_attr(test, assert_instr(vcvttph2dq))]
14589#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14590pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14591    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14592}
14593
14594/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14595/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14596///
14597/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14598#[inline]
14599#[target_feature(enable = "avx512fp16,avx512vl")]
14600#[cfg_attr(test, assert_instr(vcvttph2dq))]
14601#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14602pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14603    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14604}
14605
14606/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14607/// store the results in dst.
14608///
14609/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14610#[inline]
14611#[target_feature(enable = "avx512fp16,avx512vl")]
14612#[cfg_attr(test, assert_instr(vcvttph2dq))]
14613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14614pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14615    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14616}
14617
14618/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14619/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14620///
14621/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14622#[inline]
14623#[target_feature(enable = "avx512fp16,avx512vl")]
14624#[cfg_attr(test, assert_instr(vcvttph2dq))]
14625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14626pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14627    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14628}
14629
14630/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14631/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14632///
14633/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14634#[inline]
14635#[target_feature(enable = "avx512fp16,avx512vl")]
14636#[cfg_attr(test, assert_instr(vcvttph2dq))]
14637#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14638pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14639    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14640}
14641
14642/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14643/// store the results in dst.
14644///
14645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14646#[inline]
14647#[target_feature(enable = "avx512fp16")]
14648#[cfg_attr(test, assert_instr(vcvttph2dq))]
14649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14650pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14651    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14652}
14653
14654/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14655/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14656///
14657/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14658#[inline]
14659#[target_feature(enable = "avx512fp16")]
14660#[cfg_attr(test, assert_instr(vcvttph2dq))]
14661#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14662pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14663    unsafe {
14664        transmute(vcvttph2dq_512(
14665            a,
14666            src.as_i32x16(),
14667            k,
14668            _MM_FROUND_CUR_DIRECTION,
14669        ))
14670    }
14671}
14672
14673/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14674/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14675///
14676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14677#[inline]
14678#[target_feature(enable = "avx512fp16")]
14679#[cfg_attr(test, assert_instr(vcvttph2dq))]
14680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14681pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14682    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14683}
14684
14685/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14686/// store the results in dst.
14687///
14688/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14689///
14690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14691#[inline]
14692#[target_feature(enable = "avx512fp16")]
14693#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14694#[rustc_legacy_const_generics(1)]
14695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14696pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14697    static_assert_sae!(SAE);
14698    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14699}
14700
14701/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14702/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14703///
14704/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14705///
14706/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14707#[inline]
14708#[target_feature(enable = "avx512fp16")]
14709#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14710#[rustc_legacy_const_generics(3)]
14711#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14712pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14713    src: __m512i,
14714    k: __mmask16,
14715    a: __m256h,
14716) -> __m512i {
14717    unsafe {
14718        static_assert_sae!(SAE);
14719        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14720    }
14721}
14722
14723/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14724/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14725///
14726/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14727///
14728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14729#[inline]
14730#[target_feature(enable = "avx512fp16")]
14731#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14732#[rustc_legacy_const_generics(2)]
14733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14734pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14735    static_assert_sae!(SAE);
14736    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14737}
14738
14739/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14740/// the result in dst.
14741///
14742/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
14743#[inline]
14744#[target_feature(enable = "avx512fp16")]
14745#[cfg_attr(test, assert_instr(vcvttsh2si))]
14746#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14747pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14748    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14749}
14750
14751/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14752/// the result in dst.
14753///
14754/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14755///
14756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
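///
/// A minimal sketch (not part of Intel's documentation), assuming it runs inside a function
/// that itself enables `avx512fp16`:
///
/// ```ignore
/// let a = _mm_set_sh(-3.75);
/// // Truncation toward zero with exceptions suppressed: -3.75 -> -3.
/// let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
/// assert_eq!(r, -3);
/// ```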
14757#[inline]
14758#[target_feature(enable = "avx512fp16")]
14759#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14760#[rustc_legacy_const_generics(1)]
14761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14762pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14763    unsafe {
14764        static_assert_sae!(SAE);
14765        vcvttsh2si32(a, SAE)
14766    }
14767}
14768
14769/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14770/// store the results in dst.
14771///
14772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
14773#[inline]
14774#[target_feature(enable = "avx512fp16,avx512vl")]
14775#[cfg_attr(test, assert_instr(vcvttph2udq))]
14776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14777pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14778    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14779}
14780
14781/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14782/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14783///
14784/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14785#[inline]
14786#[target_feature(enable = "avx512fp16,avx512vl")]
14787#[cfg_attr(test, assert_instr(vcvttph2udq))]
14788#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14789pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14790    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
14791}
14792
14793/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14794/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14795///
14796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
14797#[inline]
14798#[target_feature(enable = "avx512fp16,avx512vl")]
14799#[cfg_attr(test, assert_instr(vcvttph2udq))]
14800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14801pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14802    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14803}
14804
14805/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14806/// store the results in dst.
14807///
14808/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14809#[inline]
14810#[target_feature(enable = "avx512fp16,avx512vl")]
14811#[cfg_attr(test, assert_instr(vcvttph2udq))]
14812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14813pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14814    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14815}
14816
14817/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14818/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14819///
14820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14821#[inline]
14822#[target_feature(enable = "avx512fp16,avx512vl")]
14823#[cfg_attr(test, assert_instr(vcvttph2udq))]
14824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14825pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14826    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
14827}
14828
14829/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14830/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14831///
14832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14833#[inline]
14834#[target_feature(enable = "avx512fp16,avx512vl")]
14835#[cfg_attr(test, assert_instr(vcvttph2udq))]
14836#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14837pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14838    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14839}
14840
14841/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14842/// store the results in dst.
14843///
14844/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14845#[inline]
14846#[target_feature(enable = "avx512fp16")]
14847#[cfg_attr(test, assert_instr(vcvttph2udq))]
14848#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14849pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14850    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14851}
14852
14853/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14854/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14855///
14856/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14857#[inline]
14858#[target_feature(enable = "avx512fp16")]
14859#[cfg_attr(test, assert_instr(vcvttph2udq))]
14860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14861pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14862    unsafe {
14863        transmute(vcvttph2udq_512(
14864            a,
14865            src.as_u32x16(),
14866            k,
14867            _MM_FROUND_CUR_DIRECTION,
14868        ))
14869    }
14870}
14871
14872/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14873/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14874///
14875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14876#[inline]
14877#[target_feature(enable = "avx512fp16")]
14878#[cfg_attr(test, assert_instr(vcvttph2udq))]
14879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14880pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14881    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14882}
14883
14884/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14885/// store the results in dst.
14886///
14887/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14888///
14889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
14890#[inline]
14891#[target_feature(enable = "avx512fp16")]
14892#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14893#[rustc_legacy_const_generics(1)]
14894#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14895pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
14896    static_assert_sae!(SAE);
14897    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14898}
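
// Illustrative usage sketch (hypothetical helper): the full-width truncating
// conversion with exceptions suppressed through the SAE parameter. All sixteen f16
// lanes of `a` are converted to the sixteen u32 lanes of the result.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvtt_roundph_epu32(a: __m256h) -> __m512i {
    _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a)
}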
14899
14900/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14901/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14902///
14903/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14904///
14905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
14906#[inline]
14907#[target_feature(enable = "avx512fp16")]
14908#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14909#[rustc_legacy_const_generics(3)]
14910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14911pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
14912    src: __m512i,
14913    k: __mmask16,
14914    a: __m256h,
14915) -> __m512i {
14916    unsafe {
14917        static_assert_sae!(SAE);
14918        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
14919    }
14920}
14921
14922/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14923/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14924///
14925/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14926///
14927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
14928#[inline]
14929#[target_feature(enable = "avx512fp16")]
14930#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14931#[rustc_legacy_const_generics(2)]
14932#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14933pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14934    static_assert_sae!(SAE);
14935    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
14936}
14937
14938/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14939/// the result in dst.
14940///
14941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
14942#[inline]
14943#[target_feature(enable = "avx512fp16")]
14944#[cfg_attr(test, assert_instr(vcvttsh2usi))]
14945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14946pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
14947    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14948}
14949
14950/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
14951/// the result in dst.
14952///
14953/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14954///
14955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
14956#[inline]
14957#[target_feature(enable = "avx512fp16")]
14958#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
14959#[rustc_legacy_const_generics(1)]
14960#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14961pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
14962    unsafe {
14963        static_assert_sae!(SAE);
14964        vcvttsh2usi32(a, SAE)
14965    }
14966}
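
// Illustrative usage sketch (hypothetical helper): the unsigned scalar conversions.
// `_mm_cvttsh_u32` truncates under the current MXCSR state, while the `_round`
// variant additionally allows exception suppression with `_MM_FROUND_NO_EXC`.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvttsh_u32() -> (u32, u32) {
    // The lower lane is 3.75; truncation yields 3 in both calls.
    let a = _mm_set_sh(3.75);
    (
        _mm_cvttsh_u32(a),
        _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a),
    )
}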
14967
14968/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14969/// store the results in dst.
14970///
14971/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
14972#[inline]
14973#[target_feature(enable = "avx512fp16,avx512vl")]
14974#[cfg_attr(test, assert_instr(vcvtph2qq))]
14975#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14976pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
14977    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
14978}
14979
14980/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14981/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14982///
14983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
14984#[inline]
14985#[target_feature(enable = "avx512fp16,avx512vl")]
14986#[cfg_attr(test, assert_instr(vcvtph2qq))]
14987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14988pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14989    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
14990}
14991
14992/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
14993/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14994///
14995/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
14996#[inline]
14997#[target_feature(enable = "avx512fp16,avx512vl")]
14998#[cfg_attr(test, assert_instr(vcvtph2qq))]
14999#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15000pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15001    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
15002}
15003
15004/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15005/// store the results in dst.
15006///
15007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
15008#[inline]
15009#[target_feature(enable = "avx512fp16,avx512vl")]
15010#[cfg_attr(test, assert_instr(vcvtph2qq))]
15011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15012pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15013    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15014}
15015
15016/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15017/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15018///
15019/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15020#[inline]
15021#[target_feature(enable = "avx512fp16,avx512vl")]
15022#[cfg_attr(test, assert_instr(vcvtph2qq))]
15023#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15024pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15025    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15026}
15027
15028/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15029/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15030///
15031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15032#[inline]
15033#[target_feature(enable = "avx512fp16,avx512vl")]
15034#[cfg_attr(test, assert_instr(vcvtph2qq))]
15035#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15036pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15037    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15038}
15039
15040/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15041/// store the results in dst.
15042///
15043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15044#[inline]
15045#[target_feature(enable = "avx512fp16")]
15046#[cfg_attr(test, assert_instr(vcvtph2qq))]
15047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15048pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15049    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15050}
15051
15052/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15053/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15054///
15055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15056#[inline]
15057#[target_feature(enable = "avx512fp16")]
15058#[cfg_attr(test, assert_instr(vcvtph2qq))]
15059#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15060pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15061    unsafe {
15062        transmute(vcvtph2qq_512(
15063            a,
15064            src.as_i64x8(),
15065            k,
15066            _MM_FROUND_CUR_DIRECTION,
15067        ))
15068    }
15069}
15070
15071/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15072/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15073///
15074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15075#[inline]
15076#[target_feature(enable = "avx512fp16")]
15077#[cfg_attr(test, assert_instr(vcvtph2qq))]
15078#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15079pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15080    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15081}
15082
15083/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15084/// store the results in dst.
15085///
15086/// Rounding is done according to the rounding parameter, which can be one of:
15087///
15088/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15089/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15090/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15091/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15092/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15093///
15094/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
15095#[inline]
15096#[target_feature(enable = "avx512fp16")]
15097#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15098#[rustc_legacy_const_generics(1)]
15099#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15100pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15101    static_assert_rounding!(ROUNDING);
15102    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15103}
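
// Illustrative usage sketch (hypothetical helper): selecting an explicit rounding
// mode for the f16 -> i64 conversion. `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`
// rounds every lane toward negative infinity and suppresses exceptions, independent
// of the current MXCSR rounding mode.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn example_cvt_roundph_epi64(a: __m128h) -> __m512i {
    _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a)
}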
15104
15105/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15106/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15107///
15108/// Rounding is done according to the rounding parameter, which can be one of:
15109///
15110/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15111/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15112/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15113/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15114/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15115///
15116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15117#[inline]
15118#[target_feature(enable = "avx512fp16")]
15119#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15120#[rustc_legacy_const_generics(3)]
15121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15122pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15123    src: __m512i,
15124    k: __mmask8,
15125    a: __m128h,
15126) -> __m512i {
15127    unsafe {
15128        static_assert_rounding!(ROUNDING);
15129        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15130    }
15131}
15132
15133/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15134/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15135///
15136/// Rounding is done according to the rounding parameter, which can be one of:
15137///
15138/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15139/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15140/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15141/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15142/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15143///
15144/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
15145#[inline]
15146#[target_feature(enable = "avx512fp16")]
15147#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15148#[rustc_legacy_const_generics(2)]
15149#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15150pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15151    static_assert_rounding!(ROUNDING);
15152    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15153}
15154
15155/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15156/// store the results in dst.
15157///
15158/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
15159#[inline]
15160#[target_feature(enable = "avx512fp16,avx512vl")]
15161#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15163pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
15164    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
15165}
15166
15167/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15168/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15169///
15170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
15171#[inline]
15172#[target_feature(enable = "avx512fp16,avx512vl")]
15173#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15174#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15175pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15176    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
15177}
15178
15179/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15180/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15181///
15182/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
15183#[inline]
15184#[target_feature(enable = "avx512fp16,avx512vl")]
15185#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15187pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15188    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
15189}
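
// Illustrative usage sketch (hypothetical helper): only the two lowest f16 lanes of
// `a` feed the two u64 lanes of the 128-bit result; the zeromask form clears any
// result lane whose mask bit is unset.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_maskz_cvtph_epu64(a: __m128h) -> __m128i {
    // Keep only result lane 0; lane 1 is zeroed.
    _mm_maskz_cvtph_epu64(0b01, a)
}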
15190
15191/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15192/// store the results in dst.
15193///
15194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
15195#[inline]
15196#[target_feature(enable = "avx512fp16,avx512vl")]
15197#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15199pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
15200    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
15201}
15202
15203/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15204/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15205///
15206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
15207#[inline]
15208#[target_feature(enable = "avx512fp16,avx512vl")]
15209#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15210#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15211pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15212    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
15213}
15214
15215/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15216/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15217///
15218/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15219#[inline]
15220#[target_feature(enable = "avx512fp16,avx512vl")]
15221#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15222#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15223pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15224    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15225}
15226
15227/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15228/// store the results in dst.
15229///
15230/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15231#[inline]
15232#[target_feature(enable = "avx512fp16")]
15233#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15235pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15236    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15237}
15238
15239/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15240/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15241///
15242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15243#[inline]
15244#[target_feature(enable = "avx512fp16")]
15245#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15246#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15247pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15248    unsafe {
15249        transmute(vcvtph2uqq_512(
15250            a,
15251            src.as_u64x8(),
15252            k,
15253            _MM_FROUND_CUR_DIRECTION,
15254        ))
15255    }
15256}
15257
15258/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15259/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15260///
15261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15262#[inline]
15263#[target_feature(enable = "avx512fp16")]
15264#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15266pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15267    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15268}
15269
15270/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15271/// store the results in dst.
15272///
15273/// Rounding is done according to the rounding parameter, which can be one of:
15274///
15275/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15276/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15277/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15278/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15279/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15280///
15281/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15282#[inline]
15283#[target_feature(enable = "avx512fp16")]
15284#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15285#[rustc_legacy_const_generics(1)]
15286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15287pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15288    static_assert_rounding!(ROUNDING);
15289    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15290}
15291
15292/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15293/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15294///
15295/// Rounding is done according to the rounding parameter, which can be one of:
15296///
15297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15302///
15303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15304#[inline]
15305#[target_feature(enable = "avx512fp16")]
15306#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15307#[rustc_legacy_const_generics(3)]
15308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15309pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15310    src: __m512i,
15311    k: __mmask8,
15312    a: __m128h,
15313) -> __m512i {
15314    unsafe {
15315        static_assert_rounding!(ROUNDING);
15316        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15317    }
15318}
15319
15320/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15321/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15322///
15323/// Rounding is done according to the rounding parameter, which can be one of:
15324///
15325/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15326/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15327/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15328/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15329/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15330///
15331/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15332#[inline]
15333#[target_feature(enable = "avx512fp16")]
15334#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15335#[rustc_legacy_const_generics(2)]
15336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15337pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15338    static_assert_rounding!(ROUNDING);
15339    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15340}
15341
15342/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15343/// store the results in dst.
15344///
15345/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
15346#[inline]
15347#[target_feature(enable = "avx512fp16,avx512vl")]
15348#[cfg_attr(test, assert_instr(vcvttph2qq))]
15349#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15350pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15351    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15352}
15353
15354/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15355/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15356///
15357/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15358#[inline]
15359#[target_feature(enable = "avx512fp16,avx512vl")]
15360#[cfg_attr(test, assert_instr(vcvttph2qq))]
15361#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15362pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15363    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15364}
15365
15366/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15367/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15368///
15369/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15370#[inline]
15371#[target_feature(enable = "avx512fp16,avx512vl")]
15372#[cfg_attr(test, assert_instr(vcvttph2qq))]
15373#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15374pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15375    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15376}
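
// Illustrative usage sketch (hypothetical helper): truncation (round toward zero) on
// the signed 64-bit path, contrasted with the writemask form that preserves `src`
// lanes.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvttph_epi64(src: __m128i) -> (__m128i, __m128i) {
    // Only lanes 0 and 1 matter for the 64-bit path: -2.5 truncates to -2, 2.5 to 2.
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.5, -2.5);
    let all = _mm_cvttph_epi64(a);
    // Keep converted lane 1 only; lane 0 is copied from `src`.
    let blended = _mm_mask_cvttph_epi64(src, 0b10, a);
    (all, blended)
}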
15377
15378/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15379/// store the results in dst.
15380///
15381/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15382#[inline]
15383#[target_feature(enable = "avx512fp16,avx512vl")]
15384#[cfg_attr(test, assert_instr(vcvttph2qq))]
15385#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15386pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15387    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15388}
15389
15390/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15391/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15392///
15393/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15394#[inline]
15395#[target_feature(enable = "avx512fp16,avx512vl")]
15396#[cfg_attr(test, assert_instr(vcvttph2qq))]
15397#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15398pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15399    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15400}
15401
15402/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15403/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15404///
15405/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15406#[inline]
15407#[target_feature(enable = "avx512fp16,avx512vl")]
15408#[cfg_attr(test, assert_instr(vcvttph2qq))]
15409#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15410pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15411    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15412}
15413
15414/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15415/// store the results in dst.
15416///
15417/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15418#[inline]
15419#[target_feature(enable = "avx512fp16")]
15420#[cfg_attr(test, assert_instr(vcvttph2qq))]
15421#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15422pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15423    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15424}
15425
15426/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15427/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15428///
15429/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15430#[inline]
15431#[target_feature(enable = "avx512fp16")]
15432#[cfg_attr(test, assert_instr(vcvttph2qq))]
15433#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15434pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15435    unsafe {
15436        transmute(vcvttph2qq_512(
15437            a,
15438            src.as_i64x8(),
15439            k,
15440            _MM_FROUND_CUR_DIRECTION,
15441        ))
15442    }
15443}
15444
15445/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15446/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15447///
15448/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15449#[inline]
15450#[target_feature(enable = "avx512fp16")]
15451#[cfg_attr(test, assert_instr(vcvttph2qq))]
15452#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15453pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15454    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15455}
15456
15457/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15458/// store the results in dst.
15459///
15460/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15461///
15462/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15463#[inline]
15464#[target_feature(enable = "avx512fp16")]
15465#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15466#[rustc_legacy_const_generics(1)]
15467#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15468pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15469    static_assert_sae!(SAE);
15470    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15471}
15472
15473/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15474/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15475///
15476/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15477///
15478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15479#[inline]
15480#[target_feature(enable = "avx512fp16")]
15481#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15482#[rustc_legacy_const_generics(3)]
15483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15484pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15485    src: __m512i,
15486    k: __mmask8,
15487    a: __m128h,
15488) -> __m512i {
15489    unsafe {
15490        static_assert_sae!(SAE);
15491        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15492    }
15493}
15494
15495/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15496/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15497///
15498/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15499///
15500/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15501#[inline]
15502#[target_feature(enable = "avx512fp16")]
15503#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15504#[rustc_legacy_const_generics(2)]
15505#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15506pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15507    static_assert_sae!(SAE);
15508    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15509}
15510
15511/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15512/// store the results in dst.
15513///
15514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15515#[inline]
15516#[target_feature(enable = "avx512fp16,avx512vl")]
15517#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15518#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15519pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15520    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15521}
15522
15523/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15524/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15525///
15526/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15527#[inline]
15528#[target_feature(enable = "avx512fp16,avx512vl")]
15529#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15530#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15531pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15532    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15533}
15534
15535/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15536/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15537///
15538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15539#[inline]
15540#[target_feature(enable = "avx512fp16,avx512vl")]
15541#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15542#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15543pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15544    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15545}
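
// Illustrative usage sketch (hypothetical helper): the unsigned 64-bit truncating
// path. As with the other 128-bit quadword conversions, only the two lowest f16
// lanes of `a` are consumed.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvttph_epu64() -> __m128i {
    // Lane 0 is 1.75 and lane 1 is 40.5; truncation yields 1 and 40.
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 40.5, 1.75);
    _mm_cvttph_epu64(a)
}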
15546
15547/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15548/// store the results in dst.
15549///
15550/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15551#[inline]
15552#[target_feature(enable = "avx512fp16,avx512vl")]
15553#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15554#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15555pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15556    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15557}
15558
15559/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15560/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15561///
15562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15563#[inline]
15564#[target_feature(enable = "avx512fp16,avx512vl")]
15565#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15567pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15568    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15569}
15570
15571/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15572/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15573///
15574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15575#[inline]
15576#[target_feature(enable = "avx512fp16,avx512vl")]
15577#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15579pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15580    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15581}
15582
15583/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15584/// store the results in dst.
15585///
15586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15587#[inline]
15588#[target_feature(enable = "avx512fp16")]
15589#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15591pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15592    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15593}
15594
15595/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15596/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15597///
15598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15599#[inline]
15600#[target_feature(enable = "avx512fp16")]
15601#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15602#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15603pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15604    unsafe {
15605        transmute(vcvttph2uqq_512(
15606            a,
15607            src.as_u64x8(),
15608            k,
15609            _MM_FROUND_CUR_DIRECTION,
15610        ))
15611    }
15612}
15613
15614/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15615/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15616///
15617/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15618#[inline]
15619#[target_feature(enable = "avx512fp16")]
15620#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15621#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15622pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15623    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15624}
15625
15626/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15627/// store the results in dst.
15628///
15629/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15630///
15631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15632#[inline]
15633#[target_feature(enable = "avx512fp16")]
15634#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15635#[rustc_legacy_const_generics(1)]
15636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15637pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15638    static_assert_sae!(SAE);
15639    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15640}
15641
15642/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15643/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15644///
15645/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15646///
15647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15648#[inline]
15649#[target_feature(enable = "avx512fp16")]
15650#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15651#[rustc_legacy_const_generics(3)]
15652#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15653pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15654    src: __m512i,
15655    k: __mmask8,
15656    a: __m128h,
15657) -> __m512i {
15658    unsafe {
15659        static_assert_sae!(SAE);
15660        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15661    }
15662}
15663
15664/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15665/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15666///
15667/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15668///
15669/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15670#[inline]
15671#[target_feature(enable = "avx512fp16")]
15672#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15673#[rustc_legacy_const_generics(2)]
15674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15675pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15676    static_assert_sae!(SAE);
15677    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15678}
15679
15680/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15681/// floating-point elements, and store the results in dst.
15682///
15683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
15684#[inline]
15685#[target_feature(enable = "avx512fp16,avx512vl")]
15686#[cfg_attr(test, assert_instr(vcvtph2psx))]
15687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15688pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15689    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15690}
15691
15692/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15693/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15694/// dst when the corresponding mask bit is not set).
15695///
15696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15697#[inline]
15698#[target_feature(enable = "avx512fp16,avx512vl")]
15699#[cfg_attr(test, assert_instr(vcvtph2psx))]
15700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15701pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15702    unsafe { vcvtph2psx_128(a, src, k) }
15703}
15704
15705/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15706/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15707/// corresponding mask bit is not set).
15708///
15709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15710#[inline]
15711#[target_feature(enable = "avx512fp16,avx512vl")]
15712#[cfg_attr(test, assert_instr(vcvtph2psx))]
15713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15714pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15715    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15716}
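
// Illustrative usage sketch (hypothetical helper): widening the low four f16 lanes
// of `a` to f32. Every f16 value is exactly representable as f32, so masking is the
// only way a result lane can differ from the plain conversion.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_cvtxph_ps(a: __m128h) -> (__m128, __m128) {
    let full = _mm_cvtxph_ps(a);
    // Zero the odd result lanes, keeping converted lanes 0 and 2.
    let masked = _mm_maskz_cvtxph_ps(0b0101, a);
    (full, masked)
}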
15717
15718/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15719/// floating-point elements, and store the results in dst.
15720///
15721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15722#[inline]
15723#[target_feature(enable = "avx512fp16,avx512vl")]
15724#[cfg_attr(test, assert_instr(vcvtph2psx))]
15725#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15726pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15727    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15728}
15729
15730/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15731/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15732/// dst when the corresponding mask bit is not set).
15733///
15734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15735#[inline]
15736#[target_feature(enable = "avx512fp16,avx512vl")]
15737#[cfg_attr(test, assert_instr(vcvtph2psx))]
15738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15739pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15740    unsafe { vcvtph2psx_256(a, src, k) }
15741}
15742
15743/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15744/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15745/// corresponding mask bit is not set).
15746///
15747/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15748#[inline]
15749#[target_feature(enable = "avx512fp16,avx512vl")]
15750#[cfg_attr(test, assert_instr(vcvtph2psx))]
15751#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15752pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15753    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15754}
15755
15756/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15757/// floating-point elements, and store the results in dst.
15758///
15759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15760#[inline]
15761#[target_feature(enable = "avx512fp16")]
15762#[cfg_attr(test, assert_instr(vcvtph2psx))]
15763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15764pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15765    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15766}
15767
15768/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15769/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15770/// dst when the corresponding mask bit is not set).
15771///
15772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15773#[inline]
15774#[target_feature(enable = "avx512fp16")]
15775#[cfg_attr(test, assert_instr(vcvtph2psx))]
15776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15777pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15778    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15779}
15780
15781/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15782/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15783/// corresponding mask bit is not set).
15784///
15785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15786#[inline]
15787#[target_feature(enable = "avx512fp16")]
15788#[cfg_attr(test, assert_instr(vcvtph2psx))]
15789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15790pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15791    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15792}
15793
15794/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15795/// floating-point elements, and store the results in dst.
15796///
15797/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15798///
15799/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
15800#[inline]
15801#[target_feature(enable = "avx512fp16")]
15802#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15803#[rustc_legacy_const_generics(1)]
15804#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15805pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15806    static_assert_sae!(SAE);
15807    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
15808}
15809
15810/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15811/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15812/// dst when the corresponding mask bit is not set).
15813///
15814/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15815///
15816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
15817#[inline]
15818#[target_feature(enable = "avx512fp16")]
15819#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15820#[rustc_legacy_const_generics(3)]
15821#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15822pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
15823    src: __m512,
15824    k: __mmask16,
15825    a: __m256h,
15826) -> __m512 {
15827    unsafe {
15828        static_assert_sae!(SAE);
15829        vcvtph2psx_512(a, src, k, SAE)
15830    }
15831}
15832
15833/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15834/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15835/// corresponding mask bit is not set).
15836///
15837/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15838///
15839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
15840#[inline]
15841#[target_feature(enable = "avx512fp16")]
15842#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15843#[rustc_legacy_const_generics(2)]
15844#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15845pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
15846    static_assert_sae!(SAE);
15847    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
15848}
15849
15850/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15851/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
15852/// elements from a to the upper elements of dst.
15853///
15854/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
15855#[inline]
15856#[target_feature(enable = "avx512fp16")]
15857#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15858#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15859pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
15860    _mm_mask_cvtsh_ss(a, 0xff, a, b)
15861}
15862
15863/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15864/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15865/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15866/// upper elements of dst.
15867///
15868/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
15869#[inline]
15870#[target_feature(enable = "avx512fp16")]
15871#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15872#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15873pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15874    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
15875}
15876
15877/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15878/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15879/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15880/// of dst.
15881///
15882/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
15883#[inline]
15884#[target_feature(enable = "avx512fp16")]
15885#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15887pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15888    _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b)
15889}
15890
15891/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15892/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
15893/// from a to the upper elements of dst.
15894///
15895/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15896///
15897/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
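///
/// A minimal usage sketch (illustrative only; same feature and hardware assumptions as the
/// non-rounding variant):
///
/// ```ignore
/// let a = _mm_set1_ps(2.0);
/// let b = _mm_set_sh(1.0);
/// // SAE form: same result as `_mm_cvtsh_ss`, but with exceptions suppressed.
/// let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
/// // r = [1.0, 2.0, 2.0, 2.0]
/// ```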
15898#[inline]
15899#[target_feature(enable = "avx512fp16")]
15900#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15901#[rustc_legacy_const_generics(2)]
15902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15903pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
15904    static_assert_sae!(SAE);
15905    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
15906}
15907
15908/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15909/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15910/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15911/// upper elements of dst.
15912///
15913/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15914///
15915/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
15916#[inline]
15917#[target_feature(enable = "avx512fp16")]
15918#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15919#[rustc_legacy_const_generics(4)]
15920#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15921pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
15922    src: __m128,
15923    k: __mmask8,
15924    a: __m128,
15925    b: __m128h,
15926) -> __m128 {
15927    unsafe {
15928        static_assert_sae!(SAE);
15929        vcvtsh2ss(a, b, src, k, SAE)
15930    }
15931}
15932
15933/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15934/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15935/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15936/// of dst.
15937///
15938/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15939///
15940/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
15941#[inline]
15942#[target_feature(enable = "avx512fp16")]
15943#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15944#[rustc_legacy_const_generics(3)]
15945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15946pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15947    static_assert_sae!(SAE);
15948    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_setzero_ps(), k, a, b)
15949}
15950
15951/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15952/// floating-point elements, and store the results in dst.
15953///
15954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
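///
/// A minimal usage sketch (illustrative only; assumes the unstable `stdarch_x86_avx512_f16`
/// feature plus AVX512-FP16 and AVX512VL hardware). Only the two lowest f16 lanes of `a` are
/// converted, since the 128-bit destination holds two f64 lanes:
///
/// ```ignore
/// let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
/// let r = _mm_cvtph_pd(a);
/// // r = [1.0, 2.0]
/// ```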
15955#[inline]
15956#[target_feature(enable = "avx512fp16,avx512vl")]
15957#[cfg_attr(test, assert_instr(vcvtph2pd))]
15958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15959pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
15960    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
15961}
15962
15963/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15964/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15965/// dst when the corresponding mask bit is not set).
15966///
15967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
15968#[inline]
15969#[target_feature(enable = "avx512fp16,avx512vl")]
15970#[cfg_attr(test, assert_instr(vcvtph2pd))]
15971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15972pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
15973    unsafe { vcvtph2pd_128(a, src, k) }
15974}
15975
15976/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15977/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15978/// corresponding mask bit is not set).
15979///
15980/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
15981#[inline]
15982#[target_feature(enable = "avx512fp16,avx512vl")]
15983#[cfg_attr(test, assert_instr(vcvtph2pd))]
15984#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15985pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
15986    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
15987}
15988
15989/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15990/// floating-point elements, and store the results in dst.
15991///
15992/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
15993#[inline]
15994#[target_feature(enable = "avx512fp16,avx512vl")]
15995#[cfg_attr(test, assert_instr(vcvtph2pd))]
15996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15997pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
15998    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
15999}
16000
16001/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16002/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16003/// dst when the corresponding mask bit is not set).
16004///
16005/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
16006#[inline]
16007#[target_feature(enable = "avx512fp16,avx512vl")]
16008#[cfg_attr(test, assert_instr(vcvtph2pd))]
16009#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16010pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
16011    unsafe { vcvtph2pd_256(a, src, k) }
16012}
16013
16014/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16015/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16016/// corresponding mask bit is not set).
16017///
16018/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16019#[inline]
16020#[target_feature(enable = "avx512fp16,avx512vl")]
16021#[cfg_attr(test, assert_instr(vcvtph2pd))]
16022#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16023pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16024    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16025}
16026
16027/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16028/// floating-point elements, and store the results in dst.
16029///
16030/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
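///
/// A minimal usage sketch (illustrative only; assumes the unstable `stdarch_x86_avx512_f16`
/// feature and AVX512-FP16 hardware):
///
/// ```ignore
/// // Widen eight f16 lanes to eight f64 lanes.
/// let a = _mm_set1_ph(0.5);
/// let r = _mm512_cvtph_pd(a);
/// // Every f64 lane of `r` now holds 0.5.
/// ```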
16031#[inline]
16032#[target_feature(enable = "avx512fp16")]
16033#[cfg_attr(test, assert_instr(vcvtph2pd))]
16034#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16035pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16036    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16037}
16038
16039/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16040/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16041/// dst when the corresponding mask bit is not set).
16042///
16043/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16044#[inline]
16045#[target_feature(enable = "avx512fp16")]
16046#[cfg_attr(test, assert_instr(vcvtph2pd))]
16047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16048pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16049    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16050}
16051
16052/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16053/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16054/// corresponding mask bit is not set).
16055///
16056/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16057#[inline]
16058#[target_feature(enable = "avx512fp16")]
16059#[cfg_attr(test, assert_instr(vcvtph2pd))]
16060#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16061pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16062    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16063}
16064
16065/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16066/// floating-point elements, and store the results in dst.
16067///
16068/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16069///
16070/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
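///
/// A minimal usage sketch (illustrative only; same feature and hardware assumptions as
/// `_mm512_cvtph_pd`):
///
/// ```ignore
/// let a = _mm_set1_ph(0.5);
/// // SAE form of `_mm512_cvtph_pd`: exceptions are suppressed during the conversion.
/// let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
/// // Every f64 lane of `r` now holds 0.5.
/// ```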
16071#[inline]
16072#[target_feature(enable = "avx512fp16")]
16073#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16074#[rustc_legacy_const_generics(1)]
16075#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16076pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16077    static_assert_sae!(SAE);
16078    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16079}
16080
16081/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16082/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16083/// dst when the corresponding mask bit is not set).
16084///
16085/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16086///
16087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16088#[inline]
16089#[target_feature(enable = "avx512fp16")]
16090#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16091#[rustc_legacy_const_generics(3)]
16092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16093pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16094    src: __m512d,
16095    k: __mmask8,
16096    a: __m128h,
16097) -> __m512d {
16098    unsafe {
16099        static_assert_sae!(SAE);
16100        vcvtph2pd_512(a, src, k, SAE)
16101    }
16102}
16103
16104/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16105/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16106/// corresponding mask bit is not set).
16107///
16108/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16109///
16110/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16111#[inline]
16112#[target_feature(enable = "avx512fp16")]
16113#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16114#[rustc_legacy_const_generics(2)]
16115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16116pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16117    static_assert_sae!(SAE);
16118    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16119}
16120
16121/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16122/// floating-point element, store the result in the lower element of dst, and copy the upper element
16123/// from a to the upper element of dst.
16124///
16125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
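///
/// A minimal usage sketch (illustrative only; assumes the unstable `stdarch_x86_avx512_f16`
/// feature and AVX512-FP16 hardware):
///
/// ```ignore
/// let a = _mm_set1_pd(3.0);
/// let b = _mm_set_sh(1.0);
/// let r = _mm_cvtsh_sd(a, b);
/// // Lane 0 of `r` is 1.0 (converted from f16); lane 1 is copied from `a` (3.0).
/// ```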
16126#[inline]
16127#[target_feature(enable = "avx512fp16")]
16128#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16129#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16130pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16131    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16132}
16133
16134/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16135/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16136/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16137/// of dst.
16138///
16139/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16140#[inline]
16141#[target_feature(enable = "avx512fp16")]
16142#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16143#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16144pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16145    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16146}
16147
16148/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16149/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16150/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16151///
16152/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16153#[inline]
16154#[target_feature(enable = "avx512fp16")]
16155#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16157pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16158    _mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b)
16159}
16160
16161/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16162/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16163/// to the upper element of dst.
16164///
16165/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16166///
16167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16168#[inline]
16169#[target_feature(enable = "avx512fp16")]
16170#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16171#[rustc_legacy_const_generics(2)]
16172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16173pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16174    static_assert_sae!(SAE);
16175    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16176}
16177
16178/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16179/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16180/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16181/// of dst.
16182///
16183/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16184///
16185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16186#[inline]
16187#[target_feature(enable = "avx512fp16")]
16188#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16189#[rustc_legacy_const_generics(4)]
16190#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16191pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16192    src: __m128d,
16193    k: __mmask8,
16194    a: __m128d,
16195    b: __m128h,
16196) -> __m128d {
16197    unsafe {
16198        static_assert_sae!(SAE);
16199        vcvtsh2sd(a, b, src, k, SAE)
16200    }
16201}
16202
16203/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16204/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16205/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16206///
16207/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16208///
16209/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16210#[inline]
16211#[target_feature(enable = "avx512fp16")]
16212#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16213#[rustc_legacy_const_generics(3)]
16214#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16215pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16216    static_assert_sae!(SAE);
16217    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
16218}
16219
16220/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16221///
16222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
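///
/// A minimal usage sketch (illustrative only; assumes the unstable `stdarch_x86_avx512_f16`
/// feature and AVX512-FP16 hardware):
///
/// ```ignore
/// let a = _mm_setr_ph(7.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 8.0);
/// // Extracts the lowest f16 lane as a scalar.
/// assert_eq!(_mm_cvtsh_h(a), 7.0);
/// ```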
16223#[inline]
16224#[target_feature(enable = "avx512fp16")]
16225#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16226pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
16227    unsafe { simd_extract!(a, 0) }
16228}
16229
16230/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16231///
16232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
16233#[inline]
16234#[target_feature(enable = "avx512fp16")]
16235#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16236pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
16237    unsafe { simd_extract!(a, 0) }
16238}
16239
16240/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16241///
16242/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
16243#[inline]
16244#[target_feature(enable = "avx512fp16")]
16245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16246pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
16247    unsafe { simd_extract!(a, 0) }
16248}
16249
16250/// Copy the lower 16-bit integer in a to dst.
16251///
16252/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
16253#[inline]
16254#[target_feature(enable = "avx512fp16")]
16255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16256pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
16257    unsafe { simd_extract!(a.as_i16x8(), 0) }
16258}
16259
16260/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
16261///
16262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
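///
/// A minimal usage sketch (illustrative only; assumes the unstable `stdarch_x86_avx512_f16`
/// feature and AVX512-FP16 hardware):
///
/// ```ignore
/// let v = _mm_cvtsi16_si128(42);
/// // Lane 0 of `v` holds 42; the remaining seven 16-bit lanes are zero.
/// assert_eq!(_mm_cvtsi128_si16(v), 42);
/// ```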
16263#[inline]
16264#[target_feature(enable = "avx512fp16")]
16265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16266pub fn _mm_cvtsi16_si128(a: i16) -> __m128i {
16267    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
16268}
16269
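// Raw bindings to the LLVM intrinsics that back the safe wrappers above, mostly under the
// `llvm.x86.avx512fp16.*` namespace. The trailing `rounding`/`sae` parameters take the same
// `_MM_FROUND_*` values that the public rounding/SAE intrinsics forward to them.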
16270#[allow(improper_ctypes)]
16271unsafe extern "C" {
16272    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16273    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16274    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16275    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16276
16277    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16278    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16279    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16280    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16281    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16282    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16283    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16284    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16285
16286    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16287    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16288    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16289    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16290    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16291    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16292    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16293    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16294
16295    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16296    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16297    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16298    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16299    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16300    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16301    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16302    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16303
16304    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16305    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16306    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16307    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16308    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16309    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16310    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16311    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16312
16313    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16314    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16315    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16316    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16317    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16318    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16319    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16320    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16321    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16322    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16323    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16324    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16325    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16326    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16327    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16328    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16329
16330    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16331    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16332    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16333    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16334    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16335    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16336    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16337    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16338    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16339    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16340    -> __m512;
16341    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16342    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16343    -> __m512;
16344    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16345    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16346    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16347    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16348
16349    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16350    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16351    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16352    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16353
16354    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
16355    fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
16356    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
16357    fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
16358    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16359    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16360
16361    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16362    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16363    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16364    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16365    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16366    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16367    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16368    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16369
16370    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16371    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16372    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16373    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16374    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16375    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16376    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16377    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16378
16379    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16380    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16381    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16382    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16383
16384    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16385    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16386    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16387    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16388    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16389    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16390    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16391    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16392
16393    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16394    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16395    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16396    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16397    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16398    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16399    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16400    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16401
16402    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16403    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16404    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16405    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16406    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16407    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16408    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16409    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16410
16411    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16412    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16413    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16414    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16415    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16416    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16417    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16418    fn vgetmantsh(
16419        a: __m128h,
16420        b: __m128h,
16421        imm8: i32,
16422        src: __m128h,
16423        k: __mmask8,
16424        sae: i32,
16425    ) -> __m128h;
16426
16427    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16428    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16429    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16430    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16431    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16432    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16433    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16434    fn vrndscalesh(
16435        a: __m128h,
16436        b: __m128h,
16437        src: __m128h,
16438        k: __mmask8,
16439        imm8: i32,
16440        sae: i32,
16441    ) -> __m128h;
16442
16443    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16444    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16445    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16446    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16447    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16448    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16449    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16450    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16451
16452    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16453    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16454    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16455    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16456    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16457    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16458    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16459    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16460    -> __m128h;
16461
16462    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16463    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16464
16465    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16466    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16467    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16468    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16469    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16470    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16471    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u16"]
16472    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16473    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u16"]
16474    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16475    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32u16"]
16476    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16477
16478    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16479    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16480    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16481    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16482    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16483    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16484    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16485    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16486    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16487    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16488    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u32"]
16489    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16490    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u32"]
16491    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16492    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16493    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16494
16495    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16496    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16497    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16498    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16499    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16500    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16501    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16502    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16503    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16504    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16505    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u64"]
16506    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16507
16508    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16509    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16510    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16511    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16512    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16513    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16514    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16515    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16516
16517    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16518    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16519    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16520    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16521    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16522    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16523    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16524    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16525
16526    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16527    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16528    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16529    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16530    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16531    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16532    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16533    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16534    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16535    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16536    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16537    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, rounding: i32) -> u16x32;
16538
16539    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16540    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16541    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16542    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16543    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16544    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16545    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16546    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16547    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16548    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16549    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16550    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16551
16552    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16553    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16554    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16555    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16556    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16557    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16558    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16559    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16560    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16561    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16562    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16563    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16564    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16565    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16566    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16567    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16568
16569    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16570    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16571    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16572    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16573    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16574    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16575    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16576    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16577    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16578    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16579    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16580    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16581    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16582    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16583    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16584    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16585
16586    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16587    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16588    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16589    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16590    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16591    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16592    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16593    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16594    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16595    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16596    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16597    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16598
16599    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16600    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16601    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16602    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16603    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16604    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16605    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16606    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16607    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16608    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16609    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16610    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16611
16612    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16613    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16614    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16615    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16616    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16617    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16618    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16619    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16620
16621    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16622    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16623    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16624    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16625    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16626    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16627    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16628    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16630}
16631
16632#[cfg(test)]
16633mod tests {
16634    use crate::core_arch::x86::*;
16635    use crate::mem::transmute;
16636    use crate::ptr::{addr_of, addr_of_mut};
16637    use stdarch_test::simd_test;
16638
16639    #[target_feature(enable = "avx512fp16")]
16640    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16641        _mm_setr_ph(re, im, re, im, re, im, re, im)
16642    }
16643
16644    #[target_feature(enable = "avx512fp16")]
16645    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16646        _mm256_setr_ph(
16647            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16648        )
16649    }
16650
16651    #[target_feature(enable = "avx512fp16")]
16652    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16653        _mm512_setr_ph(
16654            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16655            re, im, re, im, re, im, re, im, re, im,
16656        )
16657    }
16658
16659    #[simd_test(enable = "avx512fp16")]
16660    unsafe fn test_mm_set_ph() {
16661        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16662        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16663        assert_eq_m128h(r, e);
16664    }
16665
16666    #[simd_test(enable = "avx512fp16")]
16667    unsafe fn test_mm256_set_ph() {
16668        let r = _mm256_set_ph(
16669            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16670        );
16671        let e = _mm256_setr_ph(
16672            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16673        );
16674        assert_eq_m256h(r, e);
16675    }
16676
16677    #[simd_test(enable = "avx512fp16")]
16678    unsafe fn test_mm512_set_ph() {
16679        let r = _mm512_set_ph(
16680            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16681            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16682            31.0, 32.0,
16683        );
16684        let e = _mm512_setr_ph(
16685            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16686            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16687            3.0, 2.0, 1.0,
16688        );
16689        assert_eq_m512h(r, e);
16690    }
16691
16692    #[simd_test(enable = "avx512fp16")]
16693    unsafe fn test_mm_set_sh() {
16694        let r = _mm_set_sh(1.0);
16695        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16696        assert_eq_m128h(r, e);
16697    }
16698
16699    #[simd_test(enable = "avx512fp16")]
16700    unsafe fn test_mm_set1_ph() {
16701        let r = _mm_set1_ph(1.0);
16702        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16703        assert_eq_m128h(r, e);
16704    }
16705
16706    #[simd_test(enable = "avx512fp16")]
16707    unsafe fn test_mm256_set1_ph() {
16708        let r = _mm256_set1_ph(1.0);
16709        let e = _mm256_set_ph(
16710            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16711        );
16712        assert_eq_m256h(r, e);
16713    }
16714
16715    #[simd_test(enable = "avx512fp16")]
16716    unsafe fn test_mm512_set1_ph() {
16717        let r = _mm512_set1_ph(1.0);
16718        let e = _mm512_set_ph(
16719            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16720            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16721        );
16722        assert_eq_m512h(r, e);
16723    }
16724
16725    #[simd_test(enable = "avx512fp16")]
16726    unsafe fn test_mm_setr_ph() {
16727        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16728        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16729        assert_eq_m128h(r, e);
16730    }
16731
16732    #[simd_test(enable = "avx512fp16")]
16733    unsafe fn test_mm256_setr_ph() {
16734        let r = _mm256_setr_ph(
16735            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16736        );
16737        let e = _mm256_set_ph(
16738            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16739        );
16740        assert_eq_m256h(r, e);
16741    }
16742
16743    #[simd_test(enable = "avx512fp16")]
16744    unsafe fn test_mm512_setr_ph() {
16745        let r = _mm512_setr_ph(
16746            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16747            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16748            31.0, 32.0,
16749        );
16750        let e = _mm512_set_ph(
16751            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16752            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16753            3.0, 2.0, 1.0,
16754        );
16755        assert_eq_m512h(r, e);
16756    }
16757
16758    #[simd_test(enable = "avx512fp16,avx512vl")]
16759    unsafe fn test_mm_setzero_ph() {
16760        let r = _mm_setzero_ph();
16761        let e = _mm_set1_ph(0.0);
16762        assert_eq_m128h(r, e);
16763    }
16764
16765    #[simd_test(enable = "avx512fp16,avx512vl")]
16766    unsafe fn test_mm256_setzero_ph() {
16767        let r = _mm256_setzero_ph();
16768        let e = _mm256_set1_ph(0.0);
16769        assert_eq_m256h(r, e);
16770    }
16771
16772    #[simd_test(enable = "avx512fp16")]
16773    unsafe fn test_mm512_setzero_ph() {
16774        let r = _mm512_setzero_ph();
16775        let e = _mm512_set1_ph(0.0);
16776        assert_eq_m512h(r, e);
16777    }
16778
16779    #[simd_test(enable = "avx512fp16")]
16780    unsafe fn test_mm_castsi128_ph() {
16781        let a = _mm_set1_epi16(0x3c00);
16782        let r = _mm_castsi128_ph(a);
16783        let e = _mm_set1_ph(1.0);
16784        assert_eq_m128h(r, e);
16785    }
16786
16787    #[simd_test(enable = "avx512fp16")]
16788    unsafe fn test_mm256_castsi256_ph() {
16789        let a = _mm256_set1_epi16(0x3c00);
16790        let r = _mm256_castsi256_ph(a);
16791        let e = _mm256_set1_ph(1.0);
16792        assert_eq_m256h(r, e);
16793    }
16794
16795    #[simd_test(enable = "avx512fp16")]
16796    unsafe fn test_mm512_castsi512_ph() {
16797        let a = _mm512_set1_epi16(0x3c00);
16798        let r = _mm512_castsi512_ph(a);
16799        let e = _mm512_set1_ph(1.0);
16800        assert_eq_m512h(r, e);
16801    }
16802
16803    #[simd_test(enable = "avx512fp16")]
16804    unsafe fn test_mm_castph_si128() {
16805        let a = _mm_set1_ph(1.0);
16806        let r = _mm_castph_si128(a);
16807        let e = _mm_set1_epi16(0x3c00);
16808        assert_eq_m128i(r, e);
16809    }
16810
16811    #[simd_test(enable = "avx512fp16")]
16812    unsafe fn test_mm256_castph_si256() {
16813        let a = _mm256_set1_ph(1.0);
16814        let r = _mm256_castph_si256(a);
16815        let e = _mm256_set1_epi16(0x3c00);
16816        assert_eq_m256i(r, e);
16817    }
16818
16819    #[simd_test(enable = "avx512fp16")]
16820    unsafe fn test_mm512_castph_si512() {
16821        let a = _mm512_set1_ph(1.0);
16822        let r = _mm512_castph_si512(a);
16823        let e = _mm512_set1_epi16(0x3c00);
16824        assert_eq_m512i(r, e);
16825    }
16826
16827    #[simd_test(enable = "avx512fp16")]
16828    unsafe fn test_mm_castps_ph() {
16829        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16830        let r = _mm_castps_ph(a);
16831        let e = _mm_set1_ph(1.0);
16832        assert_eq_m128h(r, e);
16833    }
16834
16835    #[simd_test(enable = "avx512fp16")]
16836    unsafe fn test_mm256_castps_ph() {
16837        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16838        let r = _mm256_castps_ph(a);
16839        let e = _mm256_set1_ph(1.0);
16840        assert_eq_m256h(r, e);
16841    }
16842
16843    #[simd_test(enable = "avx512fp16")]
16844    unsafe fn test_mm512_castps_ph() {
16845        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16846        let r = _mm512_castps_ph(a);
16847        let e = _mm512_set1_ph(1.0);
16848        assert_eq_m512h(r, e);
16849    }
16850
16851    #[simd_test(enable = "avx512fp16")]
16852    unsafe fn test_mm_castph_ps() {
16853        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16854        let r = _mm_castph_ps(a);
16855        let e = _mm_set1_ps(1.0);
16856        assert_eq_m128(r, e);
16857    }
16858
16859    #[simd_test(enable = "avx512fp16")]
16860    unsafe fn test_mm256_castph_ps() {
16861        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16862        let r = _mm256_castph_ps(a);
16863        let e = _mm256_set1_ps(1.0);
16864        assert_eq_m256(r, e);
16865    }
16866
16867    #[simd_test(enable = "avx512fp16")]
16868    unsafe fn test_mm512_castph_ps() {
16869        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16870        let r = _mm512_castph_ps(a);
16871        let e = _mm512_set1_ps(1.0);
16872        assert_eq_m512(r, e);
16873    }
16874
16875    #[simd_test(enable = "avx512fp16")]
16876    unsafe fn test_mm_castpd_ph() {
16877        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16878        let r = _mm_castpd_ph(a);
16879        let e = _mm_set1_ph(1.0);
16880        assert_eq_m128h(r, e);
16881    }
16882
16883    #[simd_test(enable = "avx512fp16")]
16884    unsafe fn test_mm256_castpd_ph() {
16885        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16886        let r = _mm256_castpd_ph(a);
16887        let e = _mm256_set1_ph(1.0);
16888        assert_eq_m256h(r, e);
16889    }
16890
16891    #[simd_test(enable = "avx512fp16")]
16892    unsafe fn test_mm512_castpd_ph() {
16893        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16894        let r = _mm512_castpd_ph(a);
16895        let e = _mm512_set1_ph(1.0);
16896        assert_eq_m512h(r, e);
16897    }
16898
16899    #[simd_test(enable = "avx512fp16")]
16900    unsafe fn test_mm_castph_pd() {
16901        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16902        let r = _mm_castph_pd(a);
16903        let e = _mm_set1_pd(1.0);
16904        assert_eq_m128d(r, e);
16905    }
16906
16907    #[simd_test(enable = "avx512fp16")]
16908    unsafe fn test_mm256_castph_pd() {
16909        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16910        let r = _mm256_castph_pd(a);
16911        let e = _mm256_set1_pd(1.0);
16912        assert_eq_m256d(r, e);
16913    }
16914
16915    #[simd_test(enable = "avx512fp16")]
16916    unsafe fn test_mm512_castph_pd() {
16917        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16918        let r = _mm512_castph_pd(a);
16919        let e = _mm512_set1_pd(1.0);
16920        assert_eq_m512d(r, e);
16921    }
16922
16923    #[simd_test(enable = "avx512fp16")]
16924    unsafe fn test_mm256_castph256_ph128() {
16925        let a = _mm256_setr_ph(
16926            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16927        );
16928        let r = _mm256_castph256_ph128(a);
16929        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16930        assert_eq_m128h(r, e);
16931    }
16932
16933    #[simd_test(enable = "avx512fp16")]
16934    unsafe fn test_mm512_castph512_ph128() {
16935        let a = _mm512_setr_ph(
16936            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16937            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16938        );
16939        let r = _mm512_castph512_ph128(a);
16940        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16941        assert_eq_m128h(r, e);
16942    }
16943
16944    #[simd_test(enable = "avx512fp16")]
16945    unsafe fn test_mm512_castph512_ph256() {
16946        let a = _mm512_setr_ph(
16947            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16948            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16949        );
16950        let r = _mm512_castph512_ph256(a);
16951        let e = _mm256_setr_ph(
16952            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16953        );
16954        assert_eq_m256h(r, e);
16955    }
16956
16957    #[simd_test(enable = "avx512fp16")]
16958    unsafe fn test_mm256_castph128_ph256() {
16959        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16960        let r = _mm256_castph128_ph256(a);
16961        assert_eq_m128h(_mm256_castph256_ph128(r), a);
16962    }
16963
16964    #[simd_test(enable = "avx512fp16")]
16965    unsafe fn test_mm512_castph128_ph512() {
16966        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16967        let r = _mm512_castph128_ph512(a);
16968        assert_eq_m128h(_mm512_castph512_ph128(r), a);
16969    }
16970
16971    #[simd_test(enable = "avx512fp16")]
16972    unsafe fn test_mm512_castph256_ph512() {
16973        let a = _mm256_setr_ph(
16974            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16975        );
16976        let r = _mm512_castph256_ph512(a);
16977        assert_eq_m256h(_mm512_castph512_ph256(r), a);
16978    }
16979
16980    #[simd_test(enable = "avx512fp16")]
16981    unsafe fn test_mm256_zextph128_ph256() {
16982        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16983        let r = _mm256_zextph128_ph256(a);
16984        let e = _mm256_setr_ph(
16985            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
16986        );
16987        assert_eq_m256h(r, e);
16988    }
16989
16990    #[simd_test(enable = "avx512fp16")]
16991    unsafe fn test_mm512_zextph128_ph512() {
16992        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16993        let r = _mm512_zextph128_ph512(a);
16994        let e = _mm512_setr_ph(
16995            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16996            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16997        );
16998        assert_eq_m512h(r, e);
16999    }
17000
17001    #[simd_test(enable = "avx512fp16")]
17002    unsafe fn test_mm512_zextph256_ph512() {
17003        let a = _mm256_setr_ph(
17004            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17005        );
17006        let r = _mm512_zextph256_ph512(a);
17007        let e = _mm512_setr_ph(
17008            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
17009            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17010        );
17011        assert_eq_m512h(r, e);
17012    }
17013
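    // The comparison tests below use `_CMP_EQ_OQ` (equal, ordered, quiet).
    // Keep the element ordering in mind when reading the expected masks:
    // `_mm_set_ph(e7, ..., e0)` puts its *last* argument in lane 0, and bit i
    // of the returned mask corresponds to lane i. A minimal sketch (not part
    // of the original suite):
    //
    //     let m = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(_mm_set1_ph(1.0), _mm_set_sh(1.0));
    //     assert_eq!(m, 0b00000001); // only lane 0 holds 1.0 in both operands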
17014    #[simd_test(enable = "avx512fp16,avx512vl")]
17015    unsafe fn test_mm_cmp_ph_mask() {
17016        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17017        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17018        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17019        assert_eq!(r, 0b11110000);
17020    }
17021
17022    #[simd_test(enable = "avx512fp16,avx512vl")]
17023    unsafe fn test_mm_mask_cmp_ph_mask() {
17024        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17025        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17026        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17027        assert_eq!(r, 0b01010000);
17028    }
17029
17030    #[simd_test(enable = "avx512fp16,avx512vl")]
17031    unsafe fn test_mm256_cmp_ph_mask() {
17032        let a = _mm256_set_ph(
17033            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17034        );
17035        let b = _mm256_set_ph(
17036            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17037            -16.0,
17038        );
17039        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17040        assert_eq!(r, 0b1111000011110000);
17041    }
17042
17043    #[simd_test(enable = "avx512fp16,avx512vl")]
17044    unsafe fn test_mm256_mask_cmp_ph_mask() {
17045        let a = _mm256_set_ph(
17046            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17047        );
17048        let b = _mm256_set_ph(
17049            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17050            -16.0,
17051        );
17052        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17053        assert_eq!(r, 0b0101000001010000);
17054    }
17055
17056    #[simd_test(enable = "avx512fp16")]
17057    unsafe fn test_mm512_cmp_ph_mask() {
17058        let a = _mm512_set_ph(
17059            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17060            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17061            31.0, 32.0,
17062        );
17063        let b = _mm512_set_ph(
17064            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17065            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17066            -29.0, -30.0, -31.0, -32.0,
17067        );
17068        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17069        assert_eq!(r, 0b11110000111100001111000011110000);
17070    }
17071
17072    #[simd_test(enable = "avx512fp16")]
17073    unsafe fn test_mm512_mask_cmp_ph_mask() {
17074        let a = _mm512_set_ph(
17075            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17076            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17077            31.0, 32.0,
17078        );
17079        let b = _mm512_set_ph(
17080            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17081            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17082            -29.0, -30.0, -31.0, -32.0,
17083        );
17084        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
17085        assert_eq!(r, 0b01010000010100000101000001010000);
17086    }
17087
17088    #[simd_test(enable = "avx512fp16")]
17089    unsafe fn test_mm512_cmp_round_ph_mask() {
17090        let a = _mm512_set_ph(
17091            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17092            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17093            31.0, 32.0,
17094        );
17095        let b = _mm512_set_ph(
17096            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17097            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17098            -29.0, -30.0, -31.0, -32.0,
17099        );
17100        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17101        assert_eq!(r, 0b11110000111100001111000011110000);
17102    }
17103
17104    #[simd_test(enable = "avx512fp16")]
17105    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
17106        let a = _mm512_set_ph(
17107            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17108            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17109            31.0, 32.0,
17110        );
17111        let b = _mm512_set_ph(
17112            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17113            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17114            -29.0, -30.0, -31.0, -32.0,
17115        );
17116        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
17117            0b01010101010101010101010101010101,
17118            a,
17119            b,
17120        );
17121        assert_eq!(r, 0b01010000010100000101000001010000);
17122    }
17123
17124    #[simd_test(enable = "avx512fp16")]
17125    unsafe fn test_mm_cmp_round_sh_mask() {
17126        let a = _mm_set_sh(1.0);
17127        let b = _mm_set_sh(1.0);
17128        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17129        assert_eq!(r, 1);
17130    }
17131
17132    #[simd_test(enable = "avx512fp16")]
17133    unsafe fn test_mm_mask_cmp_round_sh_mask() {
17134        let a = _mm_set_sh(1.0);
17135        let b = _mm_set_sh(1.0);
17136        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17137        assert_eq!(r, 0);
17138    }
17139
17140    #[simd_test(enable = "avx512fp16")]
17141    unsafe fn test_mm_cmp_sh_mask() {
17142        let a = _mm_set_sh(1.0);
17143        let b = _mm_set_sh(1.0);
17144        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17145        assert_eq!(r, 1);
17146    }
17147
17148    #[simd_test(enable = "avx512fp16")]
17149    unsafe fn test_mm_mask_cmp_sh_mask() {
17150        let a = _mm_set_sh(1.0);
17151        let b = _mm_set_sh(1.0);
17152        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17153        assert_eq!(r, 0);
17154    }
17155
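    // `_mm_comi_*` and `_mm_ucomi_*` compare only the lowest half-precision
    // element and return the boolean result as an i32 (1 for true, 0 for
    // false). The ordered (`comi`) and unordered (`ucomi`) variants differ
    // only in their exception behaviour on NaN inputs, which these tests do
    // not exercise; `_MM_FROUND_NO_EXC` in the `comi_round` test additionally
    // suppresses exception reporting.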
17156    #[simd_test(enable = "avx512fp16")]
17157    unsafe fn test_mm_comi_round_sh() {
17158        let a = _mm_set_sh(1.0);
17159        let b = _mm_set_sh(1.0);
17160        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17161        assert_eq!(r, 1);
17162    }
17163
17164    #[simd_test(enable = "avx512fp16")]
17165    unsafe fn test_mm_comi_sh() {
17166        let a = _mm_set_sh(1.0);
17167        let b = _mm_set_sh(1.0);
17168        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17169        assert_eq!(r, 1);
17170    }
17171
17172    #[simd_test(enable = "avx512fp16")]
17173    unsafe fn test_mm_comieq_sh() {
17174        let a = _mm_set_sh(1.0);
17175        let b = _mm_set_sh(1.0);
17176        let r = _mm_comieq_sh(a, b);
17177        assert_eq!(r, 1);
17178    }
17179
17180    #[simd_test(enable = "avx512fp16")]
17181    unsafe fn test_mm_comige_sh() {
17182        let a = _mm_set_sh(2.0);
17183        let b = _mm_set_sh(1.0);
17184        let r = _mm_comige_sh(a, b);
17185        assert_eq!(r, 1);
17186    }
17187
17188    #[simd_test(enable = "avx512fp16")]
17189    unsafe fn test_mm_comigt_sh() {
17190        let a = _mm_set_sh(2.0);
17191        let b = _mm_set_sh(1.0);
17192        let r = _mm_comigt_sh(a, b);
17193        assert_eq!(r, 1);
17194    }
17195
17196    #[simd_test(enable = "avx512fp16")]
17197    unsafe fn test_mm_comile_sh() {
17198        let a = _mm_set_sh(1.0);
17199        let b = _mm_set_sh(2.0);
17200        let r = _mm_comile_sh(a, b);
17201        assert_eq!(r, 1);
17202    }
17203
17204    #[simd_test(enable = "avx512fp16")]
17205    unsafe fn test_mm_comilt_sh() {
17206        let a = _mm_set_sh(1.0);
17207        let b = _mm_set_sh(2.0);
17208        let r = _mm_comilt_sh(a, b);
17209        assert_eq!(r, 1);
17210    }
17211
17212    #[simd_test(enable = "avx512fp16")]
17213    unsafe fn test_mm_comineq_sh() {
17214        let a = _mm_set_sh(1.0);
17215        let b = _mm_set_sh(2.0);
17216        let r = _mm_comineq_sh(a, b);
17217        assert_eq!(r, 1);
17218    }
17219
17220    #[simd_test(enable = "avx512fp16")]
17221    unsafe fn test_mm_ucomieq_sh() {
17222        let a = _mm_set_sh(1.0);
17223        let b = _mm_set_sh(1.0);
17224        let r = _mm_ucomieq_sh(a, b);
17225        assert_eq!(r, 1);
17226    }
17227
17228    #[simd_test(enable = "avx512fp16")]
17229    unsafe fn test_mm_ucomige_sh() {
17230        let a = _mm_set_sh(2.0);
17231        let b = _mm_set_sh(1.0);
17232        let r = _mm_ucomige_sh(a, b);
17233        assert_eq!(r, 1);
17234    }
17235
17236    #[simd_test(enable = "avx512fp16")]
17237    unsafe fn test_mm_ucomigt_sh() {
17238        let a = _mm_set_sh(2.0);
17239        let b = _mm_set_sh(1.0);
17240        let r = _mm_ucomigt_sh(a, b);
17241        assert_eq!(r, 1);
17242    }
17243
17244    #[simd_test(enable = "avx512fp16")]
17245    unsafe fn test_mm_ucomile_sh() {
17246        let a = _mm_set_sh(1.0);
17247        let b = _mm_set_sh(2.0);
17248        let r = _mm_ucomile_sh(a, b);
17249        assert_eq!(r, 1);
17250    }
17251
17252    #[simd_test(enable = "avx512fp16")]
17253    unsafe fn test_mm_ucomilt_sh() {
17254        let a = _mm_set_sh(1.0);
17255        let b = _mm_set_sh(2.0);
17256        let r = _mm_ucomilt_sh(a, b);
17257        assert_eq!(r, 1);
17258    }
17259
17260    #[simd_test(enable = "avx512fp16")]
17261    unsafe fn test_mm_ucomineq_sh() {
17262        let a = _mm_set_sh(1.0);
17263        let b = _mm_set_sh(2.0);
17264        let r = _mm_ucomineq_sh(a, b);
17265        assert_eq!(r, 1);
17266    }
17267
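    // Load tests: `_mm*_load_ph` expects a suitably aligned source (taking
    // the address of an existing `__m128h`/`__m256h`/`__m512h`, as done here,
    // provides that), while `_mm*_loadu_ph` has no alignment requirement and
    // is exercised with plain arrays further below. The masked scalar loads
    // fall back to `src` (or zero for `maskz`) when bit 0 of the mask is
    // clear.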
17268    #[simd_test(enable = "avx512fp16,avx512vl")]
17269    unsafe fn test_mm_load_ph() {
17270        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17271        let b = _mm_load_ph(addr_of!(a).cast());
17272        assert_eq_m128h(a, b);
17273    }
17274
17275    #[simd_test(enable = "avx512fp16,avx512vl")]
17276    unsafe fn test_mm256_load_ph() {
17277        let a = _mm256_set_ph(
17278            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17279        );
17280        let b = _mm256_load_ph(addr_of!(a).cast());
17281        assert_eq_m256h(a, b);
17282    }
17283
17284    #[simd_test(enable = "avx512fp16")]
17285    unsafe fn test_mm512_load_ph() {
17286        let a = _mm512_set_ph(
17287            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17288            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17289            31.0, 32.0,
17290        );
17291        let b = _mm512_load_ph(addr_of!(a).cast());
17292        assert_eq_m512h(a, b);
17293    }
17294
17295    #[simd_test(enable = "avx512fp16")]
17296    unsafe fn test_mm_load_sh() {
17297        let a = _mm_set_sh(1.0);
17298        let b = _mm_load_sh(addr_of!(a).cast());
17299        assert_eq_m128h(a, b);
17300    }
17301
17302    #[simd_test(enable = "avx512fp16")]
17303    unsafe fn test_mm_mask_load_sh() {
17304        let a = _mm_set_sh(1.0);
17305        let src = _mm_set_sh(2.);
17306        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
17307        assert_eq_m128h(a, b);
17308        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
17309        assert_eq_m128h(src, b);
17310    }
17311
17312    #[simd_test(enable = "avx512fp16")]
17313    unsafe fn test_mm_maskz_load_sh() {
17314        let a = _mm_set_sh(1.0);
17315        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
17316        assert_eq_m128h(a, b);
17317        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
17318        assert_eq_m128h(_mm_setzero_ph(), b);
17319    }
17320
17321    #[simd_test(enable = "avx512fp16,avx512vl")]
17322    unsafe fn test_mm_loadu_ph() {
17323        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17324        let r = _mm_loadu_ph(array.as_ptr());
17325        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17326        assert_eq_m128h(r, e);
17327    }
17328
17329    #[simd_test(enable = "avx512fp16,avx512vl")]
17330    unsafe fn test_mm256_loadu_ph() {
17331        let array = [
17332            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17333        ];
17334        let r = _mm256_loadu_ph(array.as_ptr());
17335        let e = _mm256_setr_ph(
17336            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17337        );
17338        assert_eq_m256h(r, e);
17339    }
17340
17341    #[simd_test(enable = "avx512fp16")]
17342    unsafe fn test_mm512_loadu_ph() {
17343        let array = [
17344            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17345            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17346            31.0, 32.0,
17347        ];
17348        let r = _mm512_loadu_ph(array.as_ptr());
17349        let e = _mm512_setr_ph(
17350            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17351            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17352            31.0, 32.0,
17353        );
17354        assert_eq_m512h(r, e);
17355    }
17356
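    // `_mm_move_sh(a, b)` builds a result whose lane 0 comes from `b` and
    // whose upper seven lanes come from `a`; the masked forms substitute
    // `src` lane 0 (or 0.0 for `maskz`) when the mask bit is clear, which is
    // exactly what the expected vectors below encode.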
17357    #[simd_test(enable = "avx512fp16")]
17358    unsafe fn test_mm_move_sh() {
17359        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17360        let b = _mm_set_sh(9.0);
17361        let r = _mm_move_sh(a, b);
17362        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17363        assert_eq_m128h(r, e);
17364    }
17365
17366    #[simd_test(enable = "avx512fp16")]
17367    unsafe fn test_mm_mask_move_sh() {
17368        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17369        let b = _mm_set_sh(9.0);
17370        let src = _mm_set_sh(10.0);
17371        let r = _mm_mask_move_sh(src, 0, a, b);
17372        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17373        assert_eq_m128h(r, e);
17374    }
17375
17376    #[simd_test(enable = "avx512fp16")]
17377    unsafe fn test_mm_maskz_move_sh() {
17378        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17379        let b = _mm_set_sh(9.0);
17380        let r = _mm_maskz_move_sh(0, a, b);
17381        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17382        assert_eq_m128h(r, e);
17383    }
17384
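    // Store tests mirror the loads: `_mm*_store_ph` assumes an aligned
    // destination (here the address of another vector of the same type),
    // `_mm*_storeu_ph` writes to an arbitrary `f16` array, and the masked
    // scalar store only touches memory when bit 0 of the mask is set.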
17385    #[simd_test(enable = "avx512fp16,avx512vl")]
17386    unsafe fn test_mm_store_ph() {
17387        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17388        let mut b = _mm_setzero_ph();
17389        _mm_store_ph(addr_of_mut!(b).cast(), a);
17390        assert_eq_m128h(a, b);
17391    }
17392
17393    #[simd_test(enable = "avx512fp16,avx512vl")]
17394    unsafe fn test_mm256_store_ph() {
17395        let a = _mm256_set_ph(
17396            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17397        );
17398        let mut b = _mm256_setzero_ph();
17399        _mm256_store_ph(addr_of_mut!(b).cast(), a);
17400        assert_eq_m256h(a, b);
17401    }
17402
17403    #[simd_test(enable = "avx512fp16")]
17404    unsafe fn test_mm512_store_ph() {
17405        let a = _mm512_set_ph(
17406            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17407            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17408            31.0, 32.0,
17409        );
17410        let mut b = _mm512_setzero_ph();
17411        _mm512_store_ph(addr_of_mut!(b).cast(), a);
17412        assert_eq_m512h(a, b);
17413    }
17414
17415    #[simd_test(enable = "avx512fp16")]
17416    unsafe fn test_mm_store_sh() {
17417        let a = _mm_set_sh(1.0);
17418        let mut b = _mm_setzero_ph();
17419        _mm_store_sh(addr_of_mut!(b).cast(), a);
17420        assert_eq_m128h(a, b);
17421    }
17422
17423    #[simd_test(enable = "avx512fp16")]
17424    unsafe fn test_mm_mask_store_sh() {
17425        let a = _mm_set_sh(1.0);
17426        let mut b = _mm_setzero_ph();
17427        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17428        assert_eq_m128h(_mm_setzero_ph(), b);
17429        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17430        assert_eq_m128h(a, b);
17431    }
17432
17433    #[simd_test(enable = "avx512fp16,avx512vl")]
17434    unsafe fn test_mm_storeu_ph() {
17435        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17436        let mut array = [0.0; 8];
17437        _mm_storeu_ph(array.as_mut_ptr(), a);
17438        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
17439    }
17440
17441    #[simd_test(enable = "avx512fp16,avx512vl")]
17442    unsafe fn test_mm256_storeu_ph() {
17443        let a = _mm256_set_ph(
17444            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17445        );
17446        let mut array = [0.0; 16];
17447        _mm256_storeu_ph(array.as_mut_ptr(), a);
17448        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
17449    }
17450
17451    #[simd_test(enable = "avx512fp16")]
17452    unsafe fn test_mm512_storeu_ph() {
17453        let a = _mm512_set_ph(
17454            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17455            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17456            31.0, 32.0,
17457        );
17458        let mut array = [0.0; 32];
17459        _mm512_storeu_ph(array.as_mut_ptr(), a);
17460        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
17461    }
17462
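    // Element-wise arithmetic tests follow. For the `mask_`/`maskz_` variants,
    // lane i of the result is `a[i] op b[i]` when bit i of the mask is set and
    // otherwise comes from `src` (masked) or is zeroed (`maskz`). With the
    // descending `_mm_set_ph` argument order, a mask of 0b01010101 selects the
    // even-numbered lanes, i.e. every second value counting from the
    // right-hand end of the `set_ph` argument list.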
17463    #[simd_test(enable = "avx512fp16,avx512vl")]
17464    unsafe fn test_mm_add_ph() {
17465        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17466        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17467        let r = _mm_add_ph(a, b);
17468        let e = _mm_set1_ph(9.0);
17469        assert_eq_m128h(r, e);
17470    }
17471
17472    #[simd_test(enable = "avx512fp16,avx512vl")]
17473    unsafe fn test_mm_mask_add_ph() {
17474        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17475        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17476        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17477        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17478        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17479        assert_eq_m128h(r, e);
17480    }
17481
17482    #[simd_test(enable = "avx512fp16,avx512vl")]
17483    unsafe fn test_mm_maskz_add_ph() {
17484        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17485        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17486        let r = _mm_maskz_add_ph(0b01010101, a, b);
17487        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17488        assert_eq_m128h(r, e);
17489    }
17490
17491    #[simd_test(enable = "avx512fp16,avx512vl")]
17492    unsafe fn test_mm256_add_ph() {
17493        let a = _mm256_set_ph(
17494            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17495        );
17496        let b = _mm256_set_ph(
17497            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17498        );
17499        let r = _mm256_add_ph(a, b);
17500        let e = _mm256_set1_ph(17.0);
17501        assert_eq_m256h(r, e);
17502    }
17503
17504    #[simd_test(enable = "avx512fp16,avx512vl")]
17505    unsafe fn test_mm256_mask_add_ph() {
17506        let a = _mm256_set_ph(
17507            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17508        );
17509        let b = _mm256_set_ph(
17510            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17511        );
17512        let src = _mm256_set_ph(
17513            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17514        );
17515        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17516        let e = _mm256_set_ph(
17517            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17518        );
17519        assert_eq_m256h(r, e);
17520    }
17521
17522    #[simd_test(enable = "avx512fp16,avx512vl")]
17523    unsafe fn test_mm256_maskz_add_ph() {
17524        let a = _mm256_set_ph(
17525            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17526        );
17527        let b = _mm256_set_ph(
17528            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17529        );
17530        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17531        let e = _mm256_set_ph(
17532            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17533        );
17534        assert_eq_m256h(r, e);
17535    }
17536
17537    #[simd_test(enable = "avx512fp16")]
17538    unsafe fn test_mm512_add_ph() {
17539        let a = _mm512_set_ph(
17540            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17541            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17542            31.0, 32.0,
17543        );
17544        let b = _mm512_set_ph(
17545            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17546            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17547            3.0, 2.0, 1.0,
17548        );
17549        let r = _mm512_add_ph(a, b);
17550        let e = _mm512_set1_ph(33.0);
17551        assert_eq_m512h(r, e);
17552    }
17553
17554    #[simd_test(enable = "avx512fp16")]
17555    unsafe fn test_mm512_mask_add_ph() {
17556        let a = _mm512_set_ph(
17557            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17558            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17559            31.0, 32.0,
17560        );
17561        let b = _mm512_set_ph(
17562            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17563            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17564            3.0, 2.0, 1.0,
17565        );
17566        let src = _mm512_set_ph(
17567            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17568            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17569        );
17570        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
17571        let e = _mm512_set_ph(
17572            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17573            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17574        );
17575        assert_eq_m512h(r, e);
17576    }
17577
17578    #[simd_test(enable = "avx512fp16")]
17579    unsafe fn test_mm512_maskz_add_ph() {
17580        let a = _mm512_set_ph(
17581            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17582            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17583            31.0, 32.0,
17584        );
17585        let b = _mm512_set_ph(
17586            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17587            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17588            3.0, 2.0, 1.0,
17589        );
17590        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
17591        let e = _mm512_set_ph(
17592            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17593            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17594        );
17595        assert_eq_m512h(r, e);
17596    }
17597
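    // The `*_round_*` variants take the rounding control as a const generic.
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects
    // round-to-nearest-even and suppresses floating-point exceptions; since
    // every operand and result in these tests is exactly representable in
    // binary16, the rounding mode does not affect the expected values.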
17598    #[simd_test(enable = "avx512fp16")]
17599    unsafe fn test_mm512_add_round_ph() {
17600        let a = _mm512_set_ph(
17601            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17602            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17603            31.0, 32.0,
17604        );
17605        let b = _mm512_set_ph(
17606            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17607            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17608            3.0, 2.0, 1.0,
17609        );
17610        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17611        let e = _mm512_set1_ph(33.0);
17612        assert_eq_m512h(r, e);
17613    }
17614
17615    #[simd_test(enable = "avx512fp16")]
17616    unsafe fn test_mm512_mask_add_round_ph() {
17617        let a = _mm512_set_ph(
17618            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17619            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17620            31.0, 32.0,
17621        );
17622        let b = _mm512_set_ph(
17623            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17624            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17625            3.0, 2.0, 1.0,
17626        );
17627        let src = _mm512_set_ph(
17628            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17629            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17630        );
17631        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17632            src,
17633            0b01010101010101010101010101010101,
17634            a,
17635            b,
17636        );
17637        let e = _mm512_set_ph(
17638            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17639            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17640        );
17641        assert_eq_m512h(r, e);
17642    }
17643
17644    #[simd_test(enable = "avx512fp16")]
17645    unsafe fn test_mm512_maskz_add_round_ph() {
17646        let a = _mm512_set_ph(
17647            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17648            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17649            31.0, 32.0,
17650        );
17651        let b = _mm512_set_ph(
17652            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17653            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17654            3.0, 2.0, 1.0,
17655        );
17656        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17657            0b01010101010101010101010101010101,
17658            a,
17659            b,
17660        );
17661        let e = _mm512_set_ph(
17662            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17663            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17664        );
17665        assert_eq_m512h(r, e);
17666    }
17667
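    // Scalar (`_sh`) arithmetic operates on lane 0 only; lanes 1..7 of the
    // result are copied from the first operand. Because `_mm_set_sh` zeroes
    // those upper lanes in both inputs, comparing the full vector against
    // another `_mm_set_sh` value is sufficient here. The masked scalar forms
    // again fall back to `src` or zero when the single mask bit is clear.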
17668    #[simd_test(enable = "avx512fp16")]
17669    unsafe fn test_mm_add_round_sh() {
17670        let a = _mm_set_sh(1.0);
17671        let b = _mm_set_sh(2.0);
17672        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17673        let e = _mm_set_sh(3.0);
17674        assert_eq_m128h(r, e);
17675    }
17676
17677    #[simd_test(enable = "avx512fp16")]
17678    unsafe fn test_mm_mask_add_round_sh() {
17679        let a = _mm_set_sh(1.0);
17680        let b = _mm_set_sh(2.0);
17681        let src = _mm_set_sh(4.0);
17682        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17683            src, 0, a, b,
17684        );
17685        let e = _mm_set_sh(4.0);
17686        assert_eq_m128h(r, e);
17687        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17688            src, 1, a, b,
17689        );
17690        let e = _mm_set_sh(3.0);
17691        assert_eq_m128h(r, e);
17692    }
17693
17694    #[simd_test(enable = "avx512fp16")]
17695    unsafe fn test_mm_maskz_add_round_sh() {
17696        let a = _mm_set_sh(1.0);
17697        let b = _mm_set_sh(2.0);
17698        let r =
17699            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17700        let e = _mm_set_sh(0.0);
17701        assert_eq_m128h(r, e);
17702        let r =
17703            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17704        let e = _mm_set_sh(3.0);
17705        assert_eq_m128h(r, e);
17706    }
17707
17708    #[simd_test(enable = "avx512fp16")]
17709    unsafe fn test_mm_add_sh() {
17710        let a = _mm_set_sh(1.0);
17711        let b = _mm_set_sh(2.0);
17712        let r = _mm_add_sh(a, b);
17713        let e = _mm_set_sh(3.0);
17714        assert_eq_m128h(r, e);
17715    }
17716
17717    #[simd_test(enable = "avx512fp16")]
17718    unsafe fn test_mm_mask_add_sh() {
17719        let a = _mm_set_sh(1.0);
17720        let b = _mm_set_sh(2.0);
17721        let src = _mm_set_sh(4.0);
17722        let r = _mm_mask_add_sh(src, 0, a, b);
17723        let e = _mm_set_sh(4.0);
17724        assert_eq_m128h(r, e);
17725        let r = _mm_mask_add_sh(src, 1, a, b);
17726        let e = _mm_set_sh(3.0);
17727        assert_eq_m128h(r, e);
17728    }
17729
17730    #[simd_test(enable = "avx512fp16")]
17731    unsafe fn test_mm_maskz_add_sh() {
17732        let a = _mm_set_sh(1.0);
17733        let b = _mm_set_sh(2.0);
17734        let r = _mm_maskz_add_sh(0, a, b);
17735        let e = _mm_set_sh(0.0);
17736        assert_eq_m128h(r, e);
17737        let r = _mm_maskz_add_sh(1, a, b);
17738        let e = _mm_set_sh(3.0);
17739        assert_eq_m128h(r, e);
17740    }
17741
17742    #[simd_test(enable = "avx512fp16,avx512vl")]
17743    unsafe fn test_mm_sub_ph() {
17744        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17745        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17746        let r = _mm_sub_ph(a, b);
17747        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17748        assert_eq_m128h(r, e);
17749    }
17750
17751    #[simd_test(enable = "avx512fp16,avx512vl")]
17752    unsafe fn test_mm_mask_sub_ph() {
17753        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17754        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17755        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17756        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
17757        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
17758        assert_eq_m128h(r, e);
17759    }
17760
17761    #[simd_test(enable = "avx512fp16,avx512vl")]
17762    unsafe fn test_mm_maskz_sub_ph() {
17763        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17764        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17765        let r = _mm_maskz_sub_ph(0b01010101, a, b);
17766        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
17767        assert_eq_m128h(r, e);
17768    }
17769
17770    #[simd_test(enable = "avx512fp16,avx512vl")]
17771    unsafe fn test_mm256_sub_ph() {
17772        let a = _mm256_set_ph(
17773            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17774        );
17775        let b = _mm256_set_ph(
17776            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17777        );
17778        let r = _mm256_sub_ph(a, b);
17779        let e = _mm256_set_ph(
17780            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
17781            15.0,
17782        );
17783        assert_eq_m256h(r, e);
17784    }
17785
17786    #[simd_test(enable = "avx512fp16,avx512vl")]
17787    unsafe fn test_mm256_mask_sub_ph() {
17788        let a = _mm256_set_ph(
17789            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17790        );
17791        let b = _mm256_set_ph(
17792            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17793        );
17794        let src = _mm256_set_ph(
17795            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17796        );
17797        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
17798        let e = _mm256_set_ph(
17799            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
17800        );
17801        assert_eq_m256h(r, e);
17802    }
17803
17804    #[simd_test(enable = "avx512fp16,avx512vl")]
17805    unsafe fn test_mm256_maskz_sub_ph() {
17806        let a = _mm256_set_ph(
17807            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17808        );
17809        let b = _mm256_set_ph(
17810            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17811        );
17812        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
17813        let e = _mm256_set_ph(
17814            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
17815        );
17816        assert_eq_m256h(r, e);
17817    }
17818
17819    #[simd_test(enable = "avx512fp16")]
17820    unsafe fn test_mm512_sub_ph() {
17821        let a = _mm512_set_ph(
17822            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17823            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17824            31.0, 32.0,
17825        );
17826        let b = _mm512_set_ph(
17827            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17828            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17829            3.0, 2.0, 1.0,
17830        );
17831        let r = _mm512_sub_ph(a, b);
17832        let e = _mm512_set_ph(
17833            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17834            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17835            23.0, 25.0, 27.0, 29.0, 31.0,
17836        );
17837        assert_eq_m512h(r, e);
17838    }
17839
17840    #[simd_test(enable = "avx512fp16")]
17841    unsafe fn test_mm512_mask_sub_ph() {
17842        let a = _mm512_set_ph(
17843            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17844            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17845            31.0, 32.0,
17846        );
17847        let b = _mm512_set_ph(
17848            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17849            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17850            3.0, 2.0, 1.0,
17851        );
17852        let src = _mm512_set_ph(
17853            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17854            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17855        );
17856        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
17857        let e = _mm512_set_ph(
17858            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17859            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17860        );
17861        assert_eq_m512h(r, e);
17862    }
17863
17864    #[simd_test(enable = "avx512fp16")]
17865    unsafe fn test_mm512_maskz_sub_ph() {
17866        let a = _mm512_set_ph(
17867            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17868            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17869            31.0, 32.0,
17870        );
17871        let b = _mm512_set_ph(
17872            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17873            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17874            3.0, 2.0, 1.0,
17875        );
17876        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
17877        let e = _mm512_set_ph(
17878            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17879            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17880        );
17881        assert_eq_m512h(r, e);
17882    }
17883
17884    #[simd_test(enable = "avx512fp16")]
17885    unsafe fn test_mm512_sub_round_ph() {
17886        let a = _mm512_set_ph(
17887            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17888            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17889            31.0, 32.0,
17890        );
17891        let b = _mm512_set_ph(
17892            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17893            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17894            3.0, 2.0, 1.0,
17895        );
17896        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17897        let e = _mm512_set_ph(
17898            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17899            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17900            23.0, 25.0, 27.0, 29.0, 31.0,
17901        );
17902        assert_eq_m512h(r, e);
17903    }
17904
17905    #[simd_test(enable = "avx512fp16")]
17906    unsafe fn test_mm512_mask_sub_round_ph() {
17907        let a = _mm512_set_ph(
17908            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17909            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17910            31.0, 32.0,
17911        );
17912        let b = _mm512_set_ph(
17913            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17914            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17915            3.0, 2.0, 1.0,
17916        );
17917        let src = _mm512_set_ph(
17918            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17919            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17920        );
17921        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17922            src,
17923            0b01010101010101010101010101010101,
17924            a,
17925            b,
17926        );
17927        let e = _mm512_set_ph(
17928            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17929            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17930        );
17931        assert_eq_m512h(r, e);
17932    }
17933
17934    #[simd_test(enable = "avx512fp16")]
17935    unsafe fn test_mm512_maskz_sub_round_ph() {
17936        let a = _mm512_set_ph(
17937            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17938            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17939            31.0, 32.0,
17940        );
17941        let b = _mm512_set_ph(
17942            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17943            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17944            3.0, 2.0, 1.0,
17945        );
17946        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17947            0b01010101010101010101010101010101,
17948            a,
17949            b,
17950        );
17951        let e = _mm512_set_ph(
17952            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17953            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17954        );
17955        assert_eq_m512h(r, e);
17956    }
17957
17958    #[simd_test(enable = "avx512fp16")]
17959    unsafe fn test_mm_sub_round_sh() {
17960        let a = _mm_set_sh(1.0);
17961        let b = _mm_set_sh(2.0);
17962        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17963        let e = _mm_set_sh(-1.0);
17964        assert_eq_m128h(r, e);
17965    }
17966
17967    #[simd_test(enable = "avx512fp16")]
17968    unsafe fn test_mm_mask_sub_round_sh() {
17969        let a = _mm_set_sh(1.0);
17970        let b = _mm_set_sh(2.0);
17971        let src = _mm_set_sh(4.0);
17972        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17973            src, 0, a, b,
17974        );
17975        let e = _mm_set_sh(4.0);
17976        assert_eq_m128h(r, e);
17977        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17978            src, 1, a, b,
17979        );
17980        let e = _mm_set_sh(-1.0);
17981        assert_eq_m128h(r, e);
17982    }
17983
17984    #[simd_test(enable = "avx512fp16")]
17985    unsafe fn test_mm_maskz_sub_round_sh() {
17986        let a = _mm_set_sh(1.0);
17987        let b = _mm_set_sh(2.0);
17988        let r =
17989            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17990        let e = _mm_set_sh(0.0);
17991        assert_eq_m128h(r, e);
17992        let r =
17993            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17994        let e = _mm_set_sh(-1.0);
17995        assert_eq_m128h(r, e);
17996    }
17997
17998    #[simd_test(enable = "avx512fp16")]
17999    unsafe fn test_mm_sub_sh() {
18000        let a = _mm_set_sh(1.0);
18001        let b = _mm_set_sh(2.0);
18002        let r = _mm_sub_sh(a, b);
18003        let e = _mm_set_sh(-1.0);
18004        assert_eq_m128h(r, e);
18005    }
18006
18007    #[simd_test(enable = "avx512fp16")]
18008    unsafe fn test_mm_mask_sub_sh() {
18009        let a = _mm_set_sh(1.0);
18010        let b = _mm_set_sh(2.0);
18011        let src = _mm_set_sh(4.0);
18012        let r = _mm_mask_sub_sh(src, 0, a, b);
18013        let e = _mm_set_sh(4.0);
18014        assert_eq_m128h(r, e);
18015        let r = _mm_mask_sub_sh(src, 1, a, b);
18016        let e = _mm_set_sh(-1.0);
18017        assert_eq_m128h(r, e);
18018    }
18019
18020    #[simd_test(enable = "avx512fp16")]
18021    unsafe fn test_mm_maskz_sub_sh() {
18022        let a = _mm_set_sh(1.0);
18023        let b = _mm_set_sh(2.0);
18024        let r = _mm_maskz_sub_sh(0, a, b);
18025        let e = _mm_set_sh(0.0);
18026        assert_eq_m128h(r, e);
18027        let r = _mm_maskz_sub_sh(1, a, b);
18028        let e = _mm_set_sh(-1.0);
18029        assert_eq_m128h(r, e);
18030    }
18031
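    // Multiplication expected values: binary16 carries an 11-bit significand,
    // so every integer product in these tests (the largest is 16 * 17 = 272)
    // is exactly representable and no rounding error needs to be tolerated.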
18032    #[simd_test(enable = "avx512fp16,avx512vl")]
18033    unsafe fn test_mm_mul_ph() {
18034        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18035        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18036        let r = _mm_mul_ph(a, b);
18037        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18038        assert_eq_m128h(r, e);
18039    }
18040
18041    #[simd_test(enable = "avx512fp16,avx512vl")]
18042    unsafe fn test_mm_mask_mul_ph() {
18043        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18044        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18045        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18046        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18047        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18048        assert_eq_m128h(r, e);
18049    }
18050
18051    #[simd_test(enable = "avx512fp16,avx512vl")]
18052    unsafe fn test_mm_maskz_mul_ph() {
18053        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18054        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18055        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18056        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18057        assert_eq_m128h(r, e);
18058    }
18059
18060    #[simd_test(enable = "avx512fp16,avx512vl")]
18061    unsafe fn test_mm256_mul_ph() {
18062        let a = _mm256_set_ph(
18063            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18064        );
18065        let b = _mm256_set_ph(
18066            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18067        );
18068        let r = _mm256_mul_ph(a, b);
18069        let e = _mm256_set_ph(
18070            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18071            30.0, 16.0,
18072        );
18073        assert_eq_m256h(r, e);
18074    }
18075
18076    #[simd_test(enable = "avx512fp16,avx512vl")]
18077    unsafe fn test_mm256_mask_mul_ph() {
18078        let a = _mm256_set_ph(
18079            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18080        );
18081        let b = _mm256_set_ph(
18082            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18083        );
18084        let src = _mm256_set_ph(
18085            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18086        );
18087        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18088        let e = _mm256_set_ph(
18089            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18090        );
18091        assert_eq_m256h(r, e);
18092    }
18093
18094    #[simd_test(enable = "avx512fp16,avx512vl")]
18095    unsafe fn test_mm256_maskz_mul_ph() {
18096        let a = _mm256_set_ph(
18097            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18098        );
18099        let b = _mm256_set_ph(
18100            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18101        );
18102        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18103        let e = _mm256_set_ph(
18104            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18105        );
18106        assert_eq_m256h(r, e);
18107    }
18108
18109    #[simd_test(enable = "avx512fp16")]
18110    unsafe fn test_mm512_mul_ph() {
18111        let a = _mm512_set_ph(
18112            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18113            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18114            31.0, 32.0,
18115        );
18116        let b = _mm512_set_ph(
18117            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18118            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18119            3.0, 2.0, 1.0,
18120        );
18121        let r = _mm512_mul_ph(a, b);
18122        let e = _mm512_set_ph(
18123            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18124            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18125            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18126        );
18127        assert_eq_m512h(r, e);
18128    }
18129
18130    #[simd_test(enable = "avx512fp16")]
18131    unsafe fn test_mm512_mask_mul_ph() {
18132        let a = _mm512_set_ph(
18133            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18134            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18135            31.0, 32.0,
18136        );
18137        let b = _mm512_set_ph(
18138            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18139            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18140            3.0, 2.0, 1.0,
18141        );
18142        let src = _mm512_set_ph(
18143            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18144            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18145        );
18146        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18147        let e = _mm512_set_ph(
18148            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18149            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18150        );
18151        assert_eq_m512h(r, e);
18152    }
18153
18154    #[simd_test(enable = "avx512fp16")]
18155    unsafe fn test_mm512_maskz_mul_ph() {
18156        let a = _mm512_set_ph(
18157            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18158            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18159            31.0, 32.0,
18160        );
18161        let b = _mm512_set_ph(
18162            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18163            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18164            3.0, 2.0, 1.0,
18165        );
18166        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18167        let e = _mm512_set_ph(
18168            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18169            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18170        );
18171        assert_eq_m512h(r, e);
18172    }
18173
18174    #[simd_test(enable = "avx512fp16")]
18175    unsafe fn test_mm512_mul_round_ph() {
18176        let a = _mm512_set_ph(
18177            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18178            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18179            31.0, 32.0,
18180        );
18181        let b = _mm512_set_ph(
18182            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18183            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18184            3.0, 2.0, 1.0,
18185        );
18186        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18187        let e = _mm512_set_ph(
18188            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18189            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18190            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18191        );
18192        assert_eq_m512h(r, e);
18193    }
18194
18195    #[simd_test(enable = "avx512fp16")]
18196    unsafe fn test_mm512_mask_mul_round_ph() {
18197        let a = _mm512_set_ph(
18198            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18199            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18200            31.0, 32.0,
18201        );
18202        let b = _mm512_set_ph(
18203            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18204            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18205            3.0, 2.0, 1.0,
18206        );
18207        let src = _mm512_set_ph(
18208            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18209            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18210        );
18211        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18212            src,
18213            0b01010101010101010101010101010101,
18214            a,
18215            b,
18216        );
18217        let e = _mm512_set_ph(
18218            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18219            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18220        );
18221        assert_eq_m512h(r, e);
18222    }
18223
18224    #[simd_test(enable = "avx512fp16")]
18225    unsafe fn test_mm512_maskz_mul_round_ph() {
18226        let a = _mm512_set_ph(
18227            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18228            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18229            31.0, 32.0,
18230        );
18231        let b = _mm512_set_ph(
18232            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18233            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18234            3.0, 2.0, 1.0,
18235        );
18236        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18237            0b01010101010101010101010101010101,
18238            a,
18239            b,
18240        );
18241        let e = _mm512_set_ph(
18242            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18243            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18244        );
18245        assert_eq_m512h(r, e);
18246    }
18247
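    // The scalar `_sh` intrinsics tested below operate on lane 0 only; per Intel's
    // documentation the upper seven f16 lanes of the result are copied from `a`.
    // Since the operands are built with `_mm_set_sh`, which zeroes those upper
    // lanes, the copy-through is not directly visible in the expected values.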
18248    #[simd_test(enable = "avx512fp16")]
18249    unsafe fn test_mm_mul_round_sh() {
18250        let a = _mm_set_sh(1.0);
18251        let b = _mm_set_sh(2.0);
18252        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18253        let e = _mm_set_sh(2.0);
18254        assert_eq_m128h(r, e);
18255    }
18256
18257    #[simd_test(enable = "avx512fp16")]
18258    unsafe fn test_mm_mask_mul_round_sh() {
18259        let a = _mm_set_sh(1.0);
18260        let b = _mm_set_sh(2.0);
18261        let src = _mm_set_sh(4.0);
18262        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18263            src, 0, a, b,
18264        );
18265        let e = _mm_set_sh(4.0);
18266        assert_eq_m128h(r, e);
18267        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18268            src, 1, a, b,
18269        );
18270        let e = _mm_set_sh(2.0);
18271        assert_eq_m128h(r, e);
18272    }
18273
18274    #[simd_test(enable = "avx512fp16")]
18275    unsafe fn test_mm_maskz_mul_round_sh() {
18276        let a = _mm_set_sh(1.0);
18277        let b = _mm_set_sh(2.0);
18278        let r =
18279            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18280        let e = _mm_set_sh(0.0);
18281        assert_eq_m128h(r, e);
18282        let r =
18283            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18284        let e = _mm_set_sh(2.0);
18285        assert_eq_m128h(r, e);
18286    }
18287
18288    #[simd_test(enable = "avx512fp16")]
18289    unsafe fn test_mm_mul_sh() {
18290        let a = _mm_set_sh(1.0);
18291        let b = _mm_set_sh(2.0);
18292        let r = _mm_mul_sh(a, b);
18293        let e = _mm_set_sh(2.0);
18294        assert_eq_m128h(r, e);
18295    }
18296
18297    #[simd_test(enable = "avx512fp16")]
18298    unsafe fn test_mm_mask_mul_sh() {
18299        let a = _mm_set_sh(1.0);
18300        let b = _mm_set_sh(2.0);
18301        let src = _mm_set_sh(4.0);
18302        let r = _mm_mask_mul_sh(src, 0, a, b);
18303        let e = _mm_set_sh(4.0);
18304        assert_eq_m128h(r, e);
18305        let r = _mm_mask_mul_sh(src, 1, a, b);
18306        let e = _mm_set_sh(2.0);
18307        assert_eq_m128h(r, e);
18308    }
18309
18310    #[simd_test(enable = "avx512fp16")]
18311    unsafe fn test_mm_maskz_mul_sh() {
18312        let a = _mm_set_sh(1.0);
18313        let b = _mm_set_sh(2.0);
18314        let r = _mm_maskz_mul_sh(0, a, b);
18315        let e = _mm_set_sh(0.0);
18316        assert_eq_m128h(r, e);
18317        let r = _mm_maskz_mul_sh(1, a, b);
18318        let e = _mm_set_sh(2.0);
18319        assert_eq_m128h(r, e);
18320    }
18321
18322    #[simd_test(enable = "avx512fp16,avx512vl")]
18323    unsafe fn test_mm_div_ph() {
18324        let a = _mm_set1_ph(1.0);
18325        let b = _mm_set1_ph(2.0);
18326        let r = _mm_div_ph(a, b);
18327        let e = _mm_set1_ph(0.5);
18328        assert_eq_m128h(r, e);
18329    }
18330
18331    #[simd_test(enable = "avx512fp16,avx512vl")]
18332    unsafe fn test_mm_mask_div_ph() {
18333        let a = _mm_set1_ph(1.0);
18334        let b = _mm_set1_ph(2.0);
18335        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18336        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18337        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18338        assert_eq_m128h(r, e);
18339    }
18340
18341    #[simd_test(enable = "avx512fp16,avx512vl")]
18342    unsafe fn test_mm_maskz_div_ph() {
18343        let a = _mm_set1_ph(1.0);
18344        let b = _mm_set1_ph(2.0);
18345        let r = _mm_maskz_div_ph(0b01010101, a, b);
18346        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18347        assert_eq_m128h(r, e);
18348    }
18349
18350    #[simd_test(enable = "avx512fp16,avx512vl")]
18351    unsafe fn test_mm256_div_ph() {
18352        let a = _mm256_set1_ph(1.0);
18353        let b = _mm256_set1_ph(2.0);
18354        let r = _mm256_div_ph(a, b);
18355        let e = _mm256_set1_ph(0.5);
18356        assert_eq_m256h(r, e);
18357    }
18358
18359    #[simd_test(enable = "avx512fp16,avx512vl")]
18360    unsafe fn test_mm256_mask_div_ph() {
18361        let a = _mm256_set1_ph(1.0);
18362        let b = _mm256_set1_ph(2.0);
18363        let src = _mm256_set_ph(
18364            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18365            19.0,
18366        );
18367        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
18368        let e = _mm256_set_ph(
18369            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18370        );
18371        assert_eq_m256h(r, e);
18372    }
18373
18374    #[simd_test(enable = "avx512fp16,avx512vl")]
18375    unsafe fn test_mm256_maskz_div_ph() {
18376        let a = _mm256_set1_ph(1.0);
18377        let b = _mm256_set1_ph(2.0);
18378        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
18379        let e = _mm256_set_ph(
18380            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18381        );
18382        assert_eq_m256h(r, e);
18383    }
18384
18385    #[simd_test(enable = "avx512fp16")]
18386    unsafe fn test_mm512_div_ph() {
18387        let a = _mm512_set1_ph(1.0);
18388        let b = _mm512_set1_ph(2.0);
18389        let r = _mm512_div_ph(a, b);
18390        let e = _mm512_set1_ph(0.5);
18391        assert_eq_m512h(r, e);
18392    }
18393
18394    #[simd_test(enable = "avx512fp16")]
18395    unsafe fn test_mm512_mask_div_ph() {
18396        let a = _mm512_set1_ph(1.0);
18397        let b = _mm512_set1_ph(2.0);
18398        let src = _mm512_set_ph(
18399            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18400            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18401            33.0, 34.0, 35.0,
18402        );
18403        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
18404        let e = _mm512_set_ph(
18405            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18406            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18407        );
18408        assert_eq_m512h(r, e);
18409    }
18410
18411    #[simd_test(enable = "avx512fp16")]
18412    unsafe fn test_mm512_maskz_div_ph() {
18413        let a = _mm512_set1_ph(1.0);
18414        let b = _mm512_set1_ph(2.0);
18415        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
18416        let e = _mm512_set_ph(
18417            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18418            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18419        );
18420        assert_eq_m512h(r, e);
18421    }
18422
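    // The `_round` variants below pass `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`,
    // i.e. round-to-nearest-even with floating-point exceptions suppressed (SAE),
    // so they are expected to match the results of the non-`_round` forms above.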
18423    #[simd_test(enable = "avx512fp16")]
18424    unsafe fn test_mm512_div_round_ph() {
18425        let a = _mm512_set1_ph(1.0);
18426        let b = _mm512_set1_ph(2.0);
18427        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18428        let e = _mm512_set1_ph(0.5);
18429        assert_eq_m512h(r, e);
18430    }
18431
18432    #[simd_test(enable = "avx512fp16")]
18433    unsafe fn test_mm512_mask_div_round_ph() {
18434        let a = _mm512_set1_ph(1.0);
18435        let b = _mm512_set1_ph(2.0);
18436        let src = _mm512_set_ph(
18437            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18438            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18439            33.0, 34.0, 35.0,
18440        );
18441        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18442            src,
18443            0b01010101010101010101010101010101,
18444            a,
18445            b,
18446        );
18447        let e = _mm512_set_ph(
18448            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18449            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18450        );
18451        assert_eq_m512h(r, e);
18452    }
18453
18454    #[simd_test(enable = "avx512fp16")]
18455    unsafe fn test_mm512_maskz_div_round_ph() {
18456        let a = _mm512_set1_ph(1.0);
18457        let b = _mm512_set1_ph(2.0);
18458        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18459            0b01010101010101010101010101010101,
18460            a,
18461            b,
18462        );
18463        let e = _mm512_set_ph(
18464            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18465            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18466        );
18467        assert_eq_m512h(r, e);
18468    }
18469
18470    #[simd_test(enable = "avx512fp16")]
18471    unsafe fn test_mm_div_round_sh() {
18472        let a = _mm_set_sh(1.0);
18473        let b = _mm_set_sh(2.0);
18474        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18475        let e = _mm_set_sh(0.5);
18476        assert_eq_m128h(r, e);
18477    }
18478
18479    #[simd_test(enable = "avx512fp16")]
18480    unsafe fn test_mm_mask_div_round_sh() {
18481        let a = _mm_set_sh(1.0);
18482        let b = _mm_set_sh(2.0);
18483        let src = _mm_set_sh(4.0);
18484        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18485            src, 0, a, b,
18486        );
18487        let e = _mm_set_sh(4.0);
18488        assert_eq_m128h(r, e);
18489        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18490            src, 1, a, b,
18491        );
18492        let e = _mm_set_sh(0.5);
18493        assert_eq_m128h(r, e);
18494    }
18495
18496    #[simd_test(enable = "avx512fp16")]
18497    unsafe fn test_mm_maskz_div_round_sh() {
18498        let a = _mm_set_sh(1.0);
18499        let b = _mm_set_sh(2.0);
18500        let r =
18501            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18502        let e = _mm_set_sh(0.0);
18503        assert_eq_m128h(r, e);
18504        let r =
18505            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18506        let e = _mm_set_sh(0.5);
18507        assert_eq_m128h(r, e);
18508    }
18509
18510    #[simd_test(enable = "avx512fp16")]
18511    unsafe fn test_mm_div_sh() {
18512        let a = _mm_set_sh(1.0);
18513        let b = _mm_set_sh(2.0);
18514        let r = _mm_div_sh(a, b);
18515        let e = _mm_set_sh(0.5);
18516        assert_eq_m128h(r, e);
18517    }
18518
18519    #[simd_test(enable = "avx512fp16")]
18520    unsafe fn test_mm_mask_div_sh() {
18521        let a = _mm_set_sh(1.0);
18522        let b = _mm_set_sh(2.0);
18523        let src = _mm_set_sh(4.0);
18524        let r = _mm_mask_div_sh(src, 0, a, b);
18525        let e = _mm_set_sh(4.0);
18526        assert_eq_m128h(r, e);
18527        let r = _mm_mask_div_sh(src, 1, a, b);
18528        let e = _mm_set_sh(0.5);
18529        assert_eq_m128h(r, e);
18530    }
18531
18532    #[simd_test(enable = "avx512fp16")]
18533    unsafe fn test_mm_maskz_div_sh() {
18534        let a = _mm_set_sh(1.0);
18535        let b = _mm_set_sh(2.0);
18536        let r = _mm_maskz_div_sh(0, a, b);
18537        let e = _mm_set_sh(0.0);
18538        assert_eq_m128h(r, e);
18539        let r = _mm_maskz_div_sh(1, a, b);
18540        let e = _mm_set_sh(0.5);
18541        assert_eq_m128h(r, e);
18542    }
18543
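    // The `_pch` intrinsics tested below treat each pair of adjacent f16 lanes as one
    // complex number: the even lane holds the real part and the odd lane the imaginary
    // part, so a __m128h carries 4 complex values and the mask has one bit per pair.
    // These tests multiply (0 + 1i) by (0 + 1i), which is -1 + 0i.
    //
    // A minimal scalar reference for that product, for illustration only (this helper
    // is not part of the module's API and is never called by the tests):
    #[allow(dead_code)]
    fn complex_mul_ref(a: (f32, f32), b: (f32, f32)) -> (f32, f32) {
        // (ar + ai*i) * (br + bi*i) = (ar*br - ai*bi) + (ar*bi + ai*br)*i
        (a.0 * b.0 - a.1 * b.1, a.0 * b.1 + a.1 * b.0)
    }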
18544    #[simd_test(enable = "avx512fp16,avx512vl")]
18545    unsafe fn test_mm_mul_pch() {
18546        let a = _mm_set1_pch(0.0, 1.0);
18547        let b = _mm_set1_pch(0.0, 1.0);
18548        let r = _mm_mul_pch(a, b);
18549        let e = _mm_set1_pch(-1.0, 0.0);
18550        assert_eq_m128h(r, e);
18551    }
18552
18553    #[simd_test(enable = "avx512fp16,avx512vl")]
18554    unsafe fn test_mm_mask_mul_pch() {
18555        let a = _mm_set1_pch(0.0, 1.0);
18556        let b = _mm_set1_pch(0.0, 1.0);
18557        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18558        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
18559        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18560        assert_eq_m128h(r, e);
18561    }
18562
18563    #[simd_test(enable = "avx512fp16,avx512vl")]
18564    unsafe fn test_mm_maskz_mul_pch() {
18565        let a = _mm_set1_pch(0.0, 1.0);
18566        let b = _mm_set1_pch(0.0, 1.0);
18567        let r = _mm_maskz_mul_pch(0b0101, a, b);
18568        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18569        assert_eq_m128h(r, e);
18570    }
18571
18572    #[simd_test(enable = "avx512fp16,avx512vl")]
18573    unsafe fn test_mm256_mul_pch() {
18574        let a = _mm256_set1_pch(0.0, 1.0);
18575        let b = _mm256_set1_pch(0.0, 1.0);
18576        let r = _mm256_mul_pch(a, b);
18577        let e = _mm256_set1_pch(-1.0, 0.0);
18578        assert_eq_m256h(r, e);
18579    }
18580
18581    #[simd_test(enable = "avx512fp16,avx512vl")]
18582    unsafe fn test_mm256_mask_mul_pch() {
18583        let a = _mm256_set1_pch(0.0, 1.0);
18584        let b = _mm256_set1_pch(0.0, 1.0);
18585        let src = _mm256_setr_ph(
18586            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18587        );
18588        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
18589        let e = _mm256_setr_ph(
18590            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18591        );
18592        assert_eq_m256h(r, e);
18593    }
18594
18595    #[simd_test(enable = "avx512fp16,avx512vl")]
18596    unsafe fn test_mm256_maskz_mul_pch() {
18597        let a = _mm256_set1_pch(0.0, 1.0);
18598        let b = _mm256_set1_pch(0.0, 1.0);
18599        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
18600        let e = _mm256_setr_ph(
18601            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18602        );
18603        assert_eq_m256h(r, e);
18604    }
18605
18606    #[simd_test(enable = "avx512fp16")]
18607    unsafe fn test_mm512_mul_pch() {
18608        let a = _mm512_set1_pch(0.0, 1.0);
18609        let b = _mm512_set1_pch(0.0, 1.0);
18610        let r = _mm512_mul_pch(a, b);
18611        let e = _mm512_set1_pch(-1.0, 0.0);
18612        assert_eq_m512h(r, e);
18613    }
18614
18615    #[simd_test(enable = "avx512fp16")]
18616    unsafe fn test_mm512_mask_mul_pch() {
18617        let a = _mm512_set1_pch(0.0, 1.0);
18618        let b = _mm512_set1_pch(0.0, 1.0);
18619        let src = _mm512_setr_ph(
18620            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18621            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18622            32.0, 33.0,
18623        );
18624        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
18625        let e = _mm512_setr_ph(
18626            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18627            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18628            33.0,
18629        );
18630        assert_eq_m512h(r, e);
18631    }
18632
18633    #[simd_test(enable = "avx512fp16")]
18634    unsafe fn test_mm512_maskz_mul_pch() {
18635        let a = _mm512_set1_pch(0.0, 1.0);
18636        let b = _mm512_set1_pch(0.0, 1.0);
18637        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
18638        let e = _mm512_setr_ph(
18639            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18640            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18641        );
18642        assert_eq_m512h(r, e);
18643    }
18644
18645    #[simd_test(enable = "avx512fp16")]
18646    unsafe fn test_mm512_mul_round_pch() {
18647        let a = _mm512_set1_pch(0.0, 1.0);
18648        let b = _mm512_set1_pch(0.0, 1.0);
18649        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18650        let e = _mm512_set1_pch(-1.0, 0.0);
18651        assert_eq_m512h(r, e);
18652    }
18653
18654    #[simd_test(enable = "avx512fp16")]
18655    unsafe fn test_mm512_mask_mul_round_pch() {
18656        let a = _mm512_set1_pch(0.0, 1.0);
18657        let b = _mm512_set1_pch(0.0, 1.0);
18658        let src = _mm512_setr_ph(
18659            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18660            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18661            32.0, 33.0,
18662        );
18663        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18664            src,
18665            0b0101010101010101,
18666            a,
18667            b,
18668        );
18669        let e = _mm512_setr_ph(
18670            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18671            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18672            33.0,
18673        );
18674        assert_eq_m512h(r, e);
18675    }
18676
18677    #[simd_test(enable = "avx512fp16")]
18678    unsafe fn test_mm512_maskz_mul_round_pch() {
18679        let a = _mm512_set1_pch(0.0, 1.0);
18680        let b = _mm512_set1_pch(0.0, 1.0);
18681        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18682            0b0101010101010101,
18683            a,
18684            b,
18685        );
18686        let e = _mm512_setr_ph(
18687            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18688            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18689        );
18690        assert_eq_m512h(r, e);
18691    }
18692
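    // The scalar complex `_sch` intrinsics compute only the first complex pair
    // (lanes 0 and 1); lanes 2..7 of the result are copied from `a`. When mask bit 0
    // is clear, the first pair instead comes from `src` (`_mask_` variants) or is
    // zeroed (`_maskz_` variants), as the expected vectors below show.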
18693    #[simd_test(enable = "avx512fp16")]
18694    unsafe fn test_mm_mul_round_sch() {
18695        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18696        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18697        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18698        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18699        assert_eq_m128h(r, e);
18700    }
18701
18702    #[simd_test(enable = "avx512fp16")]
18703    unsafe fn test_mm_mask_mul_round_sch() {
18704        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18705        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18706        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18707        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18708            src, 0, a, b,
18709        );
18710        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18711        assert_eq_m128h(r, e);
18712    }
18713
18714    #[simd_test(enable = "avx512fp16")]
18715    unsafe fn test_mm_maskz_mul_round_sch() {
18716        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18717        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18718        let r =
18719            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18720        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18721        assert_eq_m128h(r, e);
18722    }
18723
18724    #[simd_test(enable = "avx512fp16")]
18725    unsafe fn test_mm_mul_sch() {
18726        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18727        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18728        let r = _mm_mul_sch(a, b);
18729        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18730        assert_eq_m128h(r, e);
18731    }
18732
18733    #[simd_test(enable = "avx512fp16")]
18734    unsafe fn test_mm_mask_mul_sch() {
18735        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18736        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18737        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18738        let r = _mm_mask_mul_sch(src, 0, a, b);
18739        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18740        assert_eq_m128h(r, e);
18741    }
18742
18743    #[simd_test(enable = "avx512fp16")]
18744    unsafe fn test_mm_maskz_mul_sch() {
18745        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18746        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18747        let r = _mm_maskz_mul_sch(0, a, b);
18748        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18749        assert_eq_m128h(r, e);
18750    }
18751
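    // `_fmul_pch` is documented as performing the same complex multiplication as
    // `_mul_pch`, so the tests below intentionally mirror the `mul_pch` tests above
    // with identical inputs and expected results.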
18752    #[simd_test(enable = "avx512fp16,avx512vl")]
18753    unsafe fn test_mm_fmul_pch() {
18754        let a = _mm_set1_pch(0.0, 1.0);
18755        let b = _mm_set1_pch(0.0, 1.0);
18756        let r = _mm_fmul_pch(a, b);
18757        let e = _mm_set1_pch(-1.0, 0.0);
18758        assert_eq_m128h(r, e);
18759    }
18760
18761    #[simd_test(enable = "avx512fp16,avx512vl")]
18762    unsafe fn test_mm_mask_fmul_pch() {
18763        let a = _mm_set1_pch(0.0, 1.0);
18764        let b = _mm_set1_pch(0.0, 1.0);
18765        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18766        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
18767        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18768        assert_eq_m128h(r, e);
18769    }
18770
18771    #[simd_test(enable = "avx512fp16,avx512vl")]
18772    unsafe fn test_mm_maskz_fmul_pch() {
18773        let a = _mm_set1_pch(0.0, 1.0);
18774        let b = _mm_set1_pch(0.0, 1.0);
18775        let r = _mm_maskz_fmul_pch(0b0101, a, b);
18776        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18777        assert_eq_m128h(r, e);
18778    }
18779
18780    #[simd_test(enable = "avx512fp16,avx512vl")]
18781    unsafe fn test_mm256_fmul_pch() {
18782        let a = _mm256_set1_pch(0.0, 1.0);
18783        let b = _mm256_set1_pch(0.0, 1.0);
18784        let r = _mm256_fmul_pch(a, b);
18785        let e = _mm256_set1_pch(-1.0, 0.0);
18786        assert_eq_m256h(r, e);
18787    }
18788
18789    #[simd_test(enable = "avx512fp16,avx512vl")]
18790    unsafe fn test_mm256_mask_fmul_pch() {
18791        let a = _mm256_set1_pch(0.0, 1.0);
18792        let b = _mm256_set1_pch(0.0, 1.0);
18793        let src = _mm256_setr_ph(
18794            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18795        );
18796        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
18797        let e = _mm256_setr_ph(
18798            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18799        );
18800        assert_eq_m256h(r, e);
18801    }
18802
18803    #[simd_test(enable = "avx512fp16,avx512vl")]
18804    unsafe fn test_mm256_maskz_fmul_pch() {
18805        let a = _mm256_set1_pch(0.0, 1.0);
18806        let b = _mm256_set1_pch(0.0, 1.0);
18807        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
18808        let e = _mm256_setr_ph(
18809            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18810        );
18811        assert_eq_m256h(r, e);
18812    }
18813
18814    #[simd_test(enable = "avx512fp16")]
18815    unsafe fn test_mm512_fmul_pch() {
18816        let a = _mm512_set1_pch(0.0, 1.0);
18817        let b = _mm512_set1_pch(0.0, 1.0);
18818        let r = _mm512_fmul_pch(a, b);
18819        let e = _mm512_set1_pch(-1.0, 0.0);
18820        assert_eq_m512h(r, e);
18821    }
18822
18823    #[simd_test(enable = "avx512fp16")]
18824    unsafe fn test_mm512_mask_fmul_pch() {
18825        let a = _mm512_set1_pch(0.0, 1.0);
18826        let b = _mm512_set1_pch(0.0, 1.0);
18827        let src = _mm512_setr_ph(
18828            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18829            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18830            32.0, 33.0,
18831        );
18832        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
18833        let e = _mm512_setr_ph(
18834            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18835            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18836            33.0,
18837        );
18838        assert_eq_m512h(r, e);
18839    }
18840
18841    #[simd_test(enable = "avx512fp16")]
18842    unsafe fn test_mm512_maskz_fmul_pch() {
18843        let a = _mm512_set1_pch(0.0, 1.0);
18844        let b = _mm512_set1_pch(0.0, 1.0);
18845        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
18846        let e = _mm512_setr_ph(
18847            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18848            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18849        );
18850        assert_eq_m512h(r, e);
18851    }
18852
18853    #[simd_test(enable = "avx512fp16")]
18854    unsafe fn test_mm512_fmul_round_pch() {
18855        let a = _mm512_set1_pch(0.0, 1.0);
18856        let b = _mm512_set1_pch(0.0, 1.0);
18857        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18858        let e = _mm512_set1_pch(-1.0, 0.0);
18859        assert_eq_m512h(r, e);
18860    }
18861
18862    #[simd_test(enable = "avx512fp16")]
18863    unsafe fn test_mm512_mask_fmul_round_pch() {
18864        let a = _mm512_set1_pch(0.0, 1.0);
18865        let b = _mm512_set1_pch(0.0, 1.0);
18866        let src = _mm512_setr_ph(
18867            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18868            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18869            32.0, 33.0,
18870        );
18871        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18872            src,
18873            0b0101010101010101,
18874            a,
18875            b,
18876        );
18877        let e = _mm512_setr_ph(
18878            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18879            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18880            33.0,
18881        );
18882        assert_eq_m512h(r, e);
18883    }
18884
18885    #[simd_test(enable = "avx512fp16")]
18886    unsafe fn test_mm512_maskz_fmul_round_pch() {
18887        let a = _mm512_set1_pch(0.0, 1.0);
18888        let b = _mm512_set1_pch(0.0, 1.0);
18889        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18890            0b0101010101010101,
18891            a,
18892            b,
18893        );
18894        let e = _mm512_setr_ph(
18895            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18896            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18897        );
18898        assert_eq_m512h(r, e);
18899    }
18900
18901    #[simd_test(enable = "avx512fp16")]
18902    unsafe fn test_mm_fmul_round_sch() {
18903        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18904        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18905        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18906        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18907        assert_eq_m128h(r, e);
18908    }
18909
18910    #[simd_test(enable = "avx512fp16")]
18911    unsafe fn test_mm_mask_fmul_round_sch() {
18912        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18913        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18914        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18915        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18916            src, 0, a, b,
18917        );
18918        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18919        assert_eq_m128h(r, e);
18920    }
18921
18922    #[simd_test(enable = "avx512fp16")]
18923    unsafe fn test_mm_maskz_fmul_round_sch() {
18924        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18925        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18926        let r =
18927            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18928        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18929        assert_eq_m128h(r, e);
18930    }
18931
18932    #[simd_test(enable = "avx512fp16")]
18933    unsafe fn test_mm_fmul_sch() {
18934        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18935        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18936        let r = _mm_fmul_sch(a, b);
18937        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18938        assert_eq_m128h(r, e);
18939    }
18940
18941    #[simd_test(enable = "avx512fp16")]
18942    unsafe fn test_mm_mask_fmul_sch() {
18943        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18944        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18945        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18946        let r = _mm_mask_fmul_sch(src, 0, a, b);
18947        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18948        assert_eq_m128h(r, e);
18949    }
18950
18951    #[simd_test(enable = "avx512fp16")]
18952    unsafe fn test_mm_maskz_fmul_sch() {
18953        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18954        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18955        let r = _mm_maskz_fmul_sch(0, a, b);
18956        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18957        assert_eq_m128h(r, e);
18958    }
18959
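    // `_cmul_pch` multiplies each complex element of `a` by the complex conjugate of
    // the corresponding element of `b`. Here a = 0 + 1i and b = 0 - 1i, so
    // a * conj(b) = (0 + 1i) * (0 + 1i) = -1 + 0i, matching the expected vectors.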
18960    #[simd_test(enable = "avx512fp16,avx512vl")]
18961    unsafe fn test_mm_cmul_pch() {
18962        let a = _mm_set1_pch(0.0, 1.0);
18963        let b = _mm_set1_pch(0.0, -1.0);
18964        let r = _mm_cmul_pch(a, b);
18965        let e = _mm_set1_pch(-1.0, 0.0);
18966        assert_eq_m128h(r, e);
18967    }
18968
18969    #[simd_test(enable = "avx512fp16,avx512vl")]
18970    unsafe fn test_mm_mask_cmul_pch() {
18971        let a = _mm_set1_pch(0.0, 1.0);
18972        let b = _mm_set1_pch(0.0, -1.0);
18973        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18974        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
18975        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18976        assert_eq_m128h(r, e);
18977    }
18978
18979    #[simd_test(enable = "avx512fp16,avx512vl")]
18980    unsafe fn test_mm_maskz_cmul_pch() {
18981        let a = _mm_set1_pch(0.0, 1.0);
18982        let b = _mm_set1_pch(0.0, -1.0);
18983        let r = _mm_maskz_cmul_pch(0b0101, a, b);
18984        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18985        assert_eq_m128h(r, e);
18986    }
18987
18988    #[simd_test(enable = "avx512fp16,avx512vl")]
18989    unsafe fn test_mm256_cmul_pch() {
18990        let a = _mm256_set1_pch(0.0, 1.0);
18991        let b = _mm256_set1_pch(0.0, -1.0);
18992        let r = _mm256_cmul_pch(a, b);
18993        let e = _mm256_set1_pch(-1.0, 0.0);
18994        assert_eq_m256h(r, e);
18995    }
18996
18997    #[simd_test(enable = "avx512fp16,avx512vl")]
18998    unsafe fn test_mm256_mask_cmul_pch() {
18999        let a = _mm256_set1_pch(0.0, 1.0);
19000        let b = _mm256_set1_pch(0.0, -1.0);
19001        let src = _mm256_setr_ph(
19002            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19003        );
19004        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
19005        let e = _mm256_setr_ph(
19006            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19007        );
19008        assert_eq_m256h(r, e);
19009    }
19010
19011    #[simd_test(enable = "avx512fp16,avx512vl")]
19012    unsafe fn test_mm256_maskz_cmul_pch() {
19013        let a = _mm256_set1_pch(0.0, 1.0);
19014        let b = _mm256_set1_pch(0.0, -1.0);
19015        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
19016        let e = _mm256_setr_ph(
19017            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19018        );
19019        assert_eq_m256h(r, e);
19020    }
19021
19022    #[simd_test(enable = "avx512fp16")]
19023    unsafe fn test_mm512_cmul_pch() {
19024        let a = _mm512_set1_pch(0.0, 1.0);
19025        let b = _mm512_set1_pch(0.0, -1.0);
19026        let r = _mm512_cmul_pch(a, b);
19027        let e = _mm512_set1_pch(-1.0, 0.0);
19028        assert_eq_m512h(r, e);
19029    }
19030
19031    #[simd_test(enable = "avx512fp16")]
19032    unsafe fn test_mm512_mask_cmul_pch() {
19033        let a = _mm512_set1_pch(0.0, 1.0);
19034        let b = _mm512_set1_pch(0.0, -1.0);
19035        let src = _mm512_setr_ph(
19036            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19037            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19038            32.0, 33.0,
19039        );
19040        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
19041        let e = _mm512_setr_ph(
19042            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19043            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19044            33.0,
19045        );
19046        assert_eq_m512h(r, e);
19047    }
19048
19049    #[simd_test(enable = "avx512fp16")]
19050    unsafe fn test_mm512_maskz_cmul_pch() {
19051        let a = _mm512_set1_pch(0.0, 1.0);
19052        let b = _mm512_set1_pch(0.0, -1.0);
19053        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
19054        let e = _mm512_setr_ph(
19055            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19056            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19057        );
19058        assert_eq_m512h(r, e);
19059    }
19060
19061    #[simd_test(enable = "avx512fp16")]
19062    unsafe fn test_mm512_cmul_round_pch() {
19063        let a = _mm512_set1_pch(0.0, 1.0);
19064        let b = _mm512_set1_pch(0.0, -1.0);
19065        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19066        let e = _mm512_set1_pch(-1.0, 0.0);
19067        assert_eq_m512h(r, e);
19068    }
19069
19070    #[simd_test(enable = "avx512fp16")]
19071    unsafe fn test_mm512_mask_cmul_round_pch() {
19072        let a = _mm512_set1_pch(0.0, 1.0);
19073        let b = _mm512_set1_pch(0.0, -1.0);
19074        let src = _mm512_setr_ph(
19075            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19076            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19077            32.0, 33.0,
19078        );
19079        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19080            src,
19081            0b0101010101010101,
19082            a,
19083            b,
19084        );
19085        let e = _mm512_setr_ph(
19086            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19087            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19088            33.0,
19089        );
19090        assert_eq_m512h(r, e);
19091    }
19092
19093    #[simd_test(enable = "avx512fp16")]
19094    unsafe fn test_mm512_maskz_cmul_round_pch() {
19095        let a = _mm512_set1_pch(0.0, 1.0);
19096        let b = _mm512_set1_pch(0.0, -1.0);
19097        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19098            0b0101010101010101,
19099            a,
19100            b,
19101        );
19102        let e = _mm512_setr_ph(
19103            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19104            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19105        );
19106        assert_eq_m512h(r, e);
19107    }
19108
19109    #[simd_test(enable = "avx512fp16")]
19110    unsafe fn test_mm_cmul_sch() {
19111        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19112        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19113        let r = _mm_cmul_sch(a, b);
19114        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19115        assert_eq_m128h(r, e);
19116    }
19117
19118    #[simd_test(enable = "avx512fp16")]
19119    unsafe fn test_mm_mask_cmul_sch() {
19120        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19121        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19122        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19123        let r = _mm_mask_cmul_sch(src, 0, a, b);
19124        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19125        assert_eq_m128h(r, e);
19126    }
19127
19128    #[simd_test(enable = "avx512fp16")]
19129    unsafe fn test_mm_maskz_cmul_sch() {
19130        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19131        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19132        let r = _mm_maskz_cmul_sch(0, a, b);
19133        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19134        assert_eq_m128h(r, e);
19135    }
19136
19137    #[simd_test(enable = "avx512fp16")]
19138    unsafe fn test_mm_cmul_round_sch() {
19139        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19140        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19141        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19142        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19143        assert_eq_m128h(r, e);
19144    }
19145
19146    #[simd_test(enable = "avx512fp16")]
19147    unsafe fn test_mm_mask_cmul_round_sch() {
19148        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19149        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19150        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19151        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19152            src, 0, a, b,
19153        );
19154        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19155        assert_eq_m128h(r, e);
19156    }
19157
19158    #[simd_test(enable = "avx512fp16")]
19159    unsafe fn test_mm_maskz_cmul_round_sch() {
19160        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19161        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19162        let r =
19163            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19164        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19165        assert_eq_m128h(r, e);
19166    }
19167
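    // As with `fmul`/`mul`, `_fcmul_pch` is documented as the same conjugate
    // multiplication as `_cmul_pch`, so these tests reuse the inputs and expectations
    // of the `cmul_pch` tests above.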
19168    #[simd_test(enable = "avx512fp16,avx512vl")]
19169    unsafe fn test_mm_fcmul_pch() {
19170        let a = _mm_set1_pch(0.0, 1.0);
19171        let b = _mm_set1_pch(0.0, -1.0);
19172        let r = _mm_fcmul_pch(a, b);
19173        let e = _mm_set1_pch(-1.0, 0.0);
19174        assert_eq_m128h(r, e);
19175    }
19176
19177    #[simd_test(enable = "avx512fp16,avx512vl")]
19178    unsafe fn test_mm_mask_fcmul_pch() {
19179        let a = _mm_set1_pch(0.0, 1.0);
19180        let b = _mm_set1_pch(0.0, -1.0);
19181        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19182        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19183        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19184        assert_eq_m128h(r, e);
19185    }
19186
19187    #[simd_test(enable = "avx512fp16,avx512vl")]
19188    unsafe fn test_mm_maskz_fcmul_pch() {
19189        let a = _mm_set1_pch(0.0, 1.0);
19190        let b = _mm_set1_pch(0.0, -1.0);
19191        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19192        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19193        assert_eq_m128h(r, e);
19194    }
19195
19196    #[simd_test(enable = "avx512fp16,avx512vl")]
19197    unsafe fn test_mm256_fcmul_pch() {
19198        let a = _mm256_set1_pch(0.0, 1.0);
19199        let b = _mm256_set1_pch(0.0, -1.0);
19200        let r = _mm256_fcmul_pch(a, b);
19201        let e = _mm256_set1_pch(-1.0, 0.0);
19202        assert_eq_m256h(r, e);
19203    }
19204
19205    #[simd_test(enable = "avx512fp16,avx512vl")]
19206    unsafe fn test_mm256_mask_fcmul_pch() {
19207        let a = _mm256_set1_pch(0.0, 1.0);
19208        let b = _mm256_set1_pch(0.0, -1.0);
19209        let src = _mm256_setr_ph(
19210            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19211        );
19212        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19213        let e = _mm256_setr_ph(
19214            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19215        );
19216        assert_eq_m256h(r, e);
19217    }
19218
19219    #[simd_test(enable = "avx512fp16,avx512vl")]
19220    unsafe fn test_mm256_maskz_fcmul_pch() {
19221        let a = _mm256_set1_pch(0.0, 1.0);
19222        let b = _mm256_set1_pch(0.0, -1.0);
19223        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19224        let e = _mm256_setr_ph(
19225            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19226        );
19227        assert_eq_m256h(r, e);
19228    }
19229
19230    #[simd_test(enable = "avx512fp16")]
19231    unsafe fn test_mm512_fcmul_pch() {
19232        let a = _mm512_set1_pch(0.0, 1.0);
19233        let b = _mm512_set1_pch(0.0, -1.0);
19234        let r = _mm512_fcmul_pch(a, b);
19235        let e = _mm512_set1_pch(-1.0, 0.0);
19236        assert_eq_m512h(r, e);
19237    }
19238
19239    #[simd_test(enable = "avx512fp16")]
19240    unsafe fn test_mm512_mask_fcmul_pch() {
19241        let a = _mm512_set1_pch(0.0, 1.0);
19242        let b = _mm512_set1_pch(0.0, -1.0);
19243        let src = _mm512_setr_ph(
19244            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19245            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19246            32.0, 33.0,
19247        );
19248        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19249        let e = _mm512_setr_ph(
19250            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19251            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19252            33.0,
19253        );
19254        assert_eq_m512h(r, e);
19255    }
19256
19257    #[simd_test(enable = "avx512fp16")]
19258    unsafe fn test_mm512_maskz_fcmul_pch() {
19259        let a = _mm512_set1_pch(0.0, 1.0);
19260        let b = _mm512_set1_pch(0.0, -1.0);
19261        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19262        let e = _mm512_setr_ph(
19263            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19264            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19265        );
19266        assert_eq_m512h(r, e);
19267    }
19268
19269    #[simd_test(enable = "avx512fp16")]
19270    unsafe fn test_mm512_fcmul_round_pch() {
19271        let a = _mm512_set1_pch(0.0, 1.0);
19272        let b = _mm512_set1_pch(0.0, -1.0);
19273        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19274        let e = _mm512_set1_pch(-1.0, 0.0);
19275        assert_eq_m512h(r, e);
19276    }
19277
19278    #[simd_test(enable = "avx512fp16")]
19279    unsafe fn test_mm512_mask_fcmul_round_pch() {
19280        let a = _mm512_set1_pch(0.0, 1.0);
19281        let b = _mm512_set1_pch(0.0, -1.0);
19282        let src = _mm512_setr_ph(
19283            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19284            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19285            32.0, 33.0,
19286        );
19287        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19288            src,
19289            0b0101010101010101,
19290            a,
19291            b,
19292        );
19293        let e = _mm512_setr_ph(
19294            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19295            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19296            33.0,
19297        );
19298        assert_eq_m512h(r, e);
19299    }
19300
19301    #[simd_test(enable = "avx512fp16")]
19302    unsafe fn test_mm512_maskz_fcmul_round_pch() {
19303        let a = _mm512_set1_pch(0.0, 1.0);
19304        let b = _mm512_set1_pch(0.0, -1.0);
19305        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19306            0b0101010101010101,
19307            a,
19308            b,
19309        );
19310        let e = _mm512_setr_ph(
19311            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19312            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19313        );
19314        assert_eq_m512h(r, e);
19315    }
19316
19317    #[simd_test(enable = "avx512fp16")]
19318    unsafe fn test_mm_fcmul_sch() {
19319        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19320        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19321        let r = _mm_fcmul_sch(a, b);
19322        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19323        assert_eq_m128h(r, e);
19324    }
19325
19326    #[simd_test(enable = "avx512fp16")]
19327    unsafe fn test_mm_mask_fcmul_sch() {
19328        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19329        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19330        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19331        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19332        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19333        assert_eq_m128h(r, e);
19334    }
19335
19336    #[simd_test(enable = "avx512fp16")]
19337    unsafe fn test_mm_maskz_fcmul_sch() {
19338        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19339        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19340        let r = _mm_maskz_fcmul_sch(0, a, b);
19341        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19342        assert_eq_m128h(r, e);
19343    }
19344
19345    #[simd_test(enable = "avx512fp16")]
19346    unsafe fn test_mm_fcmul_round_sch() {
19347        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19348        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19349        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19350        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19351        assert_eq_m128h(r, e);
19352    }
19353
19354    #[simd_test(enable = "avx512fp16")]
19355    unsafe fn test_mm_mask_fcmul_round_sch() {
19356        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19357        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19358        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19359        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19360            src, 0, a, b,
19361        );
19362        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19363        assert_eq_m128h(r, e);
19364    }
19365
19366    #[simd_test(enable = "avx512fp16")]
19367    unsafe fn test_mm_maskz_fcmul_round_sch() {
19368        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19369        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19370        let r =
19371            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19372        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19373        assert_eq_m128h(r, e);
19374    }
19375
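    // `_abs_ph` clears the sign bit of every f16 lane, so each negative input maps
    // to its positive counterpart and 0.0 is left unchanged.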
19376    #[simd_test(enable = "avx512fp16,avx512vl")]
19377    unsafe fn test_mm_abs_ph() {
19378        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19379        let r = _mm_abs_ph(a);
19380        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19381        assert_eq_m128h(r, e);
19382    }
19383
19384    #[simd_test(enable = "avx512fp16,avx512vl")]
19385    unsafe fn test_mm256_abs_ph() {
19386        let a = _mm256_set_ph(
19387            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19388            -14.0,
19389        );
19390        let r = _mm256_abs_ph(a);
19391        let e = _mm256_set_ph(
19392            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19393        );
19394        assert_eq_m256h(r, e);
19395    }
19396
19397    #[simd_test(enable = "avx512fp16")]
19398    unsafe fn test_mm512_abs_ph() {
19399        let a = _mm512_set_ph(
19400            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19401            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19402            27.0, -28.0, 29.0, -30.0,
19403        );
19404        let r = _mm512_abs_ph(a);
19405        let e = _mm512_set_ph(
19406            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19407            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19408            29.0, 30.0,
19409        );
19410        assert_eq_m512h(r, e);
19411    }
19412
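    // `_conj_pch` negates the imaginary part of each complex element, i.e. it flips
    // the sign of every odd f16 lane: 0 + 1i becomes 0 - 1i. The masked variants
    // blend the unselected pairs from `src` or zero them.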
19413    #[simd_test(enable = "avx512fp16,avx512vl")]
19414    unsafe fn test_mm_conj_pch() {
19415        let a = _mm_set1_pch(0.0, 1.0);
19416        let r = _mm_conj_pch(a);
19417        let e = _mm_set1_pch(0.0, -1.0);
19418        assert_eq_m128h(r, e);
19419    }
19420
19421    #[simd_test(enable = "avx512fp16,avx512vl")]
19422    unsafe fn test_mm_mask_conj_pch() {
19423        let a = _mm_set1_pch(0.0, 1.0);
19424        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19425        let r = _mm_mask_conj_pch(src, 0b0101, a);
19426        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19427        assert_eq_m128h(r, e);
19428    }
19429
19430    #[simd_test(enable = "avx512fp16,avx512vl")]
19431    unsafe fn test_mm_maskz_conj_pch() {
19432        let a = _mm_set1_pch(0.0, 1.0);
19433        let r = _mm_maskz_conj_pch(0b0101, a);
19434        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19435        assert_eq_m128h(r, e);
19436    }
19437
19438    #[simd_test(enable = "avx512fp16,avx512vl")]
19439    unsafe fn test_mm256_conj_pch() {
19440        let a = _mm256_set1_pch(0.0, 1.0);
19441        let r = _mm256_conj_pch(a);
19442        let e = _mm256_set1_pch(0.0, -1.0);
19443        assert_eq_m256h(r, e);
19444    }
19445
19446    #[simd_test(enable = "avx512fp16,avx512vl")]
19447    unsafe fn test_mm256_mask_conj_pch() {
19448        let a = _mm256_set1_pch(0.0, 1.0);
19449        let src = _mm256_setr_ph(
19450            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19451        );
19452        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19453        let e = _mm256_setr_ph(
19454            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19455        );
19456        assert_eq_m256h(r, e);
19457    }
19458
19459    #[simd_test(enable = "avx512fp16,avx512vl")]
19460    unsafe fn test_mm256_maskz_conj_pch() {
19461        let a = _mm256_set1_pch(0.0, 1.0);
19462        let r = _mm256_maskz_conj_pch(0b01010101, a);
19463        let e = _mm256_setr_ph(
19464            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19465        );
19466        assert_eq_m256h(r, e);
19467    }
19468
19469    #[simd_test(enable = "avx512fp16")]
19470    unsafe fn test_mm512_conj_pch() {
19471        let a = _mm512_set1_pch(0.0, 1.0);
19472        let r = _mm512_conj_pch(a);
19473        let e = _mm512_set1_pch(0.0, -1.0);
19474        assert_eq_m512h(r, e);
19475    }
19476
19477    #[simd_test(enable = "avx512fp16")]
19478    unsafe fn test_mm512_mask_conj_pch() {
19479        let a = _mm512_set1_pch(0.0, 1.0);
19480        let src = _mm512_setr_ph(
19481            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19482            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19483            32.0, 33.0,
19484        );
19485        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19486        let e = _mm512_setr_ph(
19487            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19488            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19489            33.0,
19490        );
19491        assert_eq_m512h(r, e);
19492    }
19493
19494    #[simd_test(enable = "avx512fp16")]
19495    unsafe fn test_mm512_maskz_conj_pch() {
19496        let a = _mm512_set1_pch(0.0, 1.0);
19497        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19498        let e = _mm512_setr_ph(
19499            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19500            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19501        );
19502        assert_eq_m512h(r, e);
19503    }
19504
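    // `_fmadd_pch` computes a complex fused multiply-add, a * b + c, per pair:
    // (0 + 1i) * (0 + 2i) + (0 + 3i) = -2 + 3i. The mask variants differ only in what
    // fills the unselected pairs: `_mask_` keeps the pair from `a`, `_mask3_` keeps it
    // from `c`, and `_maskz_` zeroes it, as the expected vectors below demonstrate.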
19505    #[simd_test(enable = "avx512fp16,avx512vl")]
19506    unsafe fn test_mm_fmadd_pch() {
19507        let a = _mm_set1_pch(0.0, 1.0);
19508        let b = _mm_set1_pch(0.0, 2.0);
19509        let c = _mm_set1_pch(0.0, 3.0);
19510        let r = _mm_fmadd_pch(a, b, c);
19511        let e = _mm_set1_pch(-2.0, 3.0);
19512        assert_eq_m128h(r, e);
19513    }
19514
19515    #[simd_test(enable = "avx512fp16,avx512vl")]
19516    unsafe fn test_mm_mask_fmadd_pch() {
19517        let a = _mm_set1_pch(0.0, 1.0);
19518        let b = _mm_set1_pch(0.0, 2.0);
19519        let c = _mm_set1_pch(0.0, 3.0);
19520        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19521        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19522        assert_eq_m128h(r, e);
19523    }
19524
19525    #[simd_test(enable = "avx512fp16,avx512vl")]
19526    unsafe fn test_mm_mask3_fmadd_pch() {
19527        let a = _mm_set1_pch(0.0, 1.0);
19528        let b = _mm_set1_pch(0.0, 2.0);
19529        let c = _mm_set1_pch(0.0, 3.0);
19530        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19531        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19532        assert_eq_m128h(r, e);
19533    }
19534
19535    #[simd_test(enable = "avx512fp16,avx512vl")]
19536    unsafe fn test_mm_maskz_fmadd_pch() {
19537        let a = _mm_set1_pch(0.0, 1.0);
19538        let b = _mm_set1_pch(0.0, 2.0);
19539        let c = _mm_set1_pch(0.0, 3.0);
19540        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19541        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19542        assert_eq_m128h(r, e);
19543    }
19544
19545    #[simd_test(enable = "avx512fp16,avx512vl")]
19546    unsafe fn test_mm256_fmadd_pch() {
19547        let a = _mm256_set1_pch(0.0, 1.0);
19548        let b = _mm256_set1_pch(0.0, 2.0);
19549        let c = _mm256_set1_pch(0.0, 3.0);
19550        let r = _mm256_fmadd_pch(a, b, c);
19551        let e = _mm256_set1_pch(-2.0, 3.0);
19552        assert_eq_m256h(r, e);
19553    }
19554
19555    #[simd_test(enable = "avx512fp16,avx512vl")]
19556    unsafe fn test_mm256_mask_fmadd_pch() {
19557        let a = _mm256_set1_pch(0.0, 1.0);
19558        let b = _mm256_set1_pch(0.0, 2.0);
19559        let c = _mm256_set1_pch(0.0, 3.0);
19560        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19561        let e = _mm256_setr_ph(
19562            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19563        );
19564        assert_eq_m256h(r, e);
19565    }
19566
19567    #[simd_test(enable = "avx512fp16,avx512vl")]
19568    unsafe fn test_mm256_mask3_fmadd_pch() {
19569        let a = _mm256_set1_pch(0.0, 1.0);
19570        let b = _mm256_set1_pch(0.0, 2.0);
19571        let c = _mm256_set1_pch(0.0, 3.0);
19572        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19573        let e = _mm256_setr_ph(
19574            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19575        );
19576        assert_eq_m256h(r, e);
19577    }
19578
19579    #[simd_test(enable = "avx512fp16,avx512vl")]
19580    unsafe fn test_mm256_maskz_fmadd_pch() {
19581        let a = _mm256_set1_pch(0.0, 1.0);
19582        let b = _mm256_set1_pch(0.0, 2.0);
19583        let c = _mm256_set1_pch(0.0, 3.0);
19584        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19585        let e = _mm256_setr_ph(
19586            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19587        );
19588        assert_eq_m256h(r, e);
19589    }
19590
19591    #[simd_test(enable = "avx512fp16")]
19592    unsafe fn test_mm512_fmadd_pch() {
19593        let a = _mm512_set1_pch(0.0, 1.0);
19594        let b = _mm512_set1_pch(0.0, 2.0);
19595        let c = _mm512_set1_pch(0.0, 3.0);
19596        let r = _mm512_fmadd_pch(a, b, c);
19597        let e = _mm512_set1_pch(-2.0, 3.0);
19598        assert_eq_m512h(r, e);
19599    }
19600
19601    #[simd_test(enable = "avx512fp16")]
19602    unsafe fn test_mm512_mask_fmadd_pch() {
19603        let a = _mm512_set1_pch(0.0, 1.0);
19604        let b = _mm512_set1_pch(0.0, 2.0);
19605        let c = _mm512_set1_pch(0.0, 3.0);
19606        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19607        let e = _mm512_setr_ph(
19608            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19609            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19610        );
19611        assert_eq_m512h(r, e);
19612    }
19613
19614    #[simd_test(enable = "avx512fp16")]
19615    unsafe fn test_mm512_mask3_fmadd_pch() {
19616        let a = _mm512_set1_pch(0.0, 1.0);
19617        let b = _mm512_set1_pch(0.0, 2.0);
19618        let c = _mm512_set1_pch(0.0, 3.0);
19619        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19620        let e = _mm512_setr_ph(
19621            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19622            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19623        );
19624        assert_eq_m512h(r, e);
19625    }
19626
19627    #[simd_test(enable = "avx512fp16")]
19628    unsafe fn test_mm512_maskz_fmadd_pch() {
19629        let a = _mm512_set1_pch(0.0, 1.0);
19630        let b = _mm512_set1_pch(0.0, 2.0);
19631        let c = _mm512_set1_pch(0.0, 3.0);
19632        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19633        let e = _mm512_setr_ph(
19634            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19635            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19636        );
19637        assert_eq_m512h(r, e);
19638    }
19639
19640    #[simd_test(enable = "avx512fp16")]
19641    unsafe fn test_mm512_fmadd_round_pch() {
19642        let a = _mm512_set1_pch(0.0, 1.0);
19643        let b = _mm512_set1_pch(0.0, 2.0);
19644        let c = _mm512_set1_pch(0.0, 3.0);
19645        let r =
19646            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19647        let e = _mm512_set1_pch(-2.0, 3.0);
19648        assert_eq_m512h(r, e);
19649    }
19650
19651    #[simd_test(enable = "avx512fp16")]
19652    unsafe fn test_mm512_mask_fmadd_round_pch() {
19653        let a = _mm512_set1_pch(0.0, 1.0);
19654        let b = _mm512_set1_pch(0.0, 2.0);
19655        let c = _mm512_set1_pch(0.0, 3.0);
19656        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19657            a,
19658            0b0101010101010101,
19659            b,
19660            c,
19661        );
19662        let e = _mm512_setr_ph(
19663            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19664            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19665        );
19666        assert_eq_m512h(r, e);
19667    }
19668
19669    #[simd_test(enable = "avx512fp16")]
19670    unsafe fn test_mm512_mask3_fmadd_round_pch() {
19671        let a = _mm512_set1_pch(0.0, 1.0);
19672        let b = _mm512_set1_pch(0.0, 2.0);
19673        let c = _mm512_set1_pch(0.0, 3.0);
19674        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19675            a,
19676            b,
19677            c,
19678            0b0101010101010101,
19679        );
19680        let e = _mm512_setr_ph(
19681            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19682            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19683        );
19684        assert_eq_m512h(r, e);
19685    }
19686
19687    #[simd_test(enable = "avx512fp16")]
19688    unsafe fn test_mm512_maskz_fmadd_round_pch() {
19689        let a = _mm512_set1_pch(0.0, 1.0);
19690        let b = _mm512_set1_pch(0.0, 2.0);
19691        let c = _mm512_set1_pch(0.0, 3.0);
19692        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19693            0b0101010101010101,
19694            a,
19695            b,
19696            c,
19697        );
19698        let e = _mm512_setr_ph(
19699            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19700            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19701        );
19702        assert_eq_m512h(r, e);
19703    }
19704
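    // Note (added comment): the *_sch variants compute only the lowest complex element
    // (lanes 0..1). In the expected vectors below the remaining lanes are carried over
    // from `a` (or from `c` in the mask3 forms), matching the merge behaviour the tests
    // assert.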
19705    #[simd_test(enable = "avx512fp16")]
19706    unsafe fn test_mm_fmadd_sch() {
19707        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19708        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19709        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19710        let r = _mm_fmadd_sch(a, b, c);
19711        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19712        assert_eq_m128h(r, e);
19713    }
19714
19715    #[simd_test(enable = "avx512fp16")]
19716    unsafe fn test_mm_mask_fmadd_sch() {
19717        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19718        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19719        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19720        let r = _mm_mask_fmadd_sch(a, 0, b, c);
19721        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19722        assert_eq_m128h(r, e);
19723        let r = _mm_mask_fmadd_sch(a, 1, b, c);
19724        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19725        assert_eq_m128h(r, e);
19726    }
19727
19728    #[simd_test(enable = "avx512fp16")]
19729    unsafe fn test_mm_mask3_fmadd_sch() {
19730        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19731        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19732        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19733        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
19734        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19735        assert_eq_m128h(r, e);
19736        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
19737        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19738        assert_eq_m128h(r, e);
19739    }
19740
19741    #[simd_test(enable = "avx512fp16")]
19742    unsafe fn test_mm_maskz_fmadd_sch() {
19743        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19744        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19745        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19746        let r = _mm_maskz_fmadd_sch(0, a, b, c);
19747        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19748        assert_eq_m128h(r, e);
19749        let r = _mm_maskz_fmadd_sch(1, a, b, c);
19750        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19751        assert_eq_m128h(r, e);
19752    }
19753
19754    #[simd_test(enable = "avx512fp16")]
19755    unsafe fn test_mm_fmadd_round_sch() {
19756        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19757        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19758        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19759        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19760        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19761        assert_eq_m128h(r, e);
19762    }
19763
19764    #[simd_test(enable = "avx512fp16")]
19765    unsafe fn test_mm_mask_fmadd_round_sch() {
19766        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19767        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19768        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19769        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19770            a, 0, b, c,
19771        );
19772        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19773        assert_eq_m128h(r, e);
19774        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19775            a, 1, b, c,
19776        );
19777        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19778        assert_eq_m128h(r, e);
19779    }
19780
19781    #[simd_test(enable = "avx512fp16")]
19782    unsafe fn test_mm_mask3_fmadd_round_sch() {
19783        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19784        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19785        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19786        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19787            a, b, c, 0,
19788        );
19789        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19790        assert_eq_m128h(r, e);
19791        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19792            a, b, c, 1,
19793        );
19794        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19795        assert_eq_m128h(r, e);
19796    }
19797
19798    #[simd_test(enable = "avx512fp16")]
19799    unsafe fn test_mm_maskz_fmadd_round_sch() {
19800        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19801        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19802        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19803        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19804            0, a, b, c,
19805        );
19806        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19807        assert_eq_m128h(r, e);
19808        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19809            1, a, b, c,
19810        );
19811        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19812        assert_eq_m128h(r, e);
19813    }
19814
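    // Note (added comment): fcmadd_pch is the conjugating form of the complex fused
    // multiply-add: one multiplicand is conjugated before the multiply, so with the same
    // inputs the product term flips from -2 + 0i to 2 + 0i and the expected results
    // become 2 + 3i.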
19815    #[simd_test(enable = "avx512fp16,avx512vl")]
19816    unsafe fn test_mm_fcmadd_pch() {
19817        let a = _mm_set1_pch(0.0, 1.0);
19818        let b = _mm_set1_pch(0.0, 2.0);
19819        let c = _mm_set1_pch(0.0, 3.0);
19820        let r = _mm_fcmadd_pch(a, b, c);
19821        let e = _mm_set1_pch(2.0, 3.0);
19822        assert_eq_m128h(r, e);
19823    }
19824
19825    #[simd_test(enable = "avx512fp16,avx512vl")]
19826    unsafe fn test_mm_mask_fcmadd_pch() {
19827        let a = _mm_set1_pch(0.0, 1.0);
19828        let b = _mm_set1_pch(0.0, 2.0);
19829        let c = _mm_set1_pch(0.0, 3.0);
19830        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
19831        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
19832        assert_eq_m128h(r, e);
19833    }
19834
19835    #[simd_test(enable = "avx512fp16,avx512vl")]
19836    unsafe fn test_mm_mask3_fcmadd_pch() {
19837        let a = _mm_set1_pch(0.0, 1.0);
19838        let b = _mm_set1_pch(0.0, 2.0);
19839        let c = _mm_set1_pch(0.0, 3.0);
19840        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
19841        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
19842        assert_eq_m128h(r, e);
19843    }
19844
19845    #[simd_test(enable = "avx512fp16,avx512vl")]
19846    unsafe fn test_mm_maskz_fcmadd_pch() {
19847        let a = _mm_set1_pch(0.0, 1.0);
19848        let b = _mm_set1_pch(0.0, 2.0);
19849        let c = _mm_set1_pch(0.0, 3.0);
19850        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
19851        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
19852        assert_eq_m128h(r, e);
19853    }
19854
19855    #[simd_test(enable = "avx512fp16,avx512vl")]
19856    unsafe fn test_mm256_fcmadd_pch() {
19857        let a = _mm256_set1_pch(0.0, 1.0);
19858        let b = _mm256_set1_pch(0.0, 2.0);
19859        let c = _mm256_set1_pch(0.0, 3.0);
19860        let r = _mm256_fcmadd_pch(a, b, c);
19861        let e = _mm256_set1_pch(2.0, 3.0);
19862        assert_eq_m256h(r, e);
19863    }
19864
19865    #[simd_test(enable = "avx512fp16,avx512vl")]
19866    unsafe fn test_mm256_mask_fcmadd_pch() {
19867        let a = _mm256_set1_pch(0.0, 1.0);
19868        let b = _mm256_set1_pch(0.0, 2.0);
19869        let c = _mm256_set1_pch(0.0, 3.0);
19870        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
19871        let e = _mm256_setr_ph(
19872            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19873        );
19874        assert_eq_m256h(r, e);
19875    }
19876
19877    #[simd_test(enable = "avx512fp16,avx512vl")]
19878    unsafe fn test_mm256_mask3_fcmadd_pch() {
19879        let a = _mm256_set1_pch(0.0, 1.0);
19880        let b = _mm256_set1_pch(0.0, 2.0);
19881        let c = _mm256_set1_pch(0.0, 3.0);
19882        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
19883        let e = _mm256_setr_ph(
19884            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19885        );
19886        assert_eq_m256h(r, e);
19887    }
19888
19889    #[simd_test(enable = "avx512fp16,avx512vl")]
19890    unsafe fn test_mm256_maskz_fcmadd_pch() {
19891        let a = _mm256_set1_pch(0.0, 1.0);
19892        let b = _mm256_set1_pch(0.0, 2.0);
19893        let c = _mm256_set1_pch(0.0, 3.0);
19894        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
19895        let e = _mm256_setr_ph(
19896            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19897        );
19898        assert_eq_m256h(r, e);
19899    }
19900
19901    #[simd_test(enable = "avx512fp16")]
19902    unsafe fn test_mm512_fcmadd_pch() {
19903        let a = _mm512_set1_pch(0.0, 1.0);
19904        let b = _mm512_set1_pch(0.0, 2.0);
19905        let c = _mm512_set1_pch(0.0, 3.0);
19906        let r = _mm512_fcmadd_pch(a, b, c);
19907        let e = _mm512_set1_pch(2.0, 3.0);
19908        assert_eq_m512h(r, e);
19909    }
19910
19911    #[simd_test(enable = "avx512fp16")]
19912    unsafe fn test_mm512_mask_fcmadd_pch() {
19913        let a = _mm512_set1_pch(0.0, 1.0);
19914        let b = _mm512_set1_pch(0.0, 2.0);
19915        let c = _mm512_set1_pch(0.0, 3.0);
19916        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
19917        let e = _mm512_setr_ph(
19918            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19919            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19920        );
19921        assert_eq_m512h(r, e);
19922    }
19923
19924    #[simd_test(enable = "avx512fp16")]
19925    unsafe fn test_mm512_mask3_fcmadd_pch() {
19926        let a = _mm512_set1_pch(0.0, 1.0);
19927        let b = _mm512_set1_pch(0.0, 2.0);
19928        let c = _mm512_set1_pch(0.0, 3.0);
19929        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
19930        let e = _mm512_setr_ph(
19931            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19932            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19933        );
19934        assert_eq_m512h(r, e);
19935    }
19936
19937    #[simd_test(enable = "avx512fp16")]
19938    unsafe fn test_mm512_maskz_fcmadd_pch() {
19939        let a = _mm512_set1_pch(0.0, 1.0);
19940        let b = _mm512_set1_pch(0.0, 2.0);
19941        let c = _mm512_set1_pch(0.0, 3.0);
19942        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
19943        let e = _mm512_setr_ph(
19944            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
19945            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19946        );
19947        assert_eq_m512h(r, e);
19948    }
19949
19950    #[simd_test(enable = "avx512fp16")]
19951    unsafe fn test_mm512_fcmadd_round_pch() {
19952        let a = _mm512_set1_pch(0.0, 1.0);
19953        let b = _mm512_set1_pch(0.0, 2.0);
19954        let c = _mm512_set1_pch(0.0, 3.0);
19955        let r =
19956            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19957        let e = _mm512_set1_pch(2.0, 3.0);
19958        assert_eq_m512h(r, e);
19959    }
19960
19961    #[simd_test(enable = "avx512fp16")]
19962    unsafe fn test_mm512_mask_fcmadd_round_pch() {
19963        let a = _mm512_set1_pch(0.0, 1.0);
19964        let b = _mm512_set1_pch(0.0, 2.0);
19965        let c = _mm512_set1_pch(0.0, 3.0);
19966        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19967            a,
19968            0b0101010101010101,
19969            b,
19970            c,
19971        );
19972        let e = _mm512_setr_ph(
19973            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19974            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19975        );
19976        assert_eq_m512h(r, e);
19977    }
19978
19979    #[simd_test(enable = "avx512fp16")]
19980    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
19981        let a = _mm512_set1_pch(0.0, 1.0);
19982        let b = _mm512_set1_pch(0.0, 2.0);
19983        let c = _mm512_set1_pch(0.0, 3.0);
19984        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19985            a,
19986            b,
19987            c,
19988            0b0101010101010101,
19989        );
19990        let e = _mm512_setr_ph(
19991            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
19992            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19993        );
19994        assert_eq_m512h(r, e);
19995    }
19996
19997    #[simd_test(enable = "avx512fp16")]
19998    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
19999        let a = _mm512_set1_pch(0.0, 1.0);
20000        let b = _mm512_set1_pch(0.0, 2.0);
20001        let c = _mm512_set1_pch(0.0, 3.0);
20002        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20003            0b0101010101010101,
20004            a,
20005            b,
20006            c,
20007        );
20008        let e = _mm512_setr_ph(
20009            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20010            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20011        );
20012        assert_eq_m512h(r, e);
20013    }
20014
20015    #[simd_test(enable = "avx512fp16")]
20016    unsafe fn test_mm_fcmadd_sch() {
20017        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20018        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20019        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20020        let r = _mm_fcmadd_sch(a, b, c);
20021        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20022        assert_eq_m128h(r, e);
20023    }
20024
20025    #[simd_test(enable = "avx512fp16")]
20026    unsafe fn test_mm_mask_fcmadd_sch() {
20027        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20028        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20029        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20030        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
20031        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20032        assert_eq_m128h(r, e);
20033        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
20034        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20035        assert_eq_m128h(r, e);
20036    }
20037
20038    #[simd_test(enable = "avx512fp16")]
20039    unsafe fn test_mm_mask3_fcmadd_sch() {
20040        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20041        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20042        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20043        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
20044        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20045        assert_eq_m128h(r, e);
20046        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
20047        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20048        assert_eq_m128h(r, e);
20049    }
20050
20051    #[simd_test(enable = "avx512fp16")]
20052    unsafe fn test_mm_maskz_fcmadd_sch() {
20053        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20054        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20055        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20056        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
20057        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20058        assert_eq_m128h(r, e);
20059        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
20060        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20061        assert_eq_m128h(r, e);
20062    }
20063
20064    #[simd_test(enable = "avx512fp16")]
20065    unsafe fn test_mm_fcmadd_round_sch() {
20066        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20067        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20068        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20069        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20070        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20071        assert_eq_m128h(r, e);
20072    }
20073
20074    #[simd_test(enable = "avx512fp16")]
20075    unsafe fn test_mm_mask_fcmadd_round_sch() {
20076        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20077        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20078        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20079        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20080            a, 0, b, c,
20081        );
20082        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20083        assert_eq_m128h(r, e);
20084        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20085            a, 1, b, c,
20086        );
20087        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20088        assert_eq_m128h(r, e);
20089    }
20090
20091    #[simd_test(enable = "avx512fp16")]
20092    unsafe fn test_mm_mask3_fcmadd_round_sch() {
20093        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20094        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20095        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20096        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20097            a, b, c, 0,
20098        );
20099        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20100        assert_eq_m128h(r, e);
20101        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20102            a, b, c, 1,
20103        );
20104        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20105        assert_eq_m128h(r, e);
20106    }
20107
20108    #[simd_test(enable = "avx512fp16")]
20109    unsafe fn test_mm_maskz_fcmadd_round_sch() {
20110        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20111        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20112        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20113        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20114            0, a, b, c,
20115        );
20116        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20117        assert_eq_m128h(r, e);
20118        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20119            1, a, b, c,
20120        );
20121        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20122        assert_eq_m128h(r, e);
20123    }
20124
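    // Note (added comment): the plain fmadd_ph tests cover the real-valued packed case
    // (1.0 * 2.0 + 3.0 = 5.0). The expectations here are built with _mm_set_ph-style
    // constructors, which list lanes from highest to lowest, so mask bit 0 corresponds
    // to the last literal in each expected vector.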
20125    #[simd_test(enable = "avx512fp16,avx512vl")]
20126    unsafe fn test_mm_fmadd_ph() {
20127        let a = _mm_set1_ph(1.0);
20128        let b = _mm_set1_ph(2.0);
20129        let c = _mm_set1_ph(3.0);
20130        let r = _mm_fmadd_ph(a, b, c);
20131        let e = _mm_set1_ph(5.0);
20132        assert_eq_m128h(r, e);
20133    }
20134
20135    #[simd_test(enable = "avx512fp16,avx512vl")]
20136    unsafe fn test_mm_mask_fmadd_ph() {
20137        let a = _mm_set1_ph(1.0);
20138        let b = _mm_set1_ph(2.0);
20139        let c = _mm_set1_ph(3.0);
20140        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
20141        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
20142        assert_eq_m128h(r, e);
20143    }
20144
20145    #[simd_test(enable = "avx512fp16,avx512vl")]
20146    unsafe fn test_mm_mask3_fmadd_ph() {
20147        let a = _mm_set1_ph(1.0);
20148        let b = _mm_set1_ph(2.0);
20149        let c = _mm_set1_ph(3.0);
20150        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
20151        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
20152        assert_eq_m128h(r, e);
20153    }
20154
20155    #[simd_test(enable = "avx512fp16,avx512vl")]
20156    unsafe fn test_mm_maskz_fmadd_ph() {
20157        let a = _mm_set1_ph(1.0);
20158        let b = _mm_set1_ph(2.0);
20159        let c = _mm_set1_ph(3.0);
20160        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
20161        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
20162        assert_eq_m128h(r, e);
20163    }
20164
20165    #[simd_test(enable = "avx512fp16,avx512vl")]
20166    unsafe fn test_mm256_fmadd_ph() {
20167        let a = _mm256_set1_ph(1.0);
20168        let b = _mm256_set1_ph(2.0);
20169        let c = _mm256_set1_ph(3.0);
20170        let r = _mm256_fmadd_ph(a, b, c);
20171        let e = _mm256_set1_ph(5.0);
20172        assert_eq_m256h(r, e);
20173    }
20174
20175    #[simd_test(enable = "avx512fp16,avx512vl")]
20176    unsafe fn test_mm256_mask_fmadd_ph() {
20177        let a = _mm256_set1_ph(1.0);
20178        let b = _mm256_set1_ph(2.0);
20179        let c = _mm256_set1_ph(3.0);
20180        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
20181        let e = _mm256_set_ph(
20182            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20183        );
20184        assert_eq_m256h(r, e);
20185    }
20186
20187    #[simd_test(enable = "avx512fp16,avx512vl")]
20188    unsafe fn test_mm256_mask3_fmadd_ph() {
20189        let a = _mm256_set1_ph(1.0);
20190        let b = _mm256_set1_ph(2.0);
20191        let c = _mm256_set1_ph(3.0);
20192        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
20193        let e = _mm256_set_ph(
20194            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20195        );
20196        assert_eq_m256h(r, e);
20197    }
20198
20199    #[simd_test(enable = "avx512fp16,avx512vl")]
20200    unsafe fn test_mm256_maskz_fmadd_ph() {
20201        let a = _mm256_set1_ph(1.0);
20202        let b = _mm256_set1_ph(2.0);
20203        let c = _mm256_set1_ph(3.0);
20204        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
20205        let e = _mm256_set_ph(
20206            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20207        );
20208        assert_eq_m256h(r, e);
20209    }
20210
20211    #[simd_test(enable = "avx512fp16")]
20212    unsafe fn test_mm512_fmadd_ph() {
20213        let a = _mm512_set1_ph(1.0);
20214        let b = _mm512_set1_ph(2.0);
20215        let c = _mm512_set1_ph(3.0);
20216        let r = _mm512_fmadd_ph(a, b, c);
20217        let e = _mm512_set1_ph(5.0);
20218        assert_eq_m512h(r, e);
20219    }
20220
20221    #[simd_test(enable = "avx512fp16")]
20222    unsafe fn test_mm512_mask_fmadd_ph() {
20223        let a = _mm512_set1_ph(1.0);
20224        let b = _mm512_set1_ph(2.0);
20225        let c = _mm512_set1_ph(3.0);
20226        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20227        let e = _mm512_set_ph(
20228            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20229            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20230        );
20231        assert_eq_m512h(r, e);
20232    }
20233
20234    #[simd_test(enable = "avx512fp16")]
20235    unsafe fn test_mm512_mask3_fmadd_ph() {
20236        let a = _mm512_set1_ph(1.0);
20237        let b = _mm512_set1_ph(2.0);
20238        let c = _mm512_set1_ph(3.0);
20239        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20240        let e = _mm512_set_ph(
20241            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20242            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20243        );
20244        assert_eq_m512h(r, e);
20245    }
20246
20247    #[simd_test(enable = "avx512fp16")]
20248    unsafe fn test_mm512_maskz_fmadd_ph() {
20249        let a = _mm512_set1_ph(1.0);
20250        let b = _mm512_set1_ph(2.0);
20251        let c = _mm512_set1_ph(3.0);
20252        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
20253        let e = _mm512_set_ph(
20254            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20255            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20256        );
20257        assert_eq_m512h(r, e);
20258    }
20259
20260    #[simd_test(enable = "avx512fp16")]
20261    unsafe fn test_mm512_fmadd_round_ph() {
20262        let a = _mm512_set1_ph(1.0);
20263        let b = _mm512_set1_ph(2.0);
20264        let c = _mm512_set1_ph(3.0);
20265        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20266        let e = _mm512_set1_ph(5.0);
20267        assert_eq_m512h(r, e);
20268    }
20269
20270    #[simd_test(enable = "avx512fp16")]
20271    unsafe fn test_mm512_mask_fmadd_round_ph() {
20272        let a = _mm512_set1_ph(1.0);
20273        let b = _mm512_set1_ph(2.0);
20274        let c = _mm512_set1_ph(3.0);
20275        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20276            a,
20277            0b01010101010101010101010101010101,
20278            b,
20279            c,
20280        );
20281        let e = _mm512_set_ph(
20282            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20283            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20284        );
20285        assert_eq_m512h(r, e);
20286    }
20287
20288    #[simd_test(enable = "avx512fp16")]
20289    unsafe fn test_mm512_mask3_fmadd_round_ph() {
20290        let a = _mm512_set1_ph(1.0);
20291        let b = _mm512_set1_ph(2.0);
20292        let c = _mm512_set1_ph(3.0);
20293        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20294            a,
20295            b,
20296            c,
20297            0b01010101010101010101010101010101,
20298        );
20299        let e = _mm512_set_ph(
20300            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20301            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20302        );
20303        assert_eq_m512h(r, e);
20304    }
20305
20306    #[simd_test(enable = "avx512fp16")]
20307    unsafe fn test_mm512_maskz_fmadd_round_ph() {
20308        let a = _mm512_set1_ph(1.0);
20309        let b = _mm512_set1_ph(2.0);
20310        let c = _mm512_set1_ph(3.0);
20311        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20312            0b01010101010101010101010101010101,
20313            a,
20314            b,
20315            c,
20316        );
20317        let e = _mm512_set_ph(
20318            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20319            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20320        );
20321        assert_eq_m512h(r, e);
20322    }
20323
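    // Note (added comment): the *_sh variants are the real-valued scalar forms: only
    // lane 0 is computed (1.0 * 2.0 + 3.0 = 5.0), with the upper lanes passed through
    // from `a` (or from `c` in the mask3 forms).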
20324    #[simd_test(enable = "avx512fp16")]
20325    unsafe fn test_mm_fmadd_sh() {
20326        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20327        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20328        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20329        let r = _mm_fmadd_sh(a, b, c);
20330        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20331        assert_eq_m128h(r, e);
20332    }
20333
20334    #[simd_test(enable = "avx512fp16")]
20335    unsafe fn test_mm_mask_fmadd_sh() {
20336        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20337        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20338        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20339        let r = _mm_mask_fmadd_sh(a, 0, b, c);
20340        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20341        assert_eq_m128h(r, e);
20342        let r = _mm_mask_fmadd_sh(a, 1, b, c);
20343        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20344        assert_eq_m128h(r, e);
20345    }
20346
20347    #[simd_test(enable = "avx512fp16")]
20348    unsafe fn test_mm_mask3_fmadd_sh() {
20349        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20350        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20351        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20352        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
20353        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20354        assert_eq_m128h(r, e);
20355        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
20356        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20357        assert_eq_m128h(r, e);
20358    }
20359
20360    #[simd_test(enable = "avx512fp16")]
20361    unsafe fn test_mm_maskz_fmadd_sh() {
20362        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20363        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20364        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20365        let r = _mm_maskz_fmadd_sh(0, a, b, c);
20366        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20367        assert_eq_m128h(r, e);
20368        let r = _mm_maskz_fmadd_sh(1, a, b, c);
20369        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20370        assert_eq_m128h(r, e);
20371    }
20372
20373    #[simd_test(enable = "avx512fp16")]
20374    unsafe fn test_mm_fmadd_round_sh() {
20375        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20376        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20377        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20378        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20379        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20380        assert_eq_m128h(r, e);
20381    }
20382
20383    #[simd_test(enable = "avx512fp16")]
20384    unsafe fn test_mm_mask_fmadd_round_sh() {
20385        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20386        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20387        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20388        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20389            a, 0, b, c,
20390        );
20391        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20392        assert_eq_m128h(r, e);
20393        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20394            a, 1, b, c,
20395        );
20396        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20397        assert_eq_m128h(r, e);
20398    }
20399
20400    #[simd_test(enable = "avx512fp16")]
20401    unsafe fn test_mm_mask3_fmadd_round_sh() {
20402        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20403        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20404        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20405        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20406            a, b, c, 0,
20407        );
20408        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20409        assert_eq_m128h(r, e);
20410        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20411            a, b, c, 1,
20412        );
20413        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20414        assert_eq_m128h(r, e);
20415    }
20416
20417    #[simd_test(enable = "avx512fp16")]
20418    unsafe fn test_mm_maskz_fmadd_round_sh() {
20419        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20420        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20421        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20422        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20423            0, a, b, c,
20424        );
20425        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20426        assert_eq_m128h(r, e);
20427        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20428            1, a, b, c,
20429        );
20430        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20431        assert_eq_m128h(r, e);
20432    }
20433
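    // Note (added comment): fmsub negates the addend, computing a * b - c, so the
    // packed and scalar tests that follow expect 1.0 * 2.0 - 3.0 = -1.0.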
20434    #[simd_test(enable = "avx512fp16,avx512vl")]
20435    unsafe fn test_mm_fmsub_ph() {
20436        let a = _mm_set1_ph(1.0);
20437        let b = _mm_set1_ph(2.0);
20438        let c = _mm_set1_ph(3.0);
20439        let r = _mm_fmsub_ph(a, b, c);
20440        let e = _mm_set1_ph(-1.0);
20441        assert_eq_m128h(r, e);
20442    }
20443
20444    #[simd_test(enable = "avx512fp16,avx512vl")]
20445    unsafe fn test_mm_mask_fmsub_ph() {
20446        let a = _mm_set1_ph(1.0);
20447        let b = _mm_set1_ph(2.0);
20448        let c = _mm_set1_ph(3.0);
20449        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
20450        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
20451        assert_eq_m128h(r, e);
20452    }
20453
20454    #[simd_test(enable = "avx512fp16,avx512vl")]
20455    unsafe fn test_mm_mask3_fmsub_ph() {
20456        let a = _mm_set1_ph(1.0);
20457        let b = _mm_set1_ph(2.0);
20458        let c = _mm_set1_ph(3.0);
20459        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
20460        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
20461        assert_eq_m128h(r, e);
20462    }
20463
20464    #[simd_test(enable = "avx512fp16,avx512vl")]
20465    unsafe fn test_mm_maskz_fmsub_ph() {
20466        let a = _mm_set1_ph(1.0);
20467        let b = _mm_set1_ph(2.0);
20468        let c = _mm_set1_ph(3.0);
20469        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
20470        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
20471        assert_eq_m128h(r, e);
20472    }
20473
20474    #[simd_test(enable = "avx512fp16,avx512vl")]
20475    unsafe fn test_mm256_fmsub_ph() {
20476        let a = _mm256_set1_ph(1.0);
20477        let b = _mm256_set1_ph(2.0);
20478        let c = _mm256_set1_ph(3.0);
20479        let r = _mm256_fmsub_ph(a, b, c);
20480        let e = _mm256_set1_ph(-1.0);
20481        assert_eq_m256h(r, e);
20482    }
20483
20484    #[simd_test(enable = "avx512fp16,avx512vl")]
20485    unsafe fn test_mm256_mask_fmsub_ph() {
20486        let a = _mm256_set1_ph(1.0);
20487        let b = _mm256_set1_ph(2.0);
20488        let c = _mm256_set1_ph(3.0);
20489        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
20490        let e = _mm256_set_ph(
20491            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20492        );
20493        assert_eq_m256h(r, e);
20494    }
20495
20496    #[simd_test(enable = "avx512fp16,avx512vl")]
20497    unsafe fn test_mm256_mask3_fmsub_ph() {
20498        let a = _mm256_set1_ph(1.0);
20499        let b = _mm256_set1_ph(2.0);
20500        let c = _mm256_set1_ph(3.0);
20501        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
20502        let e = _mm256_set_ph(
20503            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20504        );
20505        assert_eq_m256h(r, e);
20506    }
20507
20508    #[simd_test(enable = "avx512fp16,avx512vl")]
20509    unsafe fn test_mm256_maskz_fmsub_ph() {
20510        let a = _mm256_set1_ph(1.0);
20511        let b = _mm256_set1_ph(2.0);
20512        let c = _mm256_set1_ph(3.0);
20513        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
20514        let e = _mm256_set_ph(
20515            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20516        );
20517        assert_eq_m256h(r, e);
20518    }
20519
20520    #[simd_test(enable = "avx512fp16")]
20521    unsafe fn test_mm512_fmsub_ph() {
20522        let a = _mm512_set1_ph(1.0);
20523        let b = _mm512_set1_ph(2.0);
20524        let c = _mm512_set1_ph(3.0);
20525        let r = _mm512_fmsub_ph(a, b, c);
20526        let e = _mm512_set1_ph(-1.0);
20527        assert_eq_m512h(r, e);
20528    }
20529
20530    #[simd_test(enable = "avx512fp16")]
20531    unsafe fn test_mm512_mask_fmsub_ph() {
20532        let a = _mm512_set1_ph(1.0);
20533        let b = _mm512_set1_ph(2.0);
20534        let c = _mm512_set1_ph(3.0);
20535        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20536        let e = _mm512_set_ph(
20537            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20538            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20539        );
20540        assert_eq_m512h(r, e);
20541    }
20542
20543    #[simd_test(enable = "avx512fp16")]
20544    unsafe fn test_mm512_mask3_fmsub_ph() {
20545        let a = _mm512_set1_ph(1.0);
20546        let b = _mm512_set1_ph(2.0);
20547        let c = _mm512_set1_ph(3.0);
20548        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20549        let e = _mm512_set_ph(
20550            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20551            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20552        );
20553        assert_eq_m512h(r, e);
20554    }
20555
20556    #[simd_test(enable = "avx512fp16")]
20557    unsafe fn test_mm512_maskz_fmsub_ph() {
20558        let a = _mm512_set1_ph(1.0);
20559        let b = _mm512_set1_ph(2.0);
20560        let c = _mm512_set1_ph(3.0);
20561        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20562        let e = _mm512_set_ph(
20563            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20564            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20565        );
20566        assert_eq_m512h(r, e);
20567    }
20568
20569    #[simd_test(enable = "avx512fp16")]
20570    unsafe fn test_mm512_fmsub_round_ph() {
20571        let a = _mm512_set1_ph(1.0);
20572        let b = _mm512_set1_ph(2.0);
20573        let c = _mm512_set1_ph(3.0);
20574        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20575        let e = _mm512_set1_ph(-1.0);
20576        assert_eq_m512h(r, e);
20577    }
20578
20579    #[simd_test(enable = "avx512fp16")]
20580    unsafe fn test_mm512_mask_fmsub_round_ph() {
20581        let a = _mm512_set1_ph(1.0);
20582        let b = _mm512_set1_ph(2.0);
20583        let c = _mm512_set1_ph(3.0);
20584        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20585            a,
20586            0b01010101010101010101010101010101,
20587            b,
20588            c,
20589        );
20590        let e = _mm512_set_ph(
20591            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20592            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20593        );
20594        assert_eq_m512h(r, e);
20595    }
20596
20597    #[simd_test(enable = "avx512fp16")]
20598    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20599        let a = _mm512_set1_ph(1.0);
20600        let b = _mm512_set1_ph(2.0);
20601        let c = _mm512_set1_ph(3.0);
20602        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20603            a,
20604            b,
20605            c,
20606            0b01010101010101010101010101010101,
20607        );
20608        let e = _mm512_set_ph(
20609            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20610            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20611        );
20612        assert_eq_m512h(r, e);
20613    }
20614
20615    #[simd_test(enable = "avx512fp16")]
20616    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20617        let a = _mm512_set1_ph(1.0);
20618        let b = _mm512_set1_ph(2.0);
20619        let c = _mm512_set1_ph(3.0);
20620        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20621            0b01010101010101010101010101010101,
20622            a,
20623            b,
20624            c,
20625        );
20626        let e = _mm512_set_ph(
20627            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20628            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20629        );
20630        assert_eq_m512h(r, e);
20631    }
20632
20633    #[simd_test(enable = "avx512fp16")]
20634    unsafe fn test_mm_fmsub_sh() {
20635        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20636        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20637        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20638        let r = _mm_fmsub_sh(a, b, c);
20639        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20640        assert_eq_m128h(r, e);
20641    }
20642
20643    #[simd_test(enable = "avx512fp16")]
20644    unsafe fn test_mm_mask_fmsub_sh() {
20645        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20646        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20647        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20648        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20649        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20650        assert_eq_m128h(r, e);
20651        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20652        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20653        assert_eq_m128h(r, e);
20654    }
20655
20656    #[simd_test(enable = "avx512fp16")]
20657    unsafe fn test_mm_mask3_fmsub_sh() {
20658        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20659        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20660        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20661        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20662        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20663        assert_eq_m128h(r, e);
20664        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20665        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20666        assert_eq_m128h(r, e);
20667    }
20668
20669    #[simd_test(enable = "avx512fp16")]
20670    unsafe fn test_mm_maskz_fmsub_sh() {
20671        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20672        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20673        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20674        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20675        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20676        assert_eq_m128h(r, e);
20677        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20678        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20679        assert_eq_m128h(r, e);
20680    }
20681
20682    #[simd_test(enable = "avx512fp16")]
20683    unsafe fn test_mm_fmsub_round_sh() {
20684        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20685        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20686        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20687        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20688        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20689        assert_eq_m128h(r, e);
20690    }
20691
20692    #[simd_test(enable = "avx512fp16")]
20693    unsafe fn test_mm_mask_fmsub_round_sh() {
20694        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20695        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20696        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20697        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20698            a, 0, b, c,
20699        );
20700        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20701        assert_eq_m128h(r, e);
20702        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20703            a, 1, b, c,
20704        );
20705        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20706        assert_eq_m128h(r, e);
20707    }
20708
20709    #[simd_test(enable = "avx512fp16")]
20710    unsafe fn test_mm_mask3_fmsub_round_sh() {
20711        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20712        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20713        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20714        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20715            a, b, c, 0,
20716        );
20717        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20718        assert_eq_m128h(r, e);
20719        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20720            a, b, c, 1,
20721        );
20722        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20723        assert_eq_m128h(r, e);
20724    }
20725
20726    #[simd_test(enable = "avx512fp16")]
20727    unsafe fn test_mm_maskz_fmsub_round_sh() {
20728        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20729        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20730        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20731        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20732            0, a, b, c,
20733        );
20734        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20735        assert_eq_m128h(r, e);
20736        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20737            1, a, b, c,
20738        );
20739        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20740        assert_eq_m128h(r, e);
20741    }
20742
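    // Note (added comment): fnmadd negates the product, computing -(a * b) + c, so the
    // tests below expect -(1.0 * 2.0) + 3.0 = 1.0.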
20743    #[simd_test(enable = "avx512fp16,avx512vl")]
20744    unsafe fn test_mm_fnmadd_ph() {
20745        let a = _mm_set1_ph(1.0);
20746        let b = _mm_set1_ph(2.0);
20747        let c = _mm_set1_ph(3.0);
20748        let r = _mm_fnmadd_ph(a, b, c);
20749        let e = _mm_set1_ph(1.0);
20750        assert_eq_m128h(r, e);
20751    }
20752
20753    #[simd_test(enable = "avx512fp16,avx512vl")]
20754    unsafe fn test_mm_mask_fnmadd_ph() {
20755        let a = _mm_set1_ph(1.0);
20756        let b = _mm_set1_ph(2.0);
20757        let c = _mm_set1_ph(3.0);
20758        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
20759        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
20760        assert_eq_m128h(r, e);
20761    }
20762
20763    #[simd_test(enable = "avx512fp16,avx512vl")]
20764    unsafe fn test_mm_mask3_fnmadd_ph() {
20765        let a = _mm_set1_ph(1.0);
20766        let b = _mm_set1_ph(2.0);
20767        let c = _mm_set1_ph(3.0);
20768        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
20769        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
20770        assert_eq_m128h(r, e);
20771    }
20772
20773    #[simd_test(enable = "avx512fp16,avx512vl")]
20774    unsafe fn test_mm_maskz_fnmadd_ph() {
20775        let a = _mm_set1_ph(1.0);
20776        let b = _mm_set1_ph(2.0);
20777        let c = _mm_set1_ph(3.0);
20778        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
20779        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
20780        assert_eq_m128h(r, e);
20781    }
20782
20783    #[simd_test(enable = "avx512fp16,avx512vl")]
20784    unsafe fn test_mm256_fnmadd_ph() {
20785        let a = _mm256_set1_ph(1.0);
20786        let b = _mm256_set1_ph(2.0);
20787        let c = _mm256_set1_ph(3.0);
20788        let r = _mm256_fnmadd_ph(a, b, c);
20789        let e = _mm256_set1_ph(1.0);
20790        assert_eq_m256h(r, e);
20791    }
20792
20793    #[simd_test(enable = "avx512fp16,avx512vl")]
20794    unsafe fn test_mm256_mask_fnmadd_ph() {
20795        let a = _mm256_set1_ph(1.0);
20796        let b = _mm256_set1_ph(2.0);
20797        let c = _mm256_set1_ph(3.0);
20798        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
20799        let e = _mm256_set_ph(
20800            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20801        );
20802        assert_eq_m256h(r, e);
20803    }
20804
20805    #[simd_test(enable = "avx512fp16,avx512vl")]
20806    unsafe fn test_mm256_mask3_fnmadd_ph() {
20807        let a = _mm256_set1_ph(1.0);
20808        let b = _mm256_set1_ph(2.0);
20809        let c = _mm256_set1_ph(3.0);
20810        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
20811        let e = _mm256_set_ph(
20812            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20813        );
20814        assert_eq_m256h(r, e);
20815    }
20816
20817    #[simd_test(enable = "avx512fp16,avx512vl")]
20818    unsafe fn test_mm256_maskz_fnmadd_ph() {
20819        let a = _mm256_set1_ph(1.0);
20820        let b = _mm256_set1_ph(2.0);
20821        let c = _mm256_set1_ph(3.0);
20822        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
20823        let e = _mm256_set_ph(
20824            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20825        );
20826        assert_eq_m256h(r, e);
20827    }
20828
20829    #[simd_test(enable = "avx512fp16")]
20830    unsafe fn test_mm512_fnmadd_ph() {
20831        let a = _mm512_set1_ph(1.0);
20832        let b = _mm512_set1_ph(2.0);
20833        let c = _mm512_set1_ph(3.0);
20834        let r = _mm512_fnmadd_ph(a, b, c);
20835        let e = _mm512_set1_ph(1.0);
20836        assert_eq_m512h(r, e);
20837    }
20838
20839    #[simd_test(enable = "avx512fp16")]
20840    unsafe fn test_mm512_mask_fnmadd_ph() {
20841        let a = _mm512_set1_ph(1.0);
20842        let b = _mm512_set1_ph(2.0);
20843        let c = _mm512_set1_ph(3.0);
20844        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20845        let e = _mm512_set_ph(
20846            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20847            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20848        );
20849        assert_eq_m512h(r, e);
20850    }
20851
20852    #[simd_test(enable = "avx512fp16")]
20853    unsafe fn test_mm512_mask3_fnmadd_ph() {
20854        let a = _mm512_set1_ph(1.0);
20855        let b = _mm512_set1_ph(2.0);
20856        let c = _mm512_set1_ph(3.0);
20857        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20858        let e = _mm512_set_ph(
20859            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20860            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20861        );
20862        assert_eq_m512h(r, e);
20863    }
20864
20865    #[simd_test(enable = "avx512fp16")]
20866    unsafe fn test_mm512_maskz_fnmadd_ph() {
20867        let a = _mm512_set1_ph(1.0);
20868        let b = _mm512_set1_ph(2.0);
20869        let c = _mm512_set1_ph(3.0);
20870        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
20871        let e = _mm512_set_ph(
20872            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20873            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20874        );
20875        assert_eq_m512h(r, e);
20876    }
20877
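    // The `_round` variants below pass _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
    // i.e. round-to-nearest-even with exception reporting suppressed (SAE), which gives
    // the same results as the non-`_round` forms under the default MXCSR rounding mode.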
20878    #[simd_test(enable = "avx512fp16")]
20879    unsafe fn test_mm512_fnmadd_round_ph() {
20880        let a = _mm512_set1_ph(1.0);
20881        let b = _mm512_set1_ph(2.0);
20882        let c = _mm512_set1_ph(3.0);
20883        let r =
20884            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20885        let e = _mm512_set1_ph(1.0);
20886        assert_eq_m512h(r, e);
20887    }
20888
20889    #[simd_test(enable = "avx512fp16")]
20890    unsafe fn test_mm512_mask_fnmadd_round_ph() {
20891        let a = _mm512_set1_ph(1.0);
20892        let b = _mm512_set1_ph(2.0);
20893        let c = _mm512_set1_ph(3.0);
20894        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20895            a,
20896            0b01010101010101010101010101010101,
20897            b,
20898            c,
20899        );
20900        let e = _mm512_set_ph(
20901            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20902            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20903        );
20904        assert_eq_m512h(r, e);
20905    }
20906
20907    #[simd_test(enable = "avx512fp16")]
20908    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
20909        let a = _mm512_set1_ph(1.0);
20910        let b = _mm512_set1_ph(2.0);
20911        let c = _mm512_set1_ph(3.0);
20912        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20913            a,
20914            b,
20915            c,
20916            0b01010101010101010101010101010101,
20917        );
20918        let e = _mm512_set_ph(
20919            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20920            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20921        );
20922        assert_eq_m512h(r, e);
20923    }
20924
20925    #[simd_test(enable = "avx512fp16")]
20926    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
20927        let a = _mm512_set1_ph(1.0);
20928        let b = _mm512_set1_ph(2.0);
20929        let c = _mm512_set1_ph(3.0);
20930        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20931            0b01010101010101010101010101010101,
20932            a,
20933            b,
20934            c,
20935        );
20936        let e = _mm512_set_ph(
20937            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20938            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20939        );
20940        assert_eq_m512h(r, e);
20941    }
20942
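    // Scalar `_sh` tests: only element 0 is computed; the upper seven elements of the
    // result are copied from `a` (from `c` for the `mask3` forms). The distinct filler
    // values (10..16, 20..26, 30..36) make any accidental lane mix-up visible.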
20943    #[simd_test(enable = "avx512fp16")]
20944    unsafe fn test_mm_fnmadd_sh() {
20945        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20946        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20947        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20948        let r = _mm_fnmadd_sh(a, b, c);
20949        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20950        assert_eq_m128h(r, e);
20951    }
20952
20953    #[simd_test(enable = "avx512fp16")]
20954    unsafe fn test_mm_mask_fnmadd_sh() {
20955        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20956        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20957        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20958        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
20959        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20960        assert_eq_m128h(r, e);
20961        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
20962        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20963        assert_eq_m128h(r, e);
20964    }
20965
20966    #[simd_test(enable = "avx512fp16")]
20967    unsafe fn test_mm_mask3_fnmadd_sh() {
20968        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20969        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20970        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20971        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
20972        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20973        assert_eq_m128h(r, e);
20974        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
20975        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
20976        assert_eq_m128h(r, e);
20977    }
20978
20979    #[simd_test(enable = "avx512fp16")]
20980    unsafe fn test_mm_maskz_fnmadd_sh() {
20981        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20982        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20983        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20984        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
20985        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20986        assert_eq_m128h(r, e);
20987        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
20988        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20989        assert_eq_m128h(r, e);
20990    }
20991
20992    #[simd_test(enable = "avx512fp16")]
20993    unsafe fn test_mm_fnmadd_round_sh() {
20994        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20995        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20996        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20997        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20998        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20999        assert_eq_m128h(r, e);
21000    }
21001
21002    #[simd_test(enable = "avx512fp16")]
21003    unsafe fn test_mm_mask_fnmadd_round_sh() {
21004        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21005        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21006        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21007        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21008            a, 0, b, c,
21009        );
21010        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21011        assert_eq_m128h(r, e);
21012        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21013            a, 1, b, c,
21014        );
21015        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21016        assert_eq_m128h(r, e);
21017    }
21018
21019    #[simd_test(enable = "avx512fp16")]
21020    unsafe fn test_mm_mask3_fnmadd_round_sh() {
21021        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21022        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21023        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21024        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21025            a, b, c, 0,
21026        );
21027        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21028        assert_eq_m128h(r, e);
21029        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21030            a, b, c, 1,
21031        );
21032        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21033        assert_eq_m128h(r, e);
21034    }
21035
21036    #[simd_test(enable = "avx512fp16")]
21037    unsafe fn test_mm_maskz_fnmadd_round_sh() {
21038        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21039        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21040        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21041        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21042            0, a, b, c,
21043        );
21044        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21045        assert_eq_m128h(r, e);
21046        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21047            1, a, b, c,
21048        );
21049        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21050        assert_eq_m128h(r, e);
21051    }
21052
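    // FNMSUB tests: each lane computes -(a * b) - c, so with a = 1, b = 2, c = 3 every
    // selected lane yields -(2) - 3 = -5. Mask handling mirrors the FNMADD tests above.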
21053    #[simd_test(enable = "avx512fp16,avx512vl")]
21054    unsafe fn test_mm_fnmsub_ph() {
21055        let a = _mm_set1_ph(1.0);
21056        let b = _mm_set1_ph(2.0);
21057        let c = _mm_set1_ph(3.0);
21058        let r = _mm_fnmsub_ph(a, b, c);
21059        let e = _mm_set1_ph(-5.0);
21060        assert_eq_m128h(r, e);
21061    }
21062
21063    #[simd_test(enable = "avx512fp16,avx512vl")]
21064    unsafe fn test_mm_mask_fnmsub_ph() {
21065        let a = _mm_set1_ph(1.0);
21066        let b = _mm_set1_ph(2.0);
21067        let c = _mm_set1_ph(3.0);
21068        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21069        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21070        assert_eq_m128h(r, e);
21071    }
21072
21073    #[simd_test(enable = "avx512fp16,avx512vl")]
21074    unsafe fn test_mm_mask3_fnmsub_ph() {
21075        let a = _mm_set1_ph(1.0);
21076        let b = _mm_set1_ph(2.0);
21077        let c = _mm_set1_ph(3.0);
21078        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21079        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21080        assert_eq_m128h(r, e);
21081    }
21082
21083    #[simd_test(enable = "avx512fp16,avx512vl")]
21084    unsafe fn test_mm_maskz_fnmsub_ph() {
21085        let a = _mm_set1_ph(1.0);
21086        let b = _mm_set1_ph(2.0);
21087        let c = _mm_set1_ph(3.0);
21088        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21089        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21090        assert_eq_m128h(r, e);
21091    }
21092
21093    #[simd_test(enable = "avx512fp16,avx512vl")]
21094    unsafe fn test_mm256_fnmsub_ph() {
21095        let a = _mm256_set1_ph(1.0);
21096        let b = _mm256_set1_ph(2.0);
21097        let c = _mm256_set1_ph(3.0);
21098        let r = _mm256_fnmsub_ph(a, b, c);
21099        let e = _mm256_set1_ph(-5.0);
21100        assert_eq_m256h(r, e);
21101    }
21102
21103    #[simd_test(enable = "avx512fp16,avx512vl")]
21104    unsafe fn test_mm256_mask_fnmsub_ph() {
21105        let a = _mm256_set1_ph(1.0);
21106        let b = _mm256_set1_ph(2.0);
21107        let c = _mm256_set1_ph(3.0);
21108        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21109        let e = _mm256_set_ph(
21110            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21111        );
21112        assert_eq_m256h(r, e);
21113    }
21114
21115    #[simd_test(enable = "avx512fp16,avx512vl")]
21116    unsafe fn test_mm256_mask3_fnmsub_ph() {
21117        let a = _mm256_set1_ph(1.0);
21118        let b = _mm256_set1_ph(2.0);
21119        let c = _mm256_set1_ph(3.0);
21120        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21121        let e = _mm256_set_ph(
21122            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21123        );
21124        assert_eq_m256h(r, e);
21125    }
21126
21127    #[simd_test(enable = "avx512fp16,avx512vl")]
21128    unsafe fn test_mm256_maskz_fnmsub_ph() {
21129        let a = _mm256_set1_ph(1.0);
21130        let b = _mm256_set1_ph(2.0);
21131        let c = _mm256_set1_ph(3.0);
21132        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21133        let e = _mm256_set_ph(
21134            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21135        );
21136        assert_eq_m256h(r, e);
21137    }
21138
21139    #[simd_test(enable = "avx512fp16")]
21140    unsafe fn test_mm512_fnmsub_ph() {
21141        let a = _mm512_set1_ph(1.0);
21142        let b = _mm512_set1_ph(2.0);
21143        let c = _mm512_set1_ph(3.0);
21144        let r = _mm512_fnmsub_ph(a, b, c);
21145        let e = _mm512_set1_ph(-5.0);
21146        assert_eq_m512h(r, e);
21147    }
21148
21149    #[simd_test(enable = "avx512fp16")]
21150    unsafe fn test_mm512_mask_fnmsub_ph() {
21151        let a = _mm512_set1_ph(1.0);
21152        let b = _mm512_set1_ph(2.0);
21153        let c = _mm512_set1_ph(3.0);
21154        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21155        let e = _mm512_set_ph(
21156            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21157            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21158        );
21159        assert_eq_m512h(r, e);
21160    }
21161
21162    #[simd_test(enable = "avx512fp16")]
21163    unsafe fn test_mm512_mask3_fnmsub_ph() {
21164        let a = _mm512_set1_ph(1.0);
21165        let b = _mm512_set1_ph(2.0);
21166        let c = _mm512_set1_ph(3.0);
21167        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21168        let e = _mm512_set_ph(
21169            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21170            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21171        );
21172        assert_eq_m512h(r, e);
21173    }
21174
21175    #[simd_test(enable = "avx512fp16")]
21176    unsafe fn test_mm512_maskz_fnmsub_ph() {
21177        let a = _mm512_set1_ph(1.0);
21178        let b = _mm512_set1_ph(2.0);
21179        let c = _mm512_set1_ph(3.0);
21180        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21181        let e = _mm512_set_ph(
21182            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21183            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21184        );
21185        assert_eq_m512h(r, e);
21186    }
21187
21188    #[simd_test(enable = "avx512fp16")]
21189    unsafe fn test_mm512_fnmsub_round_ph() {
21190        let a = _mm512_set1_ph(1.0);
21191        let b = _mm512_set1_ph(2.0);
21192        let c = _mm512_set1_ph(3.0);
21193        let r =
21194            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21195        let e = _mm512_set1_ph(-5.0);
21196        assert_eq_m512h(r, e);
21197    }
21198
21199    #[simd_test(enable = "avx512fp16")]
21200    unsafe fn test_mm512_mask_fnmsub_round_ph() {
21201        let a = _mm512_set1_ph(1.0);
21202        let b = _mm512_set1_ph(2.0);
21203        let c = _mm512_set1_ph(3.0);
21204        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21205            a,
21206            0b01010101010101010101010101010101,
21207            b,
21208            c,
21209        );
21210        let e = _mm512_set_ph(
21211            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21212            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21213        );
21214        assert_eq_m512h(r, e);
21215    }
21216
21217    #[simd_test(enable = "avx512fp16")]
21218    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
21219        let a = _mm512_set1_ph(1.0);
21220        let b = _mm512_set1_ph(2.0);
21221        let c = _mm512_set1_ph(3.0);
21222        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21223            a,
21224            b,
21225            c,
21226            0b01010101010101010101010101010101,
21227        );
21228        let e = _mm512_set_ph(
21229            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21230            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21231        );
21232        assert_eq_m512h(r, e);
21233    }
21234
21235    #[simd_test(enable = "avx512fp16")]
21236    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
21237        let a = _mm512_set1_ph(1.0);
21238        let b = _mm512_set1_ph(2.0);
21239        let c = _mm512_set1_ph(3.0);
21240        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21241            0b01010101010101010101010101010101,
21242            a,
21243            b,
21244            c,
21245        );
21246        let e = _mm512_set_ph(
21247            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21248            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21249        );
21250        assert_eq_m512h(r, e);
21251    }
21252
21253    #[simd_test(enable = "avx512fp16")]
21254    unsafe fn test_mm_fnmsub_sh() {
21255        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21256        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21257        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21258        let r = _mm_fnmsub_sh(a, b, c);
21259        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21260        assert_eq_m128h(r, e);
21261    }
21262
21263    #[simd_test(enable = "avx512fp16")]
21264    unsafe fn test_mm_mask_fnmsub_sh() {
21265        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21266        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21267        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21268        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
21269        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21270        assert_eq_m128h(r, e);
21271        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
21272        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21273        assert_eq_m128h(r, e);
21274    }
21275
21276    #[simd_test(enable = "avx512fp16")]
21277    unsafe fn test_mm_mask3_fnmsub_sh() {
21278        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21279        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21280        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21281        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21282        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21283        assert_eq_m128h(r, e);
21284        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21285        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21286        assert_eq_m128h(r, e);
21287    }
21288
21289    #[simd_test(enable = "avx512fp16")]
21290    unsafe fn test_mm_maskz_fnmsub_sh() {
21291        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21292        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21293        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21294        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21295        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21296        assert_eq_m128h(r, e);
21297        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21298        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21299        assert_eq_m128h(r, e);
21300    }
21301
21302    #[simd_test(enable = "avx512fp16")]
21303    unsafe fn test_mm_fnmsub_round_sh() {
21304        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21305        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21306        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21307        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21308        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21309        assert_eq_m128h(r, e);
21310    }
21311
21312    #[simd_test(enable = "avx512fp16")]
21313    unsafe fn test_mm_mask_fnmsub_round_sh() {
21314        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21315        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21316        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21317        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21318            a, 0, b, c,
21319        );
21320        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21321        assert_eq_m128h(r, e);
21322        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21323            a, 1, b, c,
21324        );
21325        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21326        assert_eq_m128h(r, e);
21327    }
21328
21329    #[simd_test(enable = "avx512fp16")]
21330    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21331        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21332        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21333        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21334        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21335            a, b, c, 0,
21336        );
21337        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21338        assert_eq_m128h(r, e);
21339        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21340            a, b, c, 1,
21341        );
21342        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21343        assert_eq_m128h(r, e);
21344    }
21345
21346    #[simd_test(enable = "avx512fp16")]
21347    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21348        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21349        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21350        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21351        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21352            0, a, b, c,
21353        );
21354        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21355        assert_eq_m128h(r, e);
21356        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21357            1, a, b, c,
21358        );
21359        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21360        assert_eq_m128h(r, e);
21361    }
21362
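    // FMADDSUB tests: odd-indexed lanes compute a * b + c and even-indexed lanes compute
    // a * b - c, so with a = 1, b = 2, c = 3 the lanes alternate between 5 and -1
    // (element 0 is -1). The 0b0011... masks exercise both phases of the pattern.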
21363    #[simd_test(enable = "avx512fp16,avx512vl")]
21364    unsafe fn test_mm_fmaddsub_ph() {
21365        let a = _mm_set1_ph(1.0);
21366        let b = _mm_set1_ph(2.0);
21367        let c = _mm_set1_ph(3.0);
21368        let r = _mm_fmaddsub_ph(a, b, c);
21369        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21370        assert_eq_m128h(r, e);
21371    }
21372
21373    #[simd_test(enable = "avx512fp16,avx512vl")]
21374    unsafe fn test_mm_mask_fmaddsub_ph() {
21375        let a = _mm_set1_ph(1.0);
21376        let b = _mm_set1_ph(2.0);
21377        let c = _mm_set1_ph(3.0);
21378        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21379        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21380        assert_eq_m128h(r, e);
21381    }
21382
21383    #[simd_test(enable = "avx512fp16,avx512vl")]
21384    unsafe fn test_mm_mask3_fmaddsub_ph() {
21385        let a = _mm_set1_ph(1.0);
21386        let b = _mm_set1_ph(2.0);
21387        let c = _mm_set1_ph(3.0);
21388        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21389        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21390        assert_eq_m128h(r, e);
21391    }
21392
21393    #[simd_test(enable = "avx512fp16,avx512vl")]
21394    unsafe fn test_mm_maskz_fmaddsub_ph() {
21395        let a = _mm_set1_ph(1.0);
21396        let b = _mm_set1_ph(2.0);
21397        let c = _mm_set1_ph(3.0);
21398        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21399        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21400        assert_eq_m128h(r, e);
21401    }
21402
21403    #[simd_test(enable = "avx512fp16,avx512vl")]
21404    unsafe fn test_mm256_fmaddsub_ph() {
21405        let a = _mm256_set1_ph(1.0);
21406        let b = _mm256_set1_ph(2.0);
21407        let c = _mm256_set1_ph(3.0);
21408        let r = _mm256_fmaddsub_ph(a, b, c);
21409        let e = _mm256_set_ph(
21410            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21411        );
21412        assert_eq_m256h(r, e);
21413    }
21414
21415    #[simd_test(enable = "avx512fp16,avx512vl")]
21416    unsafe fn test_mm256_mask_fmaddsub_ph() {
21417        let a = _mm256_set1_ph(1.0);
21418        let b = _mm256_set1_ph(2.0);
21419        let c = _mm256_set1_ph(3.0);
21420        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21421        let e = _mm256_set_ph(
21422            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21423        );
21424        assert_eq_m256h(r, e);
21425    }
21426
21427    #[simd_test(enable = "avx512fp16,avx512vl")]
21428    unsafe fn test_mm256_mask3_fmaddsub_ph() {
21429        let a = _mm256_set1_ph(1.0);
21430        let b = _mm256_set1_ph(2.0);
21431        let c = _mm256_set1_ph(3.0);
21432        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21433        let e = _mm256_set_ph(
21434            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21435        );
21436        assert_eq_m256h(r, e);
21437    }
21438
21439    #[simd_test(enable = "avx512fp16,avx512vl")]
21440    unsafe fn test_mm256_maskz_fmaddsub_ph() {
21441        let a = _mm256_set1_ph(1.0);
21442        let b = _mm256_set1_ph(2.0);
21443        let c = _mm256_set1_ph(3.0);
21444        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21445        let e = _mm256_set_ph(
21446            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21447        );
21448        assert_eq_m256h(r, e);
21449    }
21450
21451    #[simd_test(enable = "avx512fp16")]
21452    unsafe fn test_mm512_fmaddsub_ph() {
21453        let a = _mm512_set1_ph(1.0);
21454        let b = _mm512_set1_ph(2.0);
21455        let c = _mm512_set1_ph(3.0);
21456        let r = _mm512_fmaddsub_ph(a, b, c);
21457        let e = _mm512_set_ph(
21458            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21459            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21460        );
21461        assert_eq_m512h(r, e);
21462    }
21463
21464    #[simd_test(enable = "avx512fp16")]
21465    unsafe fn test_mm512_mask_fmaddsub_ph() {
21466        let a = _mm512_set1_ph(1.0);
21467        let b = _mm512_set1_ph(2.0);
21468        let c = _mm512_set1_ph(3.0);
21469        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21470        let e = _mm512_set_ph(
21471            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21472            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21473        );
21474        assert_eq_m512h(r, e);
21475    }
21476
21477    #[simd_test(enable = "avx512fp16")]
21478    unsafe fn test_mm512_mask3_fmaddsub_ph() {
21479        let a = _mm512_set1_ph(1.0);
21480        let b = _mm512_set1_ph(2.0);
21481        let c = _mm512_set1_ph(3.0);
21482        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21483        let e = _mm512_set_ph(
21484            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21485            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21486        );
21487        assert_eq_m512h(r, e);
21488    }
21489
21490    #[simd_test(enable = "avx512fp16")]
21491    unsafe fn test_mm512_maskz_fmaddsub_ph() {
21492        let a = _mm512_set1_ph(1.0);
21493        let b = _mm512_set1_ph(2.0);
21494        let c = _mm512_set1_ph(3.0);
21495        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21496        let e = _mm512_set_ph(
21497            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21498            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21499        );
21500        assert_eq_m512h(r, e);
21501    }
21502
21503    #[simd_test(enable = "avx512fp16")]
21504    unsafe fn test_mm512_fmaddsub_round_ph() {
21505        let a = _mm512_set1_ph(1.0);
21506        let b = _mm512_set1_ph(2.0);
21507        let c = _mm512_set1_ph(3.0);
21508        let r =
21509            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21510        let e = _mm512_set_ph(
21511            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21512            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21513        );
21514        assert_eq_m512h(r, e);
21515    }
21516
21517    #[simd_test(enable = "avx512fp16")]
21518    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21519        let a = _mm512_set1_ph(1.0);
21520        let b = _mm512_set1_ph(2.0);
21521        let c = _mm512_set1_ph(3.0);
21522        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21523            a,
21524            0b00110011001100110011001100110011,
21525            b,
21526            c,
21527        );
21528        let e = _mm512_set_ph(
21529            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21530            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21531        );
21532        assert_eq_m512h(r, e);
21533    }
21534
21535    #[simd_test(enable = "avx512fp16")]
21536    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21537        let a = _mm512_set1_ph(1.0);
21538        let b = _mm512_set1_ph(2.0);
21539        let c = _mm512_set1_ph(3.0);
21540        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21541            a,
21542            b,
21543            c,
21544            0b00110011001100110011001100110011,
21545        );
21546        let e = _mm512_set_ph(
21547            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21548            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21549        );
21550        assert_eq_m512h(r, e);
21551    }
21552
21553    #[simd_test(enable = "avx512fp16")]
21554    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21555        let a = _mm512_set1_ph(1.0);
21556        let b = _mm512_set1_ph(2.0);
21557        let c = _mm512_set1_ph(3.0);
21558        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21559            0b00110011001100110011001100110011,
21560            a,
21561            b,
21562            c,
21563        );
21564        let e = _mm512_set_ph(
21565            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21566            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21567        );
21568        assert_eq_m512h(r, e);
21569    }
21570
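    // FMSUBADD tests: the alternation is flipped relative to FMADDSUB, so even-indexed
    // lanes compute a * b + c (= 5) and odd-indexed lanes compute a * b - c (= -1).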
21571    #[simd_test(enable = "avx512fp16,avx512vl")]
21572    unsafe fn test_mm_fmsubadd_ph() {
21573        let a = _mm_set1_ph(1.0);
21574        let b = _mm_set1_ph(2.0);
21575        let c = _mm_set1_ph(3.0);
21576        let r = _mm_fmsubadd_ph(a, b, c);
21577        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21578        assert_eq_m128h(r, e);
21579    }
21580
21581    #[simd_test(enable = "avx512fp16,avx512vl")]
21582    unsafe fn test_mm_mask_fmsubadd_ph() {
21583        let a = _mm_set1_ph(1.0);
21584        let b = _mm_set1_ph(2.0);
21585        let c = _mm_set1_ph(3.0);
21586        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21587        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21588        assert_eq_m128h(r, e);
21589    }
21590
21591    #[simd_test(enable = "avx512fp16,avx512vl")]
21592    unsafe fn test_mm_mask3_fmsubadd_ph() {
21593        let a = _mm_set1_ph(1.0);
21594        let b = _mm_set1_ph(2.0);
21595        let c = _mm_set1_ph(3.0);
21596        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21597        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21598        assert_eq_m128h(r, e);
21599    }
21600
21601    #[simd_test(enable = "avx512fp16,avx512vl")]
21602    unsafe fn test_mm_maskz_fmsubadd_ph() {
21603        let a = _mm_set1_ph(1.0);
21604        let b = _mm_set1_ph(2.0);
21605        let c = _mm_set1_ph(3.0);
21606        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21607        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21608        assert_eq_m128h(r, e);
21609    }
21610
21611    #[simd_test(enable = "avx512fp16,avx512vl")]
21612    unsafe fn test_mm256_fmsubadd_ph() {
21613        let a = _mm256_set1_ph(1.0);
21614        let b = _mm256_set1_ph(2.0);
21615        let c = _mm256_set1_ph(3.0);
21616        let r = _mm256_fmsubadd_ph(a, b, c);
21617        let e = _mm256_set_ph(
21618            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21619        );
21620        assert_eq_m256h(r, e);
21621    }
21622
21623    #[simd_test(enable = "avx512fp16,avx512vl")]
21624    unsafe fn test_mm256_mask_fmsubadd_ph() {
21625        let a = _mm256_set1_ph(1.0);
21626        let b = _mm256_set1_ph(2.0);
21627        let c = _mm256_set1_ph(3.0);
21628        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21629        let e = _mm256_set_ph(
21630            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21631        );
21632        assert_eq_m256h(r, e);
21633    }
21634
21635    #[simd_test(enable = "avx512fp16,avx512vl")]
21636    unsafe fn test_mm256_mask3_fmsubadd_ph() {
21637        let a = _mm256_set1_ph(1.0);
21638        let b = _mm256_set1_ph(2.0);
21639        let c = _mm256_set1_ph(3.0);
21640        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21641        let e = _mm256_set_ph(
21642            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21643        );
21644        assert_eq_m256h(r, e);
21645    }
21646
21647    #[simd_test(enable = "avx512fp16,avx512vl")]
21648    unsafe fn test_mm256_maskz_fmsubadd_ph() {
21649        let a = _mm256_set1_ph(1.0);
21650        let b = _mm256_set1_ph(2.0);
21651        let c = _mm256_set1_ph(3.0);
21652        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21653        let e = _mm256_set_ph(
21654            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21655        );
21656        assert_eq_m256h(r, e);
21657    }
21658
21659    #[simd_test(enable = "avx512fp16")]
21660    unsafe fn test_mm512_fmsubadd_ph() {
21661        let a = _mm512_set1_ph(1.0);
21662        let b = _mm512_set1_ph(2.0);
21663        let c = _mm512_set1_ph(3.0);
21664        let r = _mm512_fmsubadd_ph(a, b, c);
21665        let e = _mm512_set_ph(
21666            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21667            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21668        );
21669        assert_eq_m512h(r, e);
21670    }
21671
21672    #[simd_test(enable = "avx512fp16")]
21673    unsafe fn test_mm512_mask_fmsubadd_ph() {
21674        let a = _mm512_set1_ph(1.0);
21675        let b = _mm512_set1_ph(2.0);
21676        let c = _mm512_set1_ph(3.0);
21677        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21678        let e = _mm512_set_ph(
21679            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21680            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21681        );
21682        assert_eq_m512h(r, e);
21683    }
21684
21685    #[simd_test(enable = "avx512fp16")]
21686    unsafe fn test_mm512_mask3_fmsubadd_ph() {
21687        let a = _mm512_set1_ph(1.0);
21688        let b = _mm512_set1_ph(2.0);
21689        let c = _mm512_set1_ph(3.0);
21690        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21691        let e = _mm512_set_ph(
21692            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21693            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21694        );
21695        assert_eq_m512h(r, e);
21696    }
21697
21698    #[simd_test(enable = "avx512fp16")]
21699    unsafe fn test_mm512_maskz_fmsubadd_ph() {
21700        let a = _mm512_set1_ph(1.0);
21701        let b = _mm512_set1_ph(2.0);
21702        let c = _mm512_set1_ph(3.0);
21703        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
21704        let e = _mm512_set_ph(
21705            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21706            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21707        );
21708        assert_eq_m512h(r, e);
21709    }
21710
21711    #[simd_test(enable = "avx512fp16")]
21712    unsafe fn test_mm512_fmsubadd_round_ph() {
21713        let a = _mm512_set1_ph(1.0);
21714        let b = _mm512_set1_ph(2.0);
21715        let c = _mm512_set1_ph(3.0);
21716        let r =
21717            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21718        let e = _mm512_set_ph(
21719            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21720            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21721        );
21722        assert_eq_m512h(r, e);
21723    }
21724
21725    #[simd_test(enable = "avx512fp16")]
21726    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
21727        let a = _mm512_set1_ph(1.0);
21728        let b = _mm512_set1_ph(2.0);
21729        let c = _mm512_set1_ph(3.0);
21730        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21731            a,
21732            0b00110011001100110011001100110011,
21733            b,
21734            c,
21735        );
21736        let e = _mm512_set_ph(
21737            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21738            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21739        );
21740        assert_eq_m512h(r, e);
21741    }
21742
21743    #[simd_test(enable = "avx512fp16")]
21744    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
21745        let a = _mm512_set1_ph(1.0);
21746        let b = _mm512_set1_ph(2.0);
21747        let c = _mm512_set1_ph(3.0);
21748        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21749            a,
21750            b,
21751            c,
21752            0b00110011001100110011001100110011,
21753        );
21754        let e = _mm512_set_ph(
21755            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21756            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21757        );
21758        assert_eq_m512h(r, e);
21759    }
21760
21761    #[simd_test(enable = "avx512fp16")]
21762    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
21763        let a = _mm512_set1_ph(1.0);
21764        let b = _mm512_set1_ph(2.0);
21765        let c = _mm512_set1_ph(3.0);
21766        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21767            0b00110011001100110011001100110011,
21768            a,
21769            b,
21770            c,
21771        );
21772        let e = _mm512_set_ph(
21773            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21774            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21775        );
21776        assert_eq_m512h(r, e);
21777    }
21778
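    // RCP tests: approximate reciprocal of each lane. For the power-of-two input used
    // here the approximation is exact, so 1 / 2.0 compares equal to 0.5.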
21779    #[simd_test(enable = "avx512fp16,avx512vl")]
21780    unsafe fn test_mm_rcp_ph() {
21781        let a = _mm_set1_ph(2.0);
21782        let r = _mm_rcp_ph(a);
21783        let e = _mm_set1_ph(0.5);
21784        assert_eq_m128h(r, e);
21785    }
21786
21787    #[simd_test(enable = "avx512fp16,avx512vl")]
21788    unsafe fn test_mm_mask_rcp_ph() {
21789        let a = _mm_set1_ph(2.0);
21790        let src = _mm_set1_ph(1.0);
21791        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
21792        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21793        assert_eq_m128h(r, e);
21794    }
21795
21796    #[simd_test(enable = "avx512fp16,avx512vl")]
21797    unsafe fn test_mm_maskz_rcp_ph() {
21798        let a = _mm_set1_ph(2.0);
21799        let r = _mm_maskz_rcp_ph(0b01010101, a);
21800        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21801        assert_eq_m128h(r, e);
21802    }
21803
21804    #[simd_test(enable = "avx512fp16,avx512vl")]
21805    unsafe fn test_mm256_rcp_ph() {
21806        let a = _mm256_set1_ph(2.0);
21807        let r = _mm256_rcp_ph(a);
21808        let e = _mm256_set1_ph(0.5);
21809        assert_eq_m256h(r, e);
21810    }
21811
21812    #[simd_test(enable = "avx512fp16,avx512vl")]
21813    unsafe fn test_mm256_mask_rcp_ph() {
21814        let a = _mm256_set1_ph(2.0);
21815        let src = _mm256_set1_ph(1.0);
21816        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
21817        let e = _mm256_set_ph(
21818            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21819        );
21820        assert_eq_m256h(r, e);
21821    }
21822
21823    #[simd_test(enable = "avx512fp16,avx512vl")]
21824    unsafe fn test_mm256_maskz_rcp_ph() {
21825        let a = _mm256_set1_ph(2.0);
21826        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
21827        let e = _mm256_set_ph(
21828            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21829        );
21830        assert_eq_m256h(r, e);
21831    }
21832
21833    #[simd_test(enable = "avx512fp16")]
21834    unsafe fn test_mm512_rcp_ph() {
21835        let a = _mm512_set1_ph(2.0);
21836        let r = _mm512_rcp_ph(a);
21837        let e = _mm512_set1_ph(0.5);
21838        assert_eq_m512h(r, e);
21839    }
21840
21841    #[simd_test(enable = "avx512fp16")]
21842    unsafe fn test_mm512_mask_rcp_ph() {
21843        let a = _mm512_set1_ph(2.0);
21844        let src = _mm512_set1_ph(1.0);
21845        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
21846        let e = _mm512_set_ph(
21847            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21848            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21849        );
21850        assert_eq_m512h(r, e);
21851    }
21852
21853    #[simd_test(enable = "avx512fp16")]
21854    unsafe fn test_mm512_maskz_rcp_ph() {
21855        let a = _mm512_set1_ph(2.0);
21856        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
21857        let e = _mm512_set_ph(
21858            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21859            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21860        );
21861        assert_eq_m512h(r, e);
21862    }
21863
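    // The scalar `rcp_sh`/`rsqrt_sh`/`sqrt_sh` forms take two vectors: the operation is
    // applied to element 0 of `b`, and the upper seven elements are copied from `a`
    // (the masked forms blend element 0 with `src` or with zero).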
21864    #[simd_test(enable = "avx512fp16")]
21865    unsafe fn test_mm_rcp_sh() {
21866        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21867        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21868        let r = _mm_rcp_sh(a, b);
21869        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21870        assert_eq_m128h(r, e);
21871    }
21872
21873    #[simd_test(enable = "avx512fp16")]
21874    unsafe fn test_mm_mask_rcp_sh() {
21875        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21876        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21877        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21878        let r = _mm_mask_rcp_sh(src, 0, a, b);
21879        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21880        assert_eq_m128h(r, e);
21881        let r = _mm_mask_rcp_sh(src, 1, a, b);
21882        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21883        assert_eq_m128h(r, e);
21884    }
21885
21886    #[simd_test(enable = "avx512fp16")]
21887    unsafe fn test_mm_maskz_rcp_sh() {
21888        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21889        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21890        let r = _mm_maskz_rcp_sh(0, a, b);
21891        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21892        assert_eq_m128h(r, e);
21893        let r = _mm_maskz_rcp_sh(1, a, b);
21894        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21895        assert_eq_m128h(r, e);
21896    }
21897
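    // RSQRT tests: approximate reciprocal square root; 1 / sqrt(4.0) is exactly 0.5.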
21898    #[simd_test(enable = "avx512fp16,avx512vl")]
21899    unsafe fn test_mm_rsqrt_ph() {
21900        let a = _mm_set1_ph(4.0);
21901        let r = _mm_rsqrt_ph(a);
21902        let e = _mm_set1_ph(0.5);
21903        assert_eq_m128h(r, e);
21904    }
21905
21906    #[simd_test(enable = "avx512fp16,avx512vl")]
21907    unsafe fn test_mm_mask_rsqrt_ph() {
21908        let a = _mm_set1_ph(4.0);
21909        let src = _mm_set1_ph(1.0);
21910        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
21911        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21912        assert_eq_m128h(r, e);
21913    }
21914
21915    #[simd_test(enable = "avx512fp16,avx512vl")]
21916    unsafe fn test_mm_maskz_rsqrt_ph() {
21917        let a = _mm_set1_ph(4.0);
21918        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
21919        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21920        assert_eq_m128h(r, e);
21921    }
21922
21923    #[simd_test(enable = "avx512fp16,avx512vl")]
21924    unsafe fn test_mm256_rsqrt_ph() {
21925        let a = _mm256_set1_ph(4.0);
21926        let r = _mm256_rsqrt_ph(a);
21927        let e = _mm256_set1_ph(0.5);
21928        assert_eq_m256h(r, e);
21929    }
21930
21931    #[simd_test(enable = "avx512fp16,avx512vl")]
21932    unsafe fn test_mm256_mask_rsqrt_ph() {
21933        let a = _mm256_set1_ph(4.0);
21934        let src = _mm256_set1_ph(1.0);
21935        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
21936        let e = _mm256_set_ph(
21937            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21938        );
21939        assert_eq_m256h(r, e);
21940    }
21941
21942    #[simd_test(enable = "avx512fp16,avx512vl")]
21943    unsafe fn test_mm256_maskz_rsqrt_ph() {
21944        let a = _mm256_set1_ph(4.0);
21945        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
21946        let e = _mm256_set_ph(
21947            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21948        );
21949        assert_eq_m256h(r, e);
21950    }
21951
21952    #[simd_test(enable = "avx512fp16")]
21953    unsafe fn test_mm512_rsqrt_ph() {
21954        let a = _mm512_set1_ph(4.0);
21955        let r = _mm512_rsqrt_ph(a);
21956        let e = _mm512_set1_ph(0.5);
21957        assert_eq_m512h(r, e);
21958    }
21959
21960    #[simd_test(enable = "avx512fp16")]
21961    unsafe fn test_mm512_mask_rsqrt_ph() {
21962        let a = _mm512_set1_ph(4.0);
21963        let src = _mm512_set1_ph(1.0);
21964        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
21965        let e = _mm512_set_ph(
21966            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21967            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21968        );
21969        assert_eq_m512h(r, e);
21970    }
21971
21972    #[simd_test(enable = "avx512fp16")]
21973    unsafe fn test_mm512_maskz_rsqrt_ph() {
21974        let a = _mm512_set1_ph(4.0);
21975        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
21976        let e = _mm512_set_ph(
21977            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21978            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21979        );
21980        assert_eq_m512h(r, e);
21981    }
21982
21983    #[simd_test(enable = "avx512fp16")]
21984    unsafe fn test_mm_rsqrt_sh() {
21985        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21986        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21987        let r = _mm_rsqrt_sh(a, b);
21988        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21989        assert_eq_m128h(r, e);
21990    }
21991
21992    #[simd_test(enable = "avx512fp16")]
21993    unsafe fn test_mm_mask_rsqrt_sh() {
21994        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21995        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21996        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21997        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
21998        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21999        assert_eq_m128h(r, e);
22000        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
22001        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22002        assert_eq_m128h(r, e);
22003    }
22004
22005    #[simd_test(enable = "avx512fp16")]
22006    unsafe fn test_mm_maskz_rsqrt_sh() {
22007        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22008        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22009        let r = _mm_maskz_rsqrt_sh(0, a, b);
22010        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22011        assert_eq_m128h(r, e);
22012        let r = _mm_maskz_rsqrt_sh(1, a, b);
22013        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22014        assert_eq_m128h(r, e);
22015    }
22016
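    // SQRT tests: full-precision square root (not an approximation); sqrt(4.0) == 2.0.
    // The 512-bit `_round` forms further down again use round-to-nearest with SAE.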
22017    #[simd_test(enable = "avx512fp16,avx512vl")]
22018    unsafe fn test_mm_sqrt_ph() {
22019        let a = _mm_set1_ph(4.0);
22020        let r = _mm_sqrt_ph(a);
22021        let e = _mm_set1_ph(2.0);
22022        assert_eq_m128h(r, e);
22023    }
22024
22025    #[simd_test(enable = "avx512fp16,avx512vl")]
22026    unsafe fn test_mm_mask_sqrt_ph() {
22027        let a = _mm_set1_ph(4.0);
22028        let src = _mm_set1_ph(1.0);
22029        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22030        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22031        assert_eq_m128h(r, e);
22032    }
22033
22034    #[simd_test(enable = "avx512fp16,avx512vl")]
22035    unsafe fn test_mm_maskz_sqrt_ph() {
22036        let a = _mm_set1_ph(4.0);
22037        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22038        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22039        assert_eq_m128h(r, e);
22040    }
22041
22042    #[simd_test(enable = "avx512fp16,avx512vl")]
22043    unsafe fn test_mm256_sqrt_ph() {
22044        let a = _mm256_set1_ph(4.0);
22045        let r = _mm256_sqrt_ph(a);
22046        let e = _mm256_set1_ph(2.0);
22047        assert_eq_m256h(r, e);
22048    }
22049
22050    #[simd_test(enable = "avx512fp16,avx512vl")]
22051    unsafe fn test_mm256_mask_sqrt_ph() {
22052        let a = _mm256_set1_ph(4.0);
22053        let src = _mm256_set1_ph(1.0);
22054        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22055        let e = _mm256_set_ph(
22056            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22057        );
22058        assert_eq_m256h(r, e);
22059    }
22060
22061    #[simd_test(enable = "avx512fp16,avx512vl")]
22062    unsafe fn test_mm256_maskz_sqrt_ph() {
22063        let a = _mm256_set1_ph(4.0);
22064        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22065        let e = _mm256_set_ph(
22066            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22067        );
22068        assert_eq_m256h(r, e);
22069    }
22070
22071    #[simd_test(enable = "avx512fp16")]
22072    unsafe fn test_mm512_sqrt_ph() {
22073        let a = _mm512_set1_ph(4.0);
22074        let r = _mm512_sqrt_ph(a);
22075        let e = _mm512_set1_ph(2.0);
22076        assert_eq_m512h(r, e);
22077    }
22078
22079    #[simd_test(enable = "avx512fp16")]
22080    unsafe fn test_mm512_mask_sqrt_ph() {
22081        let a = _mm512_set1_ph(4.0);
22082        let src = _mm512_set1_ph(1.0);
22083        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22084        let e = _mm512_set_ph(
22085            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22086            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22087        );
22088        assert_eq_m512h(r, e);
22089    }
22090
22091    #[simd_test(enable = "avx512fp16")]
22092    unsafe fn test_mm512_maskz_sqrt_ph() {
22093        let a = _mm512_set1_ph(4.0);
22094        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22095        let e = _mm512_set_ph(
22096            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22097            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22098        );
22099        assert_eq_m512h(r, e);
22100    }
22101
22102    #[simd_test(enable = "avx512fp16")]
22103    unsafe fn test_mm512_sqrt_round_ph() {
22104        let a = _mm512_set1_ph(4.0);
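        // The const generic selects the rounding behaviour: _MM_FROUND_TO_NEAREST_INT requests
        // round-to-nearest-even and _MM_FROUND_NO_EXC suppresses floating-point exceptions (SAE).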
22105        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22106        let e = _mm512_set1_ph(2.0);
22107        assert_eq_m512h(r, e);
22108    }
22109
22110    #[simd_test(enable = "avx512fp16")]
22111    unsafe fn test_mm512_mask_sqrt_round_ph() {
22112        let a = _mm512_set1_ph(4.0);
22113        let src = _mm512_set1_ph(1.0);
22114        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22115            src,
22116            0b01010101010101010101010101010101,
22117            a,
22118        );
22119        let e = _mm512_set_ph(
22120            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22121            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22122        );
22123        assert_eq_m512h(r, e);
22124    }
22125
22126    #[simd_test(enable = "avx512fp16")]
22127    unsafe fn test_mm512_maskz_sqrt_round_ph() {
22128        let a = _mm512_set1_ph(4.0);
22129        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22130            0b01010101010101010101010101010101,
22131            a,
22132        );
22133        let e = _mm512_set_ph(
22134            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22135            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22136        );
22137        assert_eq_m512h(r, e);
22138    }
22139
22140    #[simd_test(enable = "avx512fp16")]
22141    unsafe fn test_mm_sqrt_sh() {
22142        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22143        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
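        // Scalar form: lane 0 of the result is sqrt(b[0]); lanes 1..7 are copied from `a`.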
22144        let r = _mm_sqrt_sh(a, b);
22145        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22146        assert_eq_m128h(r, e);
22147    }
22148
22149    #[simd_test(enable = "avx512fp16")]
22150    unsafe fn test_mm_mask_sqrt_sh() {
22151        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22152        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22153        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
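        // For the masked scalar form only mask bit 0 matters: with the bit clear lane 0 comes
        // from `src` (or is zeroed in the maskz variant), with it set lane 0 is sqrt(b[0]).
        // The upper lanes are always copied from `a`.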
22154        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22155        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22156        assert_eq_m128h(r, e);
22157        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22158        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22159        assert_eq_m128h(r, e);
22160    }
22161
22162    #[simd_test(enable = "avx512fp16")]
22163    unsafe fn test_mm_maskz_sqrt_sh() {
22164        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22165        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22166        let r = _mm_maskz_sqrt_sh(0, a, b);
22167        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22168        assert_eq_m128h(r, e);
22169        let r = _mm_maskz_sqrt_sh(1, a, b);
22170        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22171        assert_eq_m128h(r, e);
22172    }
22173
22174    #[simd_test(enable = "avx512fp16")]
22175    unsafe fn test_mm_sqrt_round_sh() {
22176        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22177        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22178        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22179        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22180        assert_eq_m128h(r, e);
22181    }
22182
22183    #[simd_test(enable = "avx512fp16")]
22184    unsafe fn test_mm_mask_sqrt_round_sh() {
22185        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22186        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22187        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22188        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22189            src, 0, a, b,
22190        );
22191        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22192        assert_eq_m128h(r, e);
22193        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22194            src, 1, a, b,
22195        );
22196        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22197        assert_eq_m128h(r, e);
22198    }
22199
22200    #[simd_test(enable = "avx512fp16")]
22201    unsafe fn test_mm_maskz_sqrt_round_sh() {
22202        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22203        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22204        let r =
22205            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22206        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22207        assert_eq_m128h(r, e);
22208        let r =
22209            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22210        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22211        assert_eq_m128h(r, e);
22212    }
22213
22214    #[simd_test(enable = "avx512fp16,avx512vl")]
22215    unsafe fn test_mm_max_ph() {
22216        let a = _mm_set1_ph(2.0);
22217        let b = _mm_set1_ph(1.0);
22218        let r = _mm_max_ph(a, b);
22219        let e = _mm_set1_ph(2.0);
22220        assert_eq_m128h(r, e);
22221    }
22222
22223    #[simd_test(enable = "avx512fp16,avx512vl")]
22224    unsafe fn test_mm_mask_max_ph() {
22225        let a = _mm_set1_ph(2.0);
22226        let b = _mm_set1_ph(1.0);
22227        let src = _mm_set1_ph(3.0);
22228        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22229        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22230        assert_eq_m128h(r, e);
22231    }
22232
22233    #[simd_test(enable = "avx512fp16,avx512vl")]
22234    unsafe fn test_mm_maskz_max_ph() {
22235        let a = _mm_set1_ph(2.0);
22236        let b = _mm_set1_ph(1.0);
22237        let r = _mm_maskz_max_ph(0b01010101, a, b);
22238        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22239        assert_eq_m128h(r, e);
22240    }
22241
22242    #[simd_test(enable = "avx512fp16,avx512vl")]
22243    unsafe fn test_mm256_max_ph() {
22244        let a = _mm256_set1_ph(2.0);
22245        let b = _mm256_set1_ph(1.0);
22246        let r = _mm256_max_ph(a, b);
22247        let e = _mm256_set1_ph(2.0);
22248        assert_eq_m256h(r, e);
22249    }
22250
22251    #[simd_test(enable = "avx512fp16,avx512vl")]
22252    unsafe fn test_mm256_mask_max_ph() {
22253        let a = _mm256_set1_ph(2.0);
22254        let b = _mm256_set1_ph(1.0);
22255        let src = _mm256_set1_ph(3.0);
22256        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22257        let e = _mm256_set_ph(
22258            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22259        );
22260        assert_eq_m256h(r, e);
22261    }
22262
22263    #[simd_test(enable = "avx512fp16,avx512vl")]
22264    unsafe fn test_mm256_maskz_max_ph() {
22265        let a = _mm256_set1_ph(2.0);
22266        let b = _mm256_set1_ph(1.0);
22267        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22268        let e = _mm256_set_ph(
22269            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22270        );
22271        assert_eq_m256h(r, e);
22272    }
22273
22274    #[simd_test(enable = "avx512fp16")]
22275    unsafe fn test_mm512_max_ph() {
22276        let a = _mm512_set1_ph(2.0);
22277        let b = _mm512_set1_ph(1.0);
22278        let r = _mm512_max_ph(a, b);
22279        let e = _mm512_set1_ph(2.0);
22280        assert_eq_m512h(r, e);
22281    }
22282
22283    #[simd_test(enable = "avx512fp16")]
22284    unsafe fn test_mm512_mask_max_ph() {
22285        let a = _mm512_set1_ph(2.0);
22286        let b = _mm512_set1_ph(1.0);
22287        let src = _mm512_set1_ph(3.0);
22288        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22289        let e = _mm512_set_ph(
22290            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22291            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22292        );
22293        assert_eq_m512h(r, e);
22294    }
22295
22296    #[simd_test(enable = "avx512fp16")]
22297    unsafe fn test_mm512_maskz_max_ph() {
22298        let a = _mm512_set1_ph(2.0);
22299        let b = _mm512_set1_ph(1.0);
22300        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22301        let e = _mm512_set_ph(
22302            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22303            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22304        );
22305        assert_eq_m512h(r, e);
22306    }
22307
22308    #[simd_test(enable = "avx512fp16")]
22309    unsafe fn test_mm512_max_round_ph() {
22310        let a = _mm512_set1_ph(2.0);
22311        let b = _mm512_set1_ph(1.0);
22312        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22313        let e = _mm512_set1_ph(2.0);
22314        assert_eq_m512h(r, e);
22315    }
22316
22317    #[simd_test(enable = "avx512fp16")]
22318    unsafe fn test_mm512_mask_max_round_ph() {
22319        let a = _mm512_set1_ph(2.0);
22320        let b = _mm512_set1_ph(1.0);
22321        let src = _mm512_set1_ph(3.0);
22322        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22323            src,
22324            0b01010101010101010101010101010101,
22325            a,
22326            b,
22327        );
22328        let e = _mm512_set_ph(
22329            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22330            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22331        );
22332        assert_eq_m512h(r, e);
22333    }
22334
22335    #[simd_test(enable = "avx512fp16")]
22336    unsafe fn test_mm512_maskz_max_round_ph() {
22337        let a = _mm512_set1_ph(2.0);
22338        let b = _mm512_set1_ph(1.0);
22339        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22340            0b01010101010101010101010101010101,
22341            a,
22342            b,
22343        );
22344        let e = _mm512_set_ph(
22345            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22346            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22347        );
22348        assert_eq_m512h(r, e);
22349    }
22350
22351    #[simd_test(enable = "avx512fp16")]
22352    unsafe fn test_mm_max_sh() {
22353        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22354        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
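        // Scalar max: lane 0 is max(a[0], b[0]) = 2.0, the upper lanes are copied from `a`.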
22355        let r = _mm_max_sh(a, b);
22356        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22357        assert_eq_m128h(r, e);
22358    }
22359
22360    #[simd_test(enable = "avx512fp16")]
22361    unsafe fn test_mm_mask_max_sh() {
22362        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22363        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22364        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22365        let r = _mm_mask_max_sh(src, 0, a, b);
22366        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22367        assert_eq_m128h(r, e);
22368        let r = _mm_mask_max_sh(src, 1, a, b);
22369        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22370        assert_eq_m128h(r, e);
22371    }
22372
22373    #[simd_test(enable = "avx512fp16")]
22374    unsafe fn test_mm_maskz_max_sh() {
22375        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22376        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22377        let r = _mm_maskz_max_sh(0, a, b);
22378        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22379        assert_eq_m128h(r, e);
22380        let r = _mm_maskz_max_sh(1, a, b);
22381        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22382        assert_eq_m128h(r, e);
22383    }
22384
22385    #[simd_test(enable = "avx512fp16")]
22386    unsafe fn test_mm_max_round_sh() {
22387        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22388        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22389        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22390        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22391        assert_eq_m128h(r, e);
22392    }
22393
22394    #[simd_test(enable = "avx512fp16")]
22395    unsafe fn test_mm_mask_max_round_sh() {
22396        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22397        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22398        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22399        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22400            src, 0, a, b,
22401        );
22402        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22403        assert_eq_m128h(r, e);
22404        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22405            src, 1, a, b,
22406        );
22407        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22408        assert_eq_m128h(r, e);
22409    }
22410
22411    #[simd_test(enable = "avx512fp16")]
22412    unsafe fn test_mm_maskz_max_round_sh() {
22413        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22414        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22415        let r =
22416            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22417        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22418        assert_eq_m128h(r, e);
22419        let r =
22420            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22421        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22422        assert_eq_m128h(r, e);
22423    }
22424
22425    #[simd_test(enable = "avx512fp16,avx512vl")]
22426    unsafe fn test_mm_min_ph() {
22427        let a = _mm_set1_ph(2.0);
22428        let b = _mm_set1_ph(1.0);
22429        let r = _mm_min_ph(a, b);
22430        let e = _mm_set1_ph(1.0);
22431        assert_eq_m128h(r, e);
22432    }
22433
22434    #[simd_test(enable = "avx512fp16,avx512vl")]
22435    unsafe fn test_mm_mask_min_ph() {
22436        let a = _mm_set1_ph(2.0);
22437        let b = _mm_set1_ph(1.0);
22438        let src = _mm_set1_ph(3.0);
22439        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22440        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22441        assert_eq_m128h(r, e);
22442    }
22443
22444    #[simd_test(enable = "avx512fp16,avx512vl")]
22445    unsafe fn test_mm_maskz_min_ph() {
22446        let a = _mm_set1_ph(2.0);
22447        let b = _mm_set1_ph(1.0);
22448        let r = _mm_maskz_min_ph(0b01010101, a, b);
22449        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22450        assert_eq_m128h(r, e);
22451    }
22452
22453    #[simd_test(enable = "avx512fp16,avx512vl")]
22454    unsafe fn test_mm256_min_ph() {
22455        let a = _mm256_set1_ph(2.0);
22456        let b = _mm256_set1_ph(1.0);
22457        let r = _mm256_min_ph(a, b);
22458        let e = _mm256_set1_ph(1.0);
22459        assert_eq_m256h(r, e);
22460    }
22461
22462    #[simd_test(enable = "avx512fp16,avx512vl")]
22463    unsafe fn test_mm256_mask_min_ph() {
22464        let a = _mm256_set1_ph(2.0);
22465        let b = _mm256_set1_ph(1.0);
22466        let src = _mm256_set1_ph(3.0);
22467        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22468        let e = _mm256_set_ph(
22469            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22470        );
22471        assert_eq_m256h(r, e);
22472    }
22473
22474    #[simd_test(enable = "avx512fp16,avx512vl")]
22475    unsafe fn test_mm256_maskz_min_ph() {
22476        let a = _mm256_set1_ph(2.0);
22477        let b = _mm256_set1_ph(1.0);
22478        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22479        let e = _mm256_set_ph(
22480            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22481        );
22482        assert_eq_m256h(r, e);
22483    }
22484
22485    #[simd_test(enable = "avx512fp16")]
22486    unsafe fn test_mm512_min_ph() {
22487        let a = _mm512_set1_ph(2.0);
22488        let b = _mm512_set1_ph(1.0);
22489        let r = _mm512_min_ph(a, b);
22490        let e = _mm512_set1_ph(1.0);
22491        assert_eq_m512h(r, e);
22492    }
22493
22494    #[simd_test(enable = "avx512fp16")]
22495    unsafe fn test_mm512_mask_min_ph() {
22496        let a = _mm512_set1_ph(2.0);
22497        let b = _mm512_set1_ph(1.0);
22498        let src = _mm512_set1_ph(3.0);
22499        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22500        let e = _mm512_set_ph(
22501            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22502            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22503        );
22504        assert_eq_m512h(r, e);
22505    }
22506
22507    #[simd_test(enable = "avx512fp16")]
22508    unsafe fn test_mm512_maskz_min_ph() {
22509        let a = _mm512_set1_ph(2.0);
22510        let b = _mm512_set1_ph(1.0);
22511        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22512        let e = _mm512_set_ph(
22513            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22514            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22515        );
22516        assert_eq_m512h(r, e);
22517    }
22518
22519    #[simd_test(enable = "avx512fp16")]
22520    unsafe fn test_mm512_min_round_ph() {
22521        let a = _mm512_set1_ph(2.0);
22522        let b = _mm512_set1_ph(1.0);
22523        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22524        let e = _mm512_set1_ph(1.0);
22525        assert_eq_m512h(r, e);
22526    }
22527
22528    #[simd_test(enable = "avx512fp16")]
22529    unsafe fn test_mm512_mask_min_round_ph() {
22530        let a = _mm512_set1_ph(2.0);
22531        let b = _mm512_set1_ph(1.0);
22532        let src = _mm512_set1_ph(3.0);
22533        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22534            src,
22535            0b01010101010101010101010101010101,
22536            a,
22537            b,
22538        );
22539        let e = _mm512_set_ph(
22540            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22541            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22542        );
22543        assert_eq_m512h(r, e);
22544    }
22545
22546    #[simd_test(enable = "avx512fp16")]
22547    unsafe fn test_mm512_maskz_min_round_ph() {
22548        let a = _mm512_set1_ph(2.0);
22549        let b = _mm512_set1_ph(1.0);
22550        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22551            0b01010101010101010101010101010101,
22552            a,
22553            b,
22554        );
22555        let e = _mm512_set_ph(
22556            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22557            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22558        );
22559        assert_eq_m512h(r, e);
22560    }
22561
22562    #[simd_test(enable = "avx512fp16")]
22563    unsafe fn test_mm_min_sh() {
22564        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22565        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22566        let r = _mm_min_sh(a, b);
22567        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22568        assert_eq_m128h(r, e);
22569    }
22570
22571    #[simd_test(enable = "avx512fp16")]
22572    unsafe fn test_mm_mask_min_sh() {
22573        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22574        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22575        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22576        let r = _mm_mask_min_sh(src, 0, a, b);
22577        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22578        assert_eq_m128h(r, e);
22579        let r = _mm_mask_min_sh(src, 1, a, b);
22580        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22581        assert_eq_m128h(r, e);
22582    }
22583
22584    #[simd_test(enable = "avx512fp16")]
22585    unsafe fn test_mm_maskz_min_sh() {
22586        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22587        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22588        let r = _mm_maskz_min_sh(0, a, b);
22589        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22590        assert_eq_m128h(r, e);
22591        let r = _mm_maskz_min_sh(1, a, b);
22592        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22593        assert_eq_m128h(r, e);
22594    }
22595
22596    #[simd_test(enable = "avx512fp16")]
22597    unsafe fn test_mm_min_round_sh() {
22598        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22599        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22600        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22601        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22602        assert_eq_m128h(r, e);
22603    }
22604
22605    #[simd_test(enable = "avx512fp16")]
22606    unsafe fn test_mm_mask_min_round_sh() {
22607        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22608        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22609        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22610        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22611            src, 0, a, b,
22612        );
22613        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22614        assert_eq_m128h(r, e);
22615        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22616            src, 1, a, b,
22617        );
22618        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22619        assert_eq_m128h(r, e);
22620    }
22621
22622    #[simd_test(enable = "avx512fp16")]
22623    unsafe fn test_mm_maskz_min_round_sh() {
22624        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22625        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22626        let r =
22627            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22628        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22629        assert_eq_m128h(r, e);
22630        let r =
22631            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22632        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22633        assert_eq_m128h(r, e);
22634    }
22635
22636    #[simd_test(enable = "avx512fp16,avx512vl")]
22637    unsafe fn test_mm_getexp_ph() {
22638        let a = _mm_set1_ph(3.0);
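        // getexp extracts the unbiased exponent as a float, i.e. floor(log2(|x|)):
        // 3.0 = 1.5 * 2^1, so every lane yields 1.0.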
22639        let r = _mm_getexp_ph(a);
22640        let e = _mm_set1_ph(1.0);
22641        assert_eq_m128h(r, e);
22642    }
22643
22644    #[simd_test(enable = "avx512fp16,avx512vl")]
22645    unsafe fn test_mm_mask_getexp_ph() {
22646        let a = _mm_set1_ph(3.0);
22647        let src = _mm_set1_ph(4.0);
22648        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22649        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22650        assert_eq_m128h(r, e);
22651    }
22652
22653    #[simd_test(enable = "avx512fp16,avx512vl")]
22654    unsafe fn test_mm_maskz_getexp_ph() {
22655        let a = _mm_set1_ph(3.0);
22656        let r = _mm_maskz_getexp_ph(0b01010101, a);
22657        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22658        assert_eq_m128h(r, e);
22659    }
22660
22661    #[simd_test(enable = "avx512fp16,avx512vl")]
22662    unsafe fn test_mm256_getexp_ph() {
22663        let a = _mm256_set1_ph(3.0);
22664        let r = _mm256_getexp_ph(a);
22665        let e = _mm256_set1_ph(1.0);
22666        assert_eq_m256h(r, e);
22667    }
22668
22669    #[simd_test(enable = "avx512fp16,avx512vl")]
22670    unsafe fn test_mm256_mask_getexp_ph() {
22671        let a = _mm256_set1_ph(3.0);
22672        let src = _mm256_set1_ph(4.0);
22673        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22674        let e = _mm256_set_ph(
22675            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22676        );
22677        assert_eq_m256h(r, e);
22678    }
22679
22680    #[simd_test(enable = "avx512fp16,avx512vl")]
22681    unsafe fn test_mm256_maskz_getexp_ph() {
22682        let a = _mm256_set1_ph(3.0);
22683        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22684        let e = _mm256_set_ph(
22685            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22686        );
22687        assert_eq_m256h(r, e);
22688    }
22689
22690    #[simd_test(enable = "avx512fp16")]
22691    unsafe fn test_mm512_getexp_ph() {
22692        let a = _mm512_set1_ph(3.0);
22693        let r = _mm512_getexp_ph(a);
22694        let e = _mm512_set1_ph(1.0);
22695        assert_eq_m512h(r, e);
22696    }
22697
22698    #[simd_test(enable = "avx512fp16")]
22699    unsafe fn test_mm512_mask_getexp_ph() {
22700        let a = _mm512_set1_ph(3.0);
22701        let src = _mm512_set1_ph(4.0);
22702        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22703        let e = _mm512_set_ph(
22704            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22705            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22706        );
22707        assert_eq_m512h(r, e);
22708    }
22709
22710    #[simd_test(enable = "avx512fp16")]
22711    unsafe fn test_mm512_maskz_getexp_ph() {
22712        let a = _mm512_set1_ph(3.0);
22713        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22714        let e = _mm512_set_ph(
22715            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22716            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22717        );
22718        assert_eq_m512h(r, e);
22719    }
22720
22721    #[simd_test(enable = "avx512fp16")]
22722    unsafe fn test_mm512_getexp_round_ph() {
22723        let a = _mm512_set1_ph(3.0);
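        // The SAE parameter (_MM_FROUND_NO_EXC) only suppresses exceptions; getexp's result is
        // always exact, so the value matches the non-_round variant.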
22724        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22725        let e = _mm512_set1_ph(1.0);
22726        assert_eq_m512h(r, e);
22727    }
22728
22729    #[simd_test(enable = "avx512fp16")]
22730    unsafe fn test_mm512_mask_getexp_round_ph() {
22731        let a = _mm512_set1_ph(3.0);
22732        let src = _mm512_set1_ph(4.0);
22733        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22734            src,
22735            0b01010101010101010101010101010101,
22736            a,
22737        );
22738        let e = _mm512_set_ph(
22739            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22740            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22741        );
22742        assert_eq_m512h(r, e);
22743    }
22744
22745    #[simd_test(enable = "avx512fp16")]
22746    unsafe fn test_mm512_maskz_getexp_round_ph() {
22747        let a = _mm512_set1_ph(3.0);
22748        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22749            0b01010101010101010101010101010101,
22750            a,
22751        );
22752        let e = _mm512_set_ph(
22753            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22754            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22755        );
22756        assert_eq_m512h(r, e);
22757    }
22758
22759    #[simd_test(enable = "avx512fp16")]
22760    unsafe fn test_mm_getexp_sh() {
22761        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22762        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22763        let r = _mm_getexp_sh(a, b);
22764        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22765        assert_eq_m128h(r, e);
22766    }
22767
22768    #[simd_test(enable = "avx512fp16")]
22769    unsafe fn test_mm_mask_getexp_sh() {
22770        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22771        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22772        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22773        let r = _mm_mask_getexp_sh(src, 0, a, b);
22774        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22775        assert_eq_m128h(r, e);
22776        let r = _mm_mask_getexp_sh(src, 1, a, b);
22777        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22778        assert_eq_m128h(r, e);
22779    }
22780
22781    #[simd_test(enable = "avx512fp16")]
22782    unsafe fn test_mm_maskz_getexp_sh() {
22783        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22784        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22785        let r = _mm_maskz_getexp_sh(0, a, b);
22786        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22787        assert_eq_m128h(r, e);
22788        let r = _mm_maskz_getexp_sh(1, a, b);
22789        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22790        assert_eq_m128h(r, e);
22791    }
22792
22793    #[simd_test(enable = "avx512fp16")]
22794    unsafe fn test_mm_getexp_round_sh() {
22795        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22796        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22797        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
22798        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22799        assert_eq_m128h(r, e);
22800    }
22801
22802    #[simd_test(enable = "avx512fp16")]
22803    unsafe fn test_mm_mask_getexp_round_sh() {
22804        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22805        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22806        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22807        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
22808        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22809        assert_eq_m128h(r, e);
22810        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
22811        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22812        assert_eq_m128h(r, e);
22813    }
22814
22815    #[simd_test(enable = "avx512fp16")]
22816    unsafe fn test_mm_maskz_getexp_round_sh() {
22817        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22818        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22819        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
22820        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22821        assert_eq_m128h(r, e);
22822        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
22823        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22824        assert_eq_m128h(r, e);
22825    }
22826
22827    #[simd_test(enable = "avx512fp16,avx512vl")]
22828    unsafe fn test_mm_getmant_ph() {
22829        let a = _mm_set1_ph(10.0);
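        // getmant normalizes the mantissa into the requested interval: with _MM_MANT_NORM_P75_1P5
        // the result lies in [0.75, 1.5), so 10.0 = 1.25 * 2^3 yields 1.25. The sign control
        // (_MM_MANT_SIGN_NAN here) has no effect for a positive input.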
22830        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22831        let e = _mm_set1_ph(1.25);
22832        assert_eq_m128h(r, e);
22833    }
22834
22835    #[simd_test(enable = "avx512fp16,avx512vl")]
22836    unsafe fn test_mm_mask_getmant_ph() {
22837        let a = _mm_set1_ph(10.0);
22838        let src = _mm_set1_ph(20.0);
22839        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
22840        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
22841        assert_eq_m128h(r, e);
22842    }
22843
22844    #[simd_test(enable = "avx512fp16,avx512vl")]
22845    unsafe fn test_mm_maskz_getmant_ph() {
22846        let a = _mm_set1_ph(10.0);
22847        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
22848        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
22849        assert_eq_m128h(r, e);
22850    }
22851
22852    #[simd_test(enable = "avx512fp16,avx512vl")]
22853    unsafe fn test_mm256_getmant_ph() {
22854        let a = _mm256_set1_ph(10.0);
22855        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22856        let e = _mm256_set1_ph(1.25);
22857        assert_eq_m256h(r, e);
22858    }
22859
22860    #[simd_test(enable = "avx512fp16,avx512vl")]
22861    unsafe fn test_mm256_mask_getmant_ph() {
22862        let a = _mm256_set1_ph(10.0);
22863        let src = _mm256_set1_ph(20.0);
22864        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22865            src,
22866            0b0101010101010101,
22867            a,
22868        );
22869        let e = _mm256_set_ph(
22870            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22871            20.0, 1.25,
22872        );
22873        assert_eq_m256h(r, e);
22874    }
22875
22876    #[simd_test(enable = "avx512fp16,avx512vl")]
22877    unsafe fn test_mm256_maskz_getmant_ph() {
22878        let a = _mm256_set1_ph(10.0);
22879        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22880            0b0101010101010101,
22881            a,
22882        );
22883        let e = _mm256_set_ph(
22884            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22885        );
22886        assert_eq_m256h(r, e);
22887    }
22888
22889    #[simd_test(enable = "avx512fp16")]
22890    unsafe fn test_mm512_getmant_ph() {
22891        let a = _mm512_set1_ph(10.0);
22892        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22893        let e = _mm512_set1_ph(1.25);
22894        assert_eq_m512h(r, e);
22895    }
22896
22897    #[simd_test(enable = "avx512fp16")]
22898    unsafe fn test_mm512_mask_getmant_ph() {
22899        let a = _mm512_set1_ph(10.0);
22900        let src = _mm512_set1_ph(20.0);
22901        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22902            src,
22903            0b01010101010101010101010101010101,
22904            a,
22905        );
22906        let e = _mm512_set_ph(
22907            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22908            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22909            20.0, 1.25, 20.0, 1.25,
22910        );
22911        assert_eq_m512h(r, e);
22912    }
22913
22914    #[simd_test(enable = "avx512fp16")]
22915    unsafe fn test_mm512_maskz_getmant_ph() {
22916        let a = _mm512_set1_ph(10.0);
22917        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22918            0b01010101010101010101010101010101,
22919            a,
22920        );
22921        let e = _mm512_set_ph(
22922            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22923            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22924        );
22925        assert_eq_m512h(r, e);
22926    }
22927
22928    #[simd_test(enable = "avx512fp16")]
22929    unsafe fn test_mm512_getmant_round_ph() {
22930        let a = _mm512_set1_ph(10.0);
22931        let r =
22932            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
22933                a,
22934            );
22935        let e = _mm512_set1_ph(1.25);
22936        assert_eq_m512h(r, e);
22937    }
22938
22939    #[simd_test(enable = "avx512fp16")]
22940    unsafe fn test_mm512_mask_getmant_round_ph() {
22941        let a = _mm512_set1_ph(10.0);
22942        let src = _mm512_set1_ph(20.0);
22943        let r = _mm512_mask_getmant_round_ph::<
22944            _MM_MANT_NORM_P75_1P5,
22945            _MM_MANT_SIGN_NAN,
22946            _MM_FROUND_NO_EXC,
22947        >(src, 0b01010101010101010101010101010101, a);
22948        let e = _mm512_set_ph(
22949            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22950            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22951            20.0, 1.25, 20.0, 1.25,
22952        );
22953        assert_eq_m512h(r, e);
22954    }
22955
22956    #[simd_test(enable = "avx512fp16")]
22957    unsafe fn test_mm512_maskz_getmant_round_ph() {
22958        let a = _mm512_set1_ph(10.0);
22959        let r = _mm512_maskz_getmant_round_ph::<
22960            _MM_MANT_NORM_P75_1P5,
22961            _MM_MANT_SIGN_NAN,
22962            _MM_FROUND_NO_EXC,
22963        >(0b01010101010101010101010101010101, a);
22964        let e = _mm512_set_ph(
22965            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22966            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22967        );
22968        assert_eq_m512h(r, e);
22969    }
22970
22971    #[simd_test(enable = "avx512fp16")]
22972    unsafe fn test_mm_getmant_sh() {
22973        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22974        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22975        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
22976        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22977        assert_eq_m128h(r, e);
22978    }
22979
22980    #[simd_test(enable = "avx512fp16")]
22981    unsafe fn test_mm_mask_getmant_sh() {
22982        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22983        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22984        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
22985        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
22986        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
22987        assert_eq_m128h(r, e);
22988        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
22989        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
22990        assert_eq_m128h(r, e);
22991    }
22992
22993    #[simd_test(enable = "avx512fp16")]
22994    unsafe fn test_mm_maskz_getmant_sh() {
22995        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
22996        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
22997        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
22998        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22999        assert_eq_m128h(r, e);
23000        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
23001        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23002        assert_eq_m128h(r, e);
23003    }
23004
23005    #[simd_test(enable = "avx512fp16")]
23006    unsafe fn test_mm_getmant_round_sh() {
23007        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23008        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23009        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23010            a, b,
23011        );
23012        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23013        assert_eq_m128h(r, e);
23014    }
23015
23016    #[simd_test(enable = "avx512fp16")]
23017    unsafe fn test_mm_mask_getmant_round_sh() {
23018        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23019        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23020        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23021        let r = _mm_mask_getmant_round_sh::<
23022            _MM_MANT_NORM_P75_1P5,
23023            _MM_MANT_SIGN_NAN,
23024            _MM_FROUND_NO_EXC,
23025        >(src, 0, a, b);
23026        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23027        assert_eq_m128h(r, e);
23028        let r = _mm_mask_getmant_round_sh::<
23029            _MM_MANT_NORM_P75_1P5,
23030            _MM_MANT_SIGN_NAN,
23031            _MM_FROUND_NO_EXC,
23032        >(src, 1, a, b);
23033        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23034        assert_eq_m128h(r, e);
23035    }
23036
23037    #[simd_test(enable = "avx512fp16")]
23038    unsafe fn test_mm_maskz_getmant_round_sh() {
23039        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23040        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23041        let r = _mm_maskz_getmant_round_sh::<
23042            _MM_MANT_NORM_P75_1P5,
23043            _MM_MANT_SIGN_NAN,
23044            _MM_FROUND_NO_EXC,
23045        >(0, a, b);
23046        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23047        assert_eq_m128h(r, e);
23048        let r = _mm_maskz_getmant_round_sh::<
23049            _MM_MANT_NORM_P75_1P5,
23050            _MM_MANT_SIGN_NAN,
23051            _MM_FROUND_NO_EXC,
23052        >(1, a, b);
23053        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23054        assert_eq_m128h(r, e);
23055    }
23056
23057    #[simd_test(enable = "avx512fp16,avx512vl")]
23058    unsafe fn test_mm_roundscale_ph() {
23059        let a = _mm_set1_ph(1.1);
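        // IMM8 = 0 requests 0 fraction bits, i.e. rounding to the nearest integer, so 1.1
        // rounds to 1.0 in every lane.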
23060        let r = _mm_roundscale_ph::<0>(a);
23061        let e = _mm_set1_ph(1.0);
23062        assert_eq_m128h(r, e);
23063    }
23064
23065    #[simd_test(enable = "avx512fp16,avx512vl")]
23066    unsafe fn test_mm_mask_roundscale_ph() {
23067        let a = _mm_set1_ph(1.1);
23068        let src = _mm_set1_ph(2.0);
23069        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
23070        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
23071        assert_eq_m128h(r, e);
23072    }
23073
23074    #[simd_test(enable = "avx512fp16,avx512vl")]
23075    unsafe fn test_mm_maskz_roundscale_ph() {
23076        let a = _mm_set1_ph(1.1);
23077        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
23078        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
23079        assert_eq_m128h(r, e);
23080    }
23081
23082    #[simd_test(enable = "avx512fp16,avx512vl")]
23083    unsafe fn test_mm256_roundscale_ph() {
23084        let a = _mm256_set1_ph(1.1);
23085        let r = _mm256_roundscale_ph::<0>(a);
23086        let e = _mm256_set1_ph(1.0);
23087        assert_eq_m256h(r, e);
23088    }
23089
23090    #[simd_test(enable = "avx512fp16,avx512vl")]
23091    unsafe fn test_mm256_mask_roundscale_ph() {
23092        let a = _mm256_set1_ph(1.1);
23093        let src = _mm256_set1_ph(2.0);
23094        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
23095        let e = _mm256_set_ph(
23096            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23097        );
23098        assert_eq_m256h(r, e);
23099    }
23100
23101    #[simd_test(enable = "avx512fp16,avx512vl")]
23102    unsafe fn test_mm256_maskz_roundscale_ph() {
23103        let a = _mm256_set1_ph(1.1);
23104        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
23105        let e = _mm256_set_ph(
23106            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23107        );
23108        assert_eq_m256h(r, e);
23109    }
23110
23111    #[simd_test(enable = "avx512fp16")]
23112    unsafe fn test_mm512_roundscale_ph() {
23113        let a = _mm512_set1_ph(1.1);
23114        let r = _mm512_roundscale_ph::<0>(a);
23115        let e = _mm512_set1_ph(1.0);
23116        assert_eq_m512h(r, e);
23117    }
23118
23119    #[simd_test(enable = "avx512fp16")]
23120    unsafe fn test_mm512_mask_roundscale_ph() {
23121        let a = _mm512_set1_ph(1.1);
23122        let src = _mm512_set1_ph(2.0);
23123        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
23124        let e = _mm512_set_ph(
23125            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23126            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23127        );
23128        assert_eq_m512h(r, e);
23129    }
23130
23131    #[simd_test(enable = "avx512fp16")]
23132    unsafe fn test_mm512_maskz_roundscale_ph() {
23133        let a = _mm512_set1_ph(1.1);
23134        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
23135        let e = _mm512_set_ph(
23136            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23137            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23138        );
23139        assert_eq_m512h(r, e);
23140    }
23141
23142    #[simd_test(enable = "avx512fp16")]
23143    unsafe fn test_mm512_roundscale_round_ph() {
23144        let a = _mm512_set1_ph(1.1);
23145        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23146        let e = _mm512_set1_ph(1.0);
23147        assert_eq_m512h(r, e);
23148    }
23149
23150    #[simd_test(enable = "avx512fp16")]
23151    unsafe fn test_mm512_mask_roundscale_round_ph() {
23152        let a = _mm512_set1_ph(1.1);
23153        let src = _mm512_set1_ph(2.0);
23154        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23155            src,
23156            0b01010101010101010101010101010101,
23157            a,
23158        );
23159        let e = _mm512_set_ph(
23160            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23161            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23162        );
23163        assert_eq_m512h(r, e);
23164    }
23165
23166    #[simd_test(enable = "avx512fp16")]
23167    unsafe fn test_mm512_maskz_roundscale_round_ph() {
23168        let a = _mm512_set1_ph(1.1);
23169        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23170            0b01010101010101010101010101010101,
23171            a,
23172        );
23173        let e = _mm512_set_ph(
23174            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23175            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23176        );
23177        assert_eq_m512h(r, e);
23178    }
23179
23180    #[simd_test(enable = "avx512fp16")]
23181    unsafe fn test_mm_roundscale_sh() {
23182        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23183        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23184        let r = _mm_roundscale_sh::<0>(a, b);
23185        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23186        assert_eq_m128h(r, e);
23187    }
23188
23189    #[simd_test(enable = "avx512fp16")]
23190    unsafe fn test_mm_mask_roundscale_sh() {
23191        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23192        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23193        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23194        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
23195        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23196        assert_eq_m128h(r, e);
23197        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
23198        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23199        assert_eq_m128h(r, e);
23200    }
23201
23202    #[simd_test(enable = "avx512fp16")]
23203    unsafe fn test_mm_maskz_roundscale_sh() {
23204        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23205        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23206        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
23207        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23208        assert_eq_m128h(r, e);
23209        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
23210        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23211        assert_eq_m128h(r, e);
23212    }
23213
23214    #[simd_test(enable = "avx512fp16")]
23215    unsafe fn test_mm_roundscale_round_sh() {
23216        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23217        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23218        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23219        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23220        assert_eq_m128h(r, e);
23221    }
23222
23223    #[simd_test(enable = "avx512fp16")]
23224    unsafe fn test_mm_mask_roundscale_round_sh() {
23225        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23226        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23227        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23228        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
23229        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23230        assert_eq_m128h(r, e);
23231        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
23232        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23233        assert_eq_m128h(r, e);
23234    }
23235
23236    #[simd_test(enable = "avx512fp16")]
23237    unsafe fn test_mm_maskz_roundscale_round_sh() {
23238        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23239        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23240        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
23241        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23242        assert_eq_m128h(r, e);
23243        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
23244        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23245        assert_eq_m128h(r, e);
23246    }
23247
23248    #[simd_test(enable = "avx512fp16,avx512vl")]
23249    unsafe fn test_mm_scalef_ph() {
23250        let a = _mm_set1_ph(1.);
23251        let b = _mm_set1_ph(3.);
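        // scalef computes a * 2^floor(b) per lane: 1.0 * 2^3 = 8.0.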
23252        let r = _mm_scalef_ph(a, b);
23253        let e = _mm_set1_ph(8.0);
23254        assert_eq_m128h(r, e);
23255    }
23256
23257    #[simd_test(enable = "avx512fp16,avx512vl")]
23258    unsafe fn test_mm_mask_scalef_ph() {
23259        let a = _mm_set1_ph(1.);
23260        let b = _mm_set1_ph(3.);
23261        let src = _mm_set1_ph(2.);
23262        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
23263        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
23264        assert_eq_m128h(r, e);
23265    }
23266
23267    #[simd_test(enable = "avx512fp16,avx512vl")]
23268    unsafe fn test_mm_maskz_scalef_ph() {
23269        let a = _mm_set1_ph(1.);
23270        let b = _mm_set1_ph(3.);
23271        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
23272        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
23273        assert_eq_m128h(r, e);
23274    }
23275
23276    #[simd_test(enable = "avx512fp16,avx512vl")]
23277    unsafe fn test_mm256_scalef_ph() {
23278        let a = _mm256_set1_ph(1.);
23279        let b = _mm256_set1_ph(3.);
23280        let r = _mm256_scalef_ph(a, b);
23281        let e = _mm256_set1_ph(8.0);
23282        assert_eq_m256h(r, e);
23283    }
23284
23285    #[simd_test(enable = "avx512fp16,avx512vl")]
23286    unsafe fn test_mm256_mask_scalef_ph() {
23287        let a = _mm256_set1_ph(1.);
23288        let b = _mm256_set1_ph(3.);
23289        let src = _mm256_set1_ph(2.);
23290        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
23291        let e = _mm256_set_ph(
23292            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23293        );
23294        assert_eq_m256h(r, e);
23295    }
23296
23297    #[simd_test(enable = "avx512fp16,avx512vl")]
23298    unsafe fn test_mm256_maskz_scalef_ph() {
23299        let a = _mm256_set1_ph(1.);
23300        let b = _mm256_set1_ph(3.);
23301        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
23302        let e = _mm256_set_ph(
23303            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23304        );
23305        assert_eq_m256h(r, e);
23306    }
23307
23308    #[simd_test(enable = "avx512fp16")]
23309    unsafe fn test_mm512_scalef_ph() {
23310        let a = _mm512_set1_ph(1.);
23311        let b = _mm512_set1_ph(3.);
23312        let r = _mm512_scalef_ph(a, b);
23313        let e = _mm512_set1_ph(8.0);
23314        assert_eq_m512h(r, e);
23315    }
23316
23317    #[simd_test(enable = "avx512fp16")]
23318    unsafe fn test_mm512_mask_scalef_ph() {
23319        let a = _mm512_set1_ph(1.);
23320        let b = _mm512_set1_ph(3.);
23321        let src = _mm512_set1_ph(2.);
23322        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
23323        let e = _mm512_set_ph(
23324            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23325            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23326        );
23327        assert_eq_m512h(r, e);
23328    }
23329
23330    #[simd_test(enable = "avx512fp16")]
23331    unsafe fn test_mm512_maskz_scalef_ph() {
23332        let a = _mm512_set1_ph(1.);
23333        let b = _mm512_set1_ph(3.);
23334        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23335        let e = _mm512_set_ph(
23336            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23337            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23338        );
23339        assert_eq_m512h(r, e);
23340    }
23341
23342    #[simd_test(enable = "avx512fp16")]
23343    unsafe fn test_mm512_scalef_round_ph() {
23344        let a = _mm512_set1_ph(1.);
23345        let b = _mm512_set1_ph(3.);
23346        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23347        let e = _mm512_set1_ph(8.0);
23348        assert_eq_m512h(r, e);
23349    }
23350
23351    #[simd_test(enable = "avx512fp16")]
23352    unsafe fn test_mm512_mask_scalef_round_ph() {
23353        let a = _mm512_set1_ph(1.);
23354        let b = _mm512_set1_ph(3.);
23355        let src = _mm512_set1_ph(2.);
23356        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23357            src,
23358            0b01010101010101010101010101010101,
23359            a,
23360            b,
23361        );
23362        let e = _mm512_set_ph(
23363            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23364            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23365        );
23366        assert_eq_m512h(r, e);
23367    }
23368
23369    #[simd_test(enable = "avx512fp16")]
23370    unsafe fn test_mm512_maskz_scalef_round_ph() {
23371        let a = _mm512_set1_ph(1.);
23372        let b = _mm512_set1_ph(3.);
23373        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23374            0b01010101010101010101010101010101,
23375            a,
23376            b,
23377        );
23378        let e = _mm512_set_ph(
23379            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23380            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23381        );
23382        assert_eq_m512h(r, e);
23383    }
23384
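    // The _sh variants operate only on the lowest lane; the upper seven lanes are copied from a.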
23385    #[simd_test(enable = "avx512fp16")]
23386    unsafe fn test_mm_scalef_sh() {
23387        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23388        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23389        let r = _mm_scalef_sh(a, b);
23390        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23391        assert_eq_m128h(r, e);
23392    }
23393
23394    #[simd_test(enable = "avx512fp16")]
23395    unsafe fn test_mm_mask_scalef_sh() {
23396        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23397        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23398        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23399        let r = _mm_mask_scalef_sh(src, 0, a, b);
23400        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23401        assert_eq_m128h(r, e);
23402        let r = _mm_mask_scalef_sh(src, 1, a, b);
23403        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23404        assert_eq_m128h(r, e);
23405    }
23406
23407    #[simd_test(enable = "avx512fp16")]
23408    unsafe fn test_mm_maskz_scalef_sh() {
23409        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23410        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23411        let r = _mm_maskz_scalef_sh(0, a, b);
23412        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23413        assert_eq_m128h(r, e);
23414        let r = _mm_maskz_scalef_sh(1, a, b);
23415        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23416        assert_eq_m128h(r, e);
23417    }
23418
23419    #[simd_test(enable = "avx512fp16")]
23420    unsafe fn test_mm_scalef_round_sh() {
23421        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23422        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23423        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23424        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23425        assert_eq_m128h(r, e);
23426    }
23427
23428    #[simd_test(enable = "avx512fp16")]
23429    unsafe fn test_mm_mask_scalef_round_sh() {
23430        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23431        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23432        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23433        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23434            src, 0, a, b,
23435        );
23436        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23437        assert_eq_m128h(r, e);
23438        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23439            src, 1, a, b,
23440        );
23441        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23442        assert_eq_m128h(r, e);
23443    }
23444
23445    #[simd_test(enable = "avx512fp16")]
23446    unsafe fn test_mm_maskz_scalef_round_sh() {
23447        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23448        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23449        let r =
23450            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23451        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23452        assert_eq_m128h(r, e);
23453        let r =
23454            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23455        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23456        assert_eq_m128h(r, e);
23457    }
23458
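    // reduce subtracts from each element that element rounded to 2^-M precision, where M = imm8[7:4];
    // imm8 = 16 selects M = 1, so truncating 1.25 down to 1.0 leaves 0.25.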
23459    #[simd_test(enable = "avx512fp16,avx512vl")]
23460    unsafe fn test_mm_reduce_ph() {
23461        let a = _mm_set1_ph(1.25);
23462        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23463        let e = _mm_set1_ph(0.25);
23464        assert_eq_m128h(r, e);
23465    }
23466
23467    #[simd_test(enable = "avx512fp16,avx512vl")]
23468    unsafe fn test_mm_mask_reduce_ph() {
23469        let a = _mm_set1_ph(1.25);
23470        let src = _mm_set1_ph(2.0);
23471        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23472        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23473        assert_eq_m128h(r, e);
23474    }
23475
23476    #[simd_test(enable = "avx512fp16,avx512vl")]
23477    unsafe fn test_mm_maskz_reduce_ph() {
23478        let a = _mm_set1_ph(1.25);
23479        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23480        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23481        assert_eq_m128h(r, e);
23482    }
23483
23484    #[simd_test(enable = "avx512fp16,avx512vl")]
23485    unsafe fn test_mm256_reduce_ph() {
23486        let a = _mm256_set1_ph(1.25);
23487        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23488        let e = _mm256_set1_ph(0.25);
23489        assert_eq_m256h(r, e);
23490    }
23491
23492    #[simd_test(enable = "avx512fp16,avx512vl")]
23493    unsafe fn test_mm256_mask_reduce_ph() {
23494        let a = _mm256_set1_ph(1.25);
23495        let src = _mm256_set1_ph(2.0);
23496        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
23497        let e = _mm256_set_ph(
23498            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23499        );
23500        assert_eq_m256h(r, e);
23501    }
23502
23503    #[simd_test(enable = "avx512fp16,avx512vl")]
23504    unsafe fn test_mm256_maskz_reduce_ph() {
23505        let a = _mm256_set1_ph(1.25);
23506        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
23507        let e = _mm256_set_ph(
23508            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23509        );
23510        assert_eq_m256h(r, e);
23511    }
23512
23513    #[simd_test(enable = "avx512fp16")]
23514    unsafe fn test_mm512_reduce_ph() {
23515        let a = _mm512_set1_ph(1.25);
23516        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23517        let e = _mm512_set1_ph(0.25);
23518        assert_eq_m512h(r, e);
23519    }
23520
23521    #[simd_test(enable = "avx512fp16")]
23522    unsafe fn test_mm512_mask_reduce_ph() {
23523        let a = _mm512_set1_ph(1.25);
23524        let src = _mm512_set1_ph(2.0);
23525        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23526            src,
23527            0b01010101010101010101010101010101,
23528            a,
23529        );
23530        let e = _mm512_set_ph(
23531            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23532            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23533        );
23534        assert_eq_m512h(r, e);
23535    }
23536
23537    #[simd_test(enable = "avx512fp16")]
23538    unsafe fn test_mm512_maskz_reduce_ph() {
23539        let a = _mm512_set1_ph(1.25);
23540        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23541            0b01010101010101010101010101010101,
23542            a,
23543        );
23544        let e = _mm512_set_ph(
23545            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23546            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23547        );
23548        assert_eq_m512h(r, e);
23549    }
23550
23551    #[simd_test(enable = "avx512fp16")]
23552    unsafe fn test_mm512_reduce_round_ph() {
23553        let a = _mm512_set1_ph(1.25);
23554        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23555        let e = _mm512_set1_ph(0.25);
23556        assert_eq_m512h(r, e);
23557    }
23558
23559    #[simd_test(enable = "avx512fp16")]
23560    unsafe fn test_mm512_mask_reduce_round_ph() {
23561        let a = _mm512_set1_ph(1.25);
23562        let src = _mm512_set1_ph(2.0);
23563        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23564            src,
23565            0b01010101010101010101010101010101,
23566            a,
23567        );
23568        let e = _mm512_set_ph(
23569            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23570            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23571        );
23572        assert_eq_m512h(r, e);
23573    }
23574
23575    #[simd_test(enable = "avx512fp16")]
23576    unsafe fn test_mm512_maskz_reduce_round_ph() {
23577        let a = _mm512_set1_ph(1.25);
23578        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23579            0b01010101010101010101010101010101,
23580            a,
23581        );
23582        let e = _mm512_set_ph(
23583            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23584            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23585        );
23586        assert_eq_m512h(r, e);
23587    }
23588
23589    #[simd_test(enable = "avx512fp16")]
23590    unsafe fn test_mm_reduce_sh() {
23591        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23592        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23593        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23594        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23595        assert_eq_m128h(r, e);
23596    }
23597
23598    #[simd_test(enable = "avx512fp16")]
23599    unsafe fn test_mm_mask_reduce_sh() {
23600        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23601        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23602        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23603        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23604        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23605        assert_eq_m128h(r, e);
23606        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23607        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23608        assert_eq_m128h(r, e);
23609    }
23610
23611    #[simd_test(enable = "avx512fp16")]
23612    unsafe fn test_mm_maskz_reduce_sh() {
23613        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23614        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23615        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23616        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23617        assert_eq_m128h(r, e);
23618        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23619        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23620        assert_eq_m128h(r, e);
23621    }
23622
23623    #[simd_test(enable = "avx512fp16")]
23624    unsafe fn test_mm_reduce_round_sh() {
23625        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23626        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23627        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23628        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23629        assert_eq_m128h(r, e);
23630    }
23631
23632    #[simd_test(enable = "avx512fp16")]
23633    unsafe fn test_mm_mask_reduce_round_sh() {
23634        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23635        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23636        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23637        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23638            src, 0, a, b,
23639        );
23640        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23641        assert_eq_m128h(r, e);
23642        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23643            src, 1, a, b,
23644        );
23645        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23646        assert_eq_m128h(r, e);
23647    }
23648
23649    #[simd_test(enable = "avx512fp16")]
23650    unsafe fn test_mm_maskz_reduce_round_sh() {
23651        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23652        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23653        let r =
23654            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23655        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23656        assert_eq_m128h(r, e);
23657        let r =
23658            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23659        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23660        assert_eq_m128h(r, e);
23661    }
23662
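    // reduce_add/mul/max/min are horizontal reductions over all lanes, returning a scalar f16:
    // eight lanes of 2.0 sum to 16.0 and multiply to 2^8 = 256.0.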
23663    #[simd_test(enable = "avx512fp16,avx512vl")]
23664    unsafe fn test_mm_reduce_add_ph() {
23665        let a = _mm_set1_ph(2.0);
23666        let r = _mm_reduce_add_ph(a);
23667        assert_eq!(r, 16.0);
23668    }
23669
23670    #[simd_test(enable = "avx512fp16,avx512vl")]
23671    unsafe fn test_mm256_reduce_add_ph() {
23672        let a = _mm256_set1_ph(2.0);
23673        let r = _mm256_reduce_add_ph(a);
23674        assert_eq!(r, 32.0);
23675    }
23676
23677    #[simd_test(enable = "avx512fp16")]
23678    unsafe fn test_mm512_reduce_add_ph() {
23679        let a = _mm512_set1_ph(2.0);
23680        let r = _mm512_reduce_add_ph(a);
23681        assert_eq!(r, 64.0);
23682    }
23683
23684    #[simd_test(enable = "avx512fp16,avx512vl")]
23685    unsafe fn test_mm_reduce_mul_ph() {
23686        let a = _mm_set1_ph(2.0);
23687        let r = _mm_reduce_mul_ph(a);
23688        assert_eq!(r, 256.0);
23689    }
23690
23691    #[simd_test(enable = "avx512fp16,avx512vl")]
23692    unsafe fn test_mm256_reduce_mul_ph() {
23693        let a = _mm256_set1_ph(2.0);
23694        let r = _mm256_reduce_mul_ph(a);
23695        assert_eq!(r, 65536.0);
23696    }
23697
23698    #[simd_test(enable = "avx512fp16")]
23699    unsafe fn test_mm512_reduce_mul_ph() {
23700        let a = _mm512_set1_ph(2.0);
23701        let r = _mm512_reduce_mul_ph(a);
23702        assert_eq!(r, 16777216.0);
23703    }
23704
23705    #[simd_test(enable = "avx512fp16,avx512vl")]
23706    unsafe fn test_mm_reduce_max_ph() {
23707        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23708        let r = _mm_reduce_max_ph(a);
23709        assert_eq!(r, 8.0);
23710    }
23711
23712    #[simd_test(enable = "avx512fp16,avx512vl")]
23713    unsafe fn test_mm256_reduce_max_ph() {
23714        let a = _mm256_set_ph(
23715            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23716        );
23717        let r = _mm256_reduce_max_ph(a);
23718        assert_eq!(r, 16.0);
23719    }
23720
23721    #[simd_test(enable = "avx512fp16")]
23722    unsafe fn test_mm512_reduce_max_ph() {
23723        let a = _mm512_set_ph(
23724            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23725            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23726            31.0, 32.0,
23727        );
23728        let r = _mm512_reduce_max_ph(a);
23729        assert_eq!(r, 32.0);
23730    }
23731
23732    #[simd_test(enable = "avx512fp16,avx512vl")]
23733    unsafe fn test_mm_reduce_min_ph() {
23734        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23735        let r = _mm_reduce_min_ph(a);
23736        assert_eq!(r, 1.0);
23737    }
23738
23739    #[simd_test(enable = "avx512fp16,avx512vl")]
23740    unsafe fn test_mm256_reduce_min_ph() {
23741        let a = _mm256_set_ph(
23742            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23743        );
23744        let r = _mm256_reduce_min_ph(a);
23745        assert_eq!(r, 1.0);
23746    }
23747
23748    #[simd_test(enable = "avx512fp16")]
23749    unsafe fn test_mm512_reduce_min_ph() {
23750        let a = _mm512_set_ph(
23751            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23752            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23753            31.0, 32.0,
23754        );
23755        let r = _mm512_reduce_min_ph(a);
23756        assert_eq!(r, 1.0);
23757    }
23758
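    // fpclass category mask 0x18 = 0x08 (positive infinity) | 0x10 (negative infinity).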
23759    #[simd_test(enable = "avx512fp16,avx512vl")]
23760    unsafe fn test_mm_fpclass_ph_mask() {
23761        let a = _mm_set_ph(
23762            1.,
23763            f16::INFINITY,
23764            f16::NEG_INFINITY,
23765            0.0,
23766            -0.0,
23767            -2.0,
23768            f16::NAN,
23769            5.9e-8, // Denormal
23770        );
23771        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
23772        assert_eq!(r, 0b01100000);
23773    }
23774
23775    #[simd_test(enable = "avx512fp16,avx512vl")]
23776    unsafe fn test_mm_mask_fpclass_ph_mask() {
23777        let a = _mm_set_ph(
23778            1.,
23779            f16::INFINITY,
23780            f16::NEG_INFINITY,
23781            0.0,
23782            -0.0,
23783            -2.0,
23784            f16::NAN,
23785            5.9e-8, // Denormal
23786        );
23787        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
23788        assert_eq!(r, 0b01000000);
23789    }
23790
23791    #[simd_test(enable = "avx512fp16,avx512vl")]
23792    unsafe fn test_mm256_fpclass_ph_mask() {
23793        let a = _mm256_set_ph(
23794            1.,
23795            f16::INFINITY,
23796            f16::NEG_INFINITY,
23797            0.0,
23798            -0.0,
23799            -2.0,
23800            f16::NAN,
23801            5.9e-8, // Denormal
23802            1.,
23803            f16::INFINITY,
23804            f16::NEG_INFINITY,
23805            0.0,
23806            -0.0,
23807            -2.0,
23808            f16::NAN,
23809            5.9e-8, // Denormal
23810        );
23811        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
23812        assert_eq!(r, 0b0110000001100000);
23813    }
23814
23815    #[simd_test(enable = "avx512fp16,avx512vl")]
23816    unsafe fn test_mm256_mask_fpclass_ph_mask() {
23817        let a = _mm256_set_ph(
23818            1.,
23819            f16::INFINITY,
23820            f16::NEG_INFINITY,
23821            0.0,
23822            -0.0,
23823            -2.0,
23824            f16::NAN,
23825            5.9e-8, // Denormal
23826            1.,
23827            f16::INFINITY,
23828            f16::NEG_INFINITY,
23829            0.0,
23830            -0.0,
23831            -2.0,
23832            f16::NAN,
23833            5.9e-8, // Denormal
23834        );
23835        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
23836        assert_eq!(r, 0b0100000001000000);
23837    }
23838
23839    #[simd_test(enable = "avx512fp16")]
23840    unsafe fn test_mm512_fpclass_ph_mask() {
23841        let a = _mm512_set_ph(
23842            1.,
23843            f16::INFINITY,
23844            f16::NEG_INFINITY,
23845            0.0,
23846            -0.0,
23847            -2.0,
23848            f16::NAN,
23849            5.9e-8, // Denormal
23850            1.,
23851            f16::INFINITY,
23852            f16::NEG_INFINITY,
23853            0.0,
23854            -0.0,
23855            -2.0,
23856            f16::NAN,
23857            5.9e-8, // Denormal
23858            1.,
23859            f16::INFINITY,
23860            f16::NEG_INFINITY,
23861            0.0,
23862            -0.0,
23863            -2.0,
23864            f16::NAN,
23865            5.9e-8, // Denormal
23866            1.,
23867            f16::INFINITY,
23868            f16::NEG_INFINITY,
23869            0.0,
23870            -0.0,
23871            -2.0,
23872            f16::NAN,
23873            5.9e-8, // Denormal
23874        );
23875        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
23876        assert_eq!(r, 0b01100000011000000110000001100000);
23877    }
23878
23879    #[simd_test(enable = "avx512fp16")]
23880    unsafe fn test_mm512_mask_fpclass_ph_mask() {
23881        let a = _mm512_set_ph(
23882            1.,
23883            f16::INFINITY,
23884            f16::NEG_INFINITY,
23885            0.0,
23886            -0.0,
23887            -2.0,
23888            f16::NAN,
23889            5.9e-8, // Denormal
23890            1.,
23891            f16::INFINITY,
23892            f16::NEG_INFINITY,
23893            0.0,
23894            -0.0,
23895            -2.0,
23896            f16::NAN,
23897            5.9e-8, // Denormal
23898            1.,
23899            f16::INFINITY,
23900            f16::NEG_INFINITY,
23901            0.0,
23902            -0.0,
23903            -2.0,
23904            f16::NAN,
23905            5.9e-8, // Denormal
23906            1.,
23907            f16::INFINITY,
23908            f16::NEG_INFINITY,
23909            0.0,
23910            -0.0,
23911            -2.0,
23912            f16::NAN,
23913            5.9e-8, // Denormal
23914        );
23915        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
23916        assert_eq!(r, 0b01000000010000000100000001000000);
23917    }
23918
23919    #[simd_test(enable = "avx512fp16")]
23920    unsafe fn test_mm_fpclass_sh_mask() {
23921        let a = _mm_set_sh(f16::INFINITY);
23922        let r = _mm_fpclass_sh_mask::<0x18>(a);
23923        assert_eq!(r, 1);
23924    }
23925
23926    #[simd_test(enable = "avx512fp16")]
23927    unsafe fn test_mm_mask_fpclass_sh_mask() {
23928        let a = _mm_set_sh(f16::INFINITY);
23929        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
23930        assert_eq!(r, 0);
23931        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
23932        assert_eq!(r, 1);
23933    }
23934
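    // blend picks lane i from b when mask bit i is set and from a otherwise.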
23935    #[simd_test(enable = "avx512fp16,avx512vl")]
23936    unsafe fn test_mm_mask_blend_ph() {
23937        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23938        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
23939        let r = _mm_mask_blend_ph(0b01010101, a, b);
23940        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
23941        assert_eq_m128h(r, e);
23942    }
23943
23944    #[simd_test(enable = "avx512fp16,avx512vl")]
23945    unsafe fn test_mm256_mask_blend_ph() {
23946        let a = _mm256_set_ph(
23947            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23948        );
23949        let b = _mm256_set_ph(
23950            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23951            -14.0, -15.0, -16.0,
23952        );
23953        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
23954        let e = _mm256_set_ph(
23955            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23956            -16.0,
23957        );
23958        assert_eq_m256h(r, e);
23959    }
23960
23961    #[simd_test(enable = "avx512fp16")]
23962    unsafe fn test_mm512_mask_blend_ph() {
23963        let a = _mm512_set_ph(
23964            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23965            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23966            31.0, 32.0,
23967        );
23968        let b = _mm512_set_ph(
23969            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
23970            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
23971            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
23972        );
23973        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
23974        let e = _mm512_set_ph(
23975            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
23976            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
23977            29.0, -30.0, 31.0, -32.0,
23978        );
23979        assert_eq_m512h(r, e);
23980    }
23981
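    // permutex2var indexes the lane-wise concatenation of a and b: for 128-bit vectors,
    // indices 0..=7 select from a and 8..=15 select from b.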
23982    #[simd_test(enable = "avx512fp16,avx512vl")]
23983    unsafe fn test_mm_permutex2var_ph() {
23984        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23985        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
23986        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
23987        let r = _mm_permutex2var_ph(a, idx, b);
23988        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
23989        assert_eq_m128h(r, e);
23990    }
23991
23992    #[simd_test(enable = "avx512fp16,avx512vl")]
23993    unsafe fn test_mm256_permutex2var_ph() {
23994        let a = _mm256_setr_ph(
23995            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23996        );
23997        let b = _mm256_setr_ph(
23998            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23999            31.0, 32.0,
24000        );
24001        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
24002        let r = _mm256_permutex2var_ph(a, idx, b);
24003        let e = _mm256_setr_ph(
24004            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24005            31.0,
24006        );
24007        assert_eq_m256h(r, e);
24008    }
24009
24010    #[simd_test(enable = "avx512fp16")]
24011    unsafe fn test_mm512_permutex2var_ph() {
24012        let a = _mm512_setr_ph(
24013            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24014            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24015            31.0, 32.0,
24016        );
24017        let b = _mm512_setr_ph(
24018            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
24019            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
24020            61.0, 62.0, 63.0, 64.0,
24021        );
24022        let idx = _mm512_set_epi16(
24023            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
24024            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
24025        );
24026        let r = _mm512_permutex2var_ph(a, idx, b);
24027        let e = _mm512_setr_ph(
24028            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24029            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
24030            59.0, 61.0, 63.0,
24031        );
24032        assert_eq_m512h(r, e);
24033    }
24034
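    // permutexvar gathers lanes of a according to the indices in idx.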
24035    #[simd_test(enable = "avx512fp16,avx512vl")]
24036    unsafe fn test_mm_permutexvar_ph() {
24037        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24038        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
24039        let r = _mm_permutexvar_ph(idx, a);
24040        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
24041        assert_eq_m128h(r, e);
24042    }
24043
24044    #[simd_test(enable = "avx512fp16,avx512vl")]
24045    unsafe fn test_mm256_permutexvar_ph() {
24046        let a = _mm256_set_ph(
24047            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24048        );
24049        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
24050        let r = _mm256_permutexvar_ph(idx, a);
24051        let e = _mm256_setr_ph(
24052            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
24053        );
24054        assert_eq_m256h(r, e);
24055    }
24056
24057    #[simd_test(enable = "avx512fp16")]
24058    unsafe fn test_mm512_permutexvar_ph() {
24059        let a = _mm512_set_ph(
24060            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24061            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24062            31.0, 32.0,
24063        );
24064        let idx = _mm512_set_epi16(
24065            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
24066            17, 19, 21, 23, 25, 27, 29, 31,
24067        );
24068        let r = _mm512_permutexvar_ph(idx, a);
24069        let e = _mm512_setr_ph(
24070            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24071            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
24072            30.0, 32.0,
24073        );
24074        assert_eq_m512h(r, e);
24075    }
24076
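    // cvtepi16_ph converts packed signed 16-bit integers to half precision, lane for lane.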
24077    #[simd_test(enable = "avx512fp16,avx512vl")]
24078    unsafe fn test_mm_cvtepi16_ph() {
24079        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24080        let r = _mm_cvtepi16_ph(a);
24081        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24082        assert_eq_m128h(r, e);
24083    }
24084
24085    #[simd_test(enable = "avx512fp16,avx512vl")]
24086    unsafe fn test_mm_mask_cvtepi16_ph() {
24087        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24088        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24089        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
24090        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24091        assert_eq_m128h(r, e);
24092    }
24093
24094    #[simd_test(enable = "avx512fp16,avx512vl")]
24095    unsafe fn test_mm_maskz_cvtepi16_ph() {
24096        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24097        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
24098        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24099        assert_eq_m128h(r, e);
24100    }
24101
24102    #[simd_test(enable = "avx512fp16,avx512vl")]
24103    unsafe fn test_mm256_cvtepi16_ph() {
24104        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24105        let r = _mm256_cvtepi16_ph(a);
24106        let e = _mm256_set_ph(
24107            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24108        );
24109        assert_eq_m256h(r, e);
24110    }
24111
24112    #[simd_test(enable = "avx512fp16,avx512vl")]
24113    unsafe fn test_mm256_mask_cvtepi16_ph() {
24114        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24115        let src = _mm256_set_ph(
24116            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24117        );
24118        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
24119        let e = _mm256_set_ph(
24120            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24121        );
24122        assert_eq_m256h(r, e);
24123    }
24124
24125    #[simd_test(enable = "avx512fp16,avx512vl")]
24126    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24127        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24128        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24129        let e = _mm256_set_ph(
24130            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24131        );
24132        assert_eq_m256h(r, e);
24133    }
24134
24135    #[simd_test(enable = "avx512fp16")]
24136    unsafe fn test_mm512_cvtepi16_ph() {
24137        let a = _mm512_set_epi16(
24138            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24139            25, 26, 27, 28, 29, 30, 31, 32,
24140        );
24141        let r = _mm512_cvtepi16_ph(a);
24142        let e = _mm512_set_ph(
24143            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24144            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24145            31.0, 32.0,
24146        );
24147        assert_eq_m512h(r, e);
24148    }
24149
24150    #[simd_test(enable = "avx512fp16")]
24151    unsafe fn test_mm512_mask_cvtepi16_ph() {
24152        let a = _mm512_set_epi16(
24153            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24154            25, 26, 27, 28, 29, 30, 31, 32,
24155        );
24156        let src = _mm512_set_ph(
24157            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24158            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24159        );
24160        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24161        let e = _mm512_set_ph(
24162            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24163            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24164        );
24165        assert_eq_m512h(r, e);
24166    }
24167
24168    #[simd_test(enable = "avx512fp16")]
24169    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24170        let a = _mm512_set_epi16(
24171            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24172            25, 26, 27, 28, 29, 30, 31, 32,
24173        );
24174        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24175        let e = _mm512_set_ph(
24176            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24177            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24178        );
24179        assert_eq_m512h(r, e);
24180    }
24181
24182    #[simd_test(enable = "avx512fp16")]
24183    unsafe fn test_mm512_cvt_roundepi16_ph() {
24184        let a = _mm512_set_epi16(
24185            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24186            25, 26, 27, 28, 29, 30, 31, 32,
24187        );
24188        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24189        let e = _mm512_set_ph(
24190            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24191            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24192            31.0, 32.0,
24193        );
24194        assert_eq_m512h(r, e);
24195    }
24196
24197    #[simd_test(enable = "avx512fp16")]
24198    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24199        let a = _mm512_set_epi16(
24200            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24201            25, 26, 27, 28, 29, 30, 31, 32,
24202        );
24203        let src = _mm512_set_ph(
24204            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24205            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24206        );
24207        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24208            src,
24209            0b01010101010101010101010101010101,
24210            a,
24211        );
24212        let e = _mm512_set_ph(
24213            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24214            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24215        );
24216        assert_eq_m512h(r, e);
24217    }
24218
24219    #[simd_test(enable = "avx512fp16")]
24220    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24221        let a = _mm512_set_epi16(
24222            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24223            25, 26, 27, 28, 29, 30, 31, 32,
24224        );
24225        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24226            0b01010101010101010101010101010101,
24227            a,
24228        );
24229        let e = _mm512_set_ph(
24230            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24231            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24232        );
24233        assert_eq_m512h(r, e);
24234    }
24235
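    // cvtepu16_ph is the unsigned counterpart; these small values convert identically.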
24236    #[simd_test(enable = "avx512fp16,avx512vl")]
24237    unsafe fn test_mm_cvtepu16_ph() {
24238        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24239        let r = _mm_cvtepu16_ph(a);
24240        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24241        assert_eq_m128h(r, e);
24242    }
24243
24244    #[simd_test(enable = "avx512fp16,avx512vl")]
24245    unsafe fn test_mm_mask_cvtepu16_ph() {
24246        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24247        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24248        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24249        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24250        assert_eq_m128h(r, e);
24251    }
24252
24253    #[simd_test(enable = "avx512fp16,avx512vl")]
24254    unsafe fn test_mm_maskz_cvtepu16_ph() {
24255        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24256        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24257        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24258        assert_eq_m128h(r, e);
24259    }
24260
24261    #[simd_test(enable = "avx512fp16,avx512vl")]
24262    unsafe fn test_mm256_cvtepu16_ph() {
24263        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24264        let r = _mm256_cvtepu16_ph(a);
24265        let e = _mm256_set_ph(
24266            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24267        );
24268        assert_eq_m256h(r, e);
24269    }
24270
24271    #[simd_test(enable = "avx512fp16,avx512vl")]
24272    unsafe fn test_mm256_mask_cvtepu16_ph() {
24273        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24274        let src = _mm256_set_ph(
24275            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24276        );
24277        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24278        let e = _mm256_set_ph(
24279            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24280        );
24281        assert_eq_m256h(r, e);
24282    }
24283
24284    #[simd_test(enable = "avx512fp16,avx512vl")]
24285    unsafe fn test_mm256_maskz_cvtepu16_ph() {
24286        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24287        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24288        let e = _mm256_set_ph(
24289            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24290        );
24291        assert_eq_m256h(r, e);
24292    }
24293
24294    #[simd_test(enable = "avx512fp16")]
24295    unsafe fn test_mm512_cvtepu16_ph() {
24296        let a = _mm512_set_epi16(
24297            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24298            25, 26, 27, 28, 29, 30, 31, 32,
24299        );
24300        let r = _mm512_cvtepu16_ph(a);
24301        let e = _mm512_set_ph(
24302            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24303            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24304            31.0, 32.0,
24305        );
24306        assert_eq_m512h(r, e);
24307    }
24308
24309    #[simd_test(enable = "avx512fp16")]
24310    unsafe fn test_mm512_mask_cvtepu16_ph() {
24311        let a = _mm512_set_epi16(
24312            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24313            25, 26, 27, 28, 29, 30, 31, 32,
24314        );
24315        let src = _mm512_set_ph(
24316            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24317            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24318        );
24319        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24320        let e = _mm512_set_ph(
24321            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24322            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24323        );
24324        assert_eq_m512h(r, e);
24325    }
24326
24327    #[simd_test(enable = "avx512fp16")]
24328    unsafe fn test_mm512_maskz_cvtepu16_ph() {
24329        let a = _mm512_set_epi16(
24330            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24331            25, 26, 27, 28, 29, 30, 31, 32,
24332        );
24333        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24334        let e = _mm512_set_ph(
24335            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24336            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24337        );
24338        assert_eq_m512h(r, e);
24339    }
24340
24341    #[simd_test(enable = "avx512fp16")]
24342    unsafe fn test_mm512_cvt_roundepu16_ph() {
24343        let a = _mm512_set_epi16(
24344            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24345            25, 26, 27, 28, 29, 30, 31, 32,
24346        );
24347        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24348        let e = _mm512_set_ph(
24349            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24350            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24351            31.0, 32.0,
24352        );
24353        assert_eq_m512h(r, e);
24354    }
24355
24356    #[simd_test(enable = "avx512fp16")]
24357    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
24358        let a = _mm512_set_epi16(
24359            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24360            25, 26, 27, 28, 29, 30, 31, 32,
24361        );
24362        let src = _mm512_set_ph(
24363            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24364            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24365        );
24366        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24367            src,
24368            0b01010101010101010101010101010101,
24369            a,
24370        );
24371        let e = _mm512_set_ph(
24372            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24373            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24374        );
24375        assert_eq_m512h(r, e);
24376    }
24377
24378    #[simd_test(enable = "avx512fp16")]
24379    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
24380        let a = _mm512_set_epi16(
24381            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24382            25, 26, 27, 28, 29, 30, 31, 32,
24383        );
24384        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24385            0b01010101010101010101010101010101,
24386            a,
24387        );
24388        let e = _mm512_set_ph(
24389            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24390            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24391        );
24392        assert_eq_m512h(r, e);
24393    }
24394
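    // Converting 32-bit integers halves the element width: the 128-bit source fills only the
    // low four lanes of the __m128h result (upper lanes zeroed), while the 256-bit and 512-bit
    // sources yield full __m128h and __m256h vectors.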
24395    #[simd_test(enable = "avx512fp16,avx512vl")]
24396    unsafe fn test_mm_cvtepi32_ph() {
24397        let a = _mm_set_epi32(1, 2, 3, 4);
24398        let r = _mm_cvtepi32_ph(a);
24399        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24400        assert_eq_m128h(r, e);
24401    }
24402
24403    #[simd_test(enable = "avx512fp16,avx512vl")]
24404    unsafe fn test_mm_mask_cvtepi32_ph() {
24405        let a = _mm_set_epi32(1, 2, 3, 4);
24406        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24407        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24408        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24409        assert_eq_m128h(r, e);
24410    }
24411
24412    #[simd_test(enable = "avx512fp16,avx512vl")]
24413    unsafe fn test_mm_maskz_cvtepi32_ph() {
24414        let a = _mm_set_epi32(1, 2, 3, 4);
24415        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24416        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24417        assert_eq_m128h(r, e);
24418    }
24419
24420    #[simd_test(enable = "avx512fp16,avx512vl")]
24421    unsafe fn test_mm256_cvtepi32_ph() {
24422        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24423        let r = _mm256_cvtepi32_ph(a);
24424        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24425        assert_eq_m128h(r, e);
24426    }
24427
24428    #[simd_test(enable = "avx512fp16,avx512vl")]
24429    unsafe fn test_mm256_mask_cvtepi32_ph() {
24430        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24431        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24432        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24433        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24434        assert_eq_m128h(r, e);
24435    }
24436
24437    #[simd_test(enable = "avx512fp16,avx512vl")]
24438    unsafe fn test_mm256_maskz_cvtepi32_ph() {
24439        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24440        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24441        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24442        assert_eq_m128h(r, e);
24443    }
24444
24445    #[simd_test(enable = "avx512fp16")]
24446    unsafe fn test_mm512_cvtepi32_ph() {
24447        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24448        let r = _mm512_cvtepi32_ph(a);
24449        let e = _mm256_set_ph(
24450            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24451        );
24452        assert_eq_m256h(r, e);
24453    }
24454
24455    #[simd_test(enable = "avx512fp16")]
24456    unsafe fn test_mm512_mask_cvtepi32_ph() {
24457        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24458        let src = _mm256_set_ph(
24459            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24460        );
24461        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24462        let e = _mm256_set_ph(
24463            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24464        );
24465        assert_eq_m256h(r, e);
24466    }
24467
24468    #[simd_test(enable = "avx512fp16")]
24469    unsafe fn test_mm512_maskz_cvtepi32_ph() {
24470        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24471        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24472        let e = _mm256_set_ph(
24473            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24474        );
24475        assert_eq_m256h(r, e);
24476    }
24477
24478    #[simd_test(enable = "avx512fp16")]
24479    unsafe fn test_mm512_cvt_roundepi32_ph() {
24480        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24481        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24482        let e = _mm256_set_ph(
24483            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24484        );
24485        assert_eq_m256h(r, e);
24486    }
24487
24488    #[simd_test(enable = "avx512fp16")]
24489    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
24490        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24491        let src = _mm256_set_ph(
24492            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24493        );
24494        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24495            src,
24496            0b0101010101010101,
24497            a,
24498        );
24499        let e = _mm256_set_ph(
24500            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24501        );
24502        assert_eq_m256h(r, e);
24503    }
24504
24505    #[simd_test(enable = "avx512fp16")]
24506    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
24507        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24508        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24509            0b0101010101010101,
24510            a,
24511        );
24512        let e = _mm256_set_ph(
24513            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24514        );
24515        assert_eq_m256h(r, e);
24516    }
24517
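    // cvti32_sh converts the scalar integer into the lowest lane and copies lanes 1..=7 from a.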
24518    #[simd_test(enable = "avx512fp16")]
24519    unsafe fn test_mm_cvti32_sh() {
24520        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24521        let r = _mm_cvti32_sh(a, 10);
24522        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24523        assert_eq_m128h(r, e);
24524    }
24525
24526    #[simd_test(enable = "avx512fp16")]
24527    unsafe fn test_mm_cvt_roundi32_sh() {
24528        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24529        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24530        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24531        assert_eq_m128h(r, e);
24532    }
24533
24534    #[simd_test(enable = "avx512fp16,avx512vl")]
24535    unsafe fn test_mm_cvtepu32_ph() {
24536        let a = _mm_set_epi32(1, 2, 3, 4);
24537        let r = _mm_cvtepu32_ph(a);
24538        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24539        assert_eq_m128h(r, e);
24540    }
24541
24542    #[simd_test(enable = "avx512fp16,avx512vl")]
24543    unsafe fn test_mm_mask_cvtepu32_ph() {
24544        let a = _mm_set_epi32(1, 2, 3, 4);
24545        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24546        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24547        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24548        assert_eq_m128h(r, e);
24549    }
24550
24551    #[simd_test(enable = "avx512fp16,avx512vl")]
24552    unsafe fn test_mm_maskz_cvtepu32_ph() {
24553        let a = _mm_set_epi32(1, 2, 3, 4);
24554        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24555        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24556        assert_eq_m128h(r, e);
24557    }
24558
24559    #[simd_test(enable = "avx512fp16,avx512vl")]
24560    unsafe fn test_mm256_cvtepu32_ph() {
24561        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24562        let r = _mm256_cvtepu32_ph(a);
24563        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24564        assert_eq_m128h(r, e);
24565    }
24566
24567    #[simd_test(enable = "avx512fp16,avx512vl")]
24568    unsafe fn test_mm256_mask_cvtepu32_ph() {
24569        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24570        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24571        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24572        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24573        assert_eq_m128h(r, e);
24574    }
24575
24576    #[simd_test(enable = "avx512fp16,avx512vl")]
24577    unsafe fn test_mm256_maskz_cvtepu32_ph() {
24578        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24579        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24580        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24581        assert_eq_m128h(r, e);
24582    }
24583
24584    #[simd_test(enable = "avx512fp16")]
24585    unsafe fn test_mm512_cvtepu32_ph() {
24586        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24587        let r = _mm512_cvtepu32_ph(a);
24588        let e = _mm256_set_ph(
24589            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24590        );
24591        assert_eq_m256h(r, e);
24592    }
24593
24594    #[simd_test(enable = "avx512fp16")]
24595    unsafe fn test_mm512_mask_cvtepu32_ph() {
24596        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24597        let src = _mm256_set_ph(
24598            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24599        );
24600        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24601        let e = _mm256_set_ph(
24602            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24603        );
24604        assert_eq_m256h(r, e);
24605    }
24606
24607    #[simd_test(enable = "avx512fp16")]
24608    unsafe fn test_mm512_maskz_cvtepu32_ph() {
24609        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24610        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24611        let e = _mm256_set_ph(
24612            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24613        );
24614        assert_eq_m256h(r, e);
24615    }
24616
24617    #[simd_test(enable = "avx512fp16")]
24618    unsafe fn test_mm512_cvt_roundepu32_ph() {
24619        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24620        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24621        let e = _mm256_set_ph(
24622            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24623        );
24624        assert_eq_m256h(r, e);
24625    }
24626
24627    #[simd_test(enable = "avx512fp16")]
24628    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
24629        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24630        let src = _mm256_set_ph(
24631            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24632        );
24633        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24634            src,
24635            0b0101010101010101,
24636            a,
24637        );
24638        let e = _mm256_set_ph(
24639            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24640            16.0,
24641        );
24642        assert_eq_m256h(r, e);
24643    }
24644
24645    #[simd_test(enable = "avx512fp16")]
24646    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
24647        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24648        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24649            0b0101010101010101,
24650            a,
24651        );
24652        let e = _mm256_set_ph(
24653            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24654        );
24655        assert_eq_m256h(r, e);
24656    }
24657
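    // Scalar u32 -> f16 conversions: the converted value replaces lane 0 and the
    // remaining seven f16 lanes are copied through from `a` unchanged.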
24658    #[simd_test(enable = "avx512fp16")]
24659    unsafe fn test_mm_cvtu32_sh() {
24660        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24661        let r = _mm_cvtu32_sh(a, 10);
24662        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24663        assert_eq_m128h(r, e);
24664    }
24665
24666    #[simd_test(enable = "avx512fp16")]
24667    unsafe fn test_mm_cvt_roundu32_sh() {
24668        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24669        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24670        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24671        assert_eq_m128h(r, e);
24672    }
24673
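    // Signed 64-bit integer -> f16 conversions (_mm*_cvtepi64_ph). The 128-/256-bit
    // forms yield only 2/4 results, so the unused upper f16 lanes come back as zero.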
24674    #[simd_test(enable = "avx512fp16,avx512vl")]
24675    unsafe fn test_mm_cvtepi64_ph() {
24676        let a = _mm_set_epi64x(1, 2);
24677        let r = _mm_cvtepi64_ph(a);
24678        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24679        assert_eq_m128h(r, e);
24680    }
24681
24682    #[simd_test(enable = "avx512fp16,avx512vl")]
24683    unsafe fn test_mm_mask_cvtepi64_ph() {
24684        let a = _mm_set_epi64x(1, 2);
24685        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24686        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24687        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24688        assert_eq_m128h(r, e);
24689    }
24690
24691    #[simd_test(enable = "avx512fp16,avx512vl")]
24692    unsafe fn test_mm_maskz_cvtepi64_ph() {
24693        let a = _mm_set_epi64x(1, 2);
24694        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24695        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24696        assert_eq_m128h(r, e);
24697    }
24698
24699    #[simd_test(enable = "avx512fp16,avx512vl")]
24700    unsafe fn test_mm256_cvtepi64_ph() {
24701        let a = _mm256_set_epi64x(1, 2, 3, 4);
24702        let r = _mm256_cvtepi64_ph(a);
24703        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24704        assert_eq_m128h(r, e);
24705    }
24706
24707    #[simd_test(enable = "avx512fp16,avx512vl")]
24708    unsafe fn test_mm256_mask_cvtepi64_ph() {
24709        let a = _mm256_set_epi64x(1, 2, 3, 4);
24710        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24711        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24712        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24713        assert_eq_m128h(r, e);
24714    }
24715
24716    #[simd_test(enable = "avx512fp16,avx512vl")]
24717    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24718        let a = _mm256_set_epi64x(1, 2, 3, 4);
24719        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24720        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24721        assert_eq_m128h(r, e);
24722    }
24723
24724    #[simd_test(enable = "avx512fp16")]
24725    unsafe fn test_mm512_cvtepi64_ph() {
24726        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24727        let r = _mm512_cvtepi64_ph(a);
24728        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24729        assert_eq_m128h(r, e);
24730    }
24731
24732    #[simd_test(enable = "avx512fp16")]
24733    unsafe fn test_mm512_mask_cvtepi64_ph() {
24734        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24735        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24736        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24737        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24738        assert_eq_m128h(r, e);
24739    }
24740
24741    #[simd_test(enable = "avx512fp16")]
24742    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24743        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24744        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24745        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24746        assert_eq_m128h(r, e);
24747    }
24748
24749    #[simd_test(enable = "avx512fp16")]
24750    unsafe fn test_mm512_cvt_roundepi64_ph() {
24751        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24752        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24753        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24754        assert_eq_m128h(r, e);
24755    }
24756
24757    #[simd_test(enable = "avx512fp16")]
24758    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
24759        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24760        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24761        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24762            src, 0b01010101, a,
24763        );
24764        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24765        assert_eq_m128h(r, e);
24766    }
24767
24768    #[simd_test(enable = "avx512fp16")]
24769    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
24770        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24771        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24772            0b01010101, a,
24773        );
24774        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24775        assert_eq_m128h(r, e);
24776    }
24777
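    // Unsigned 64-bit integer -> f16 conversions (_mm*_cvtepu64_ph), mirroring the
    // signed cvtepi64_ph tests above.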
24778    #[simd_test(enable = "avx512fp16,avx512vl")]
24779    unsafe fn test_mm_cvtepu64_ph() {
24780        let a = _mm_set_epi64x(1, 2);
24781        let r = _mm_cvtepu64_ph(a);
24782        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24783        assert_eq_m128h(r, e);
24784    }
24785
24786    #[simd_test(enable = "avx512fp16,avx512vl")]
24787    unsafe fn test_mm_mask_cvtepu64_ph() {
24788        let a = _mm_set_epi64x(1, 2);
24789        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24790        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
24791        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24792        assert_eq_m128h(r, e);
24793    }
24794
24795    #[simd_test(enable = "avx512fp16,avx512vl")]
24796    unsafe fn test_mm_maskz_cvtepu64_ph() {
24797        let a = _mm_set_epi64x(1, 2);
24798        let r = _mm_maskz_cvtepu64_ph(0b01, a);
24799        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
24800        assert_eq_m128h(r, e);
24801    }
24802
24803    #[simd_test(enable = "avx512fp16,avx512vl")]
24804    unsafe fn test_mm256_cvtepu64_ph() {
24805        let a = _mm256_set_epi64x(1, 2, 3, 4);
24806        let r = _mm256_cvtepu64_ph(a);
24807        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24808        assert_eq_m128h(r, e);
24809    }
24810
24811    #[simd_test(enable = "avx512fp16,avx512vl")]
24812    unsafe fn test_mm256_mask_cvtepu64_ph() {
24813        let a = _mm256_set_epi64x(1, 2, 3, 4);
24814        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24815        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
24816        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24817        assert_eq_m128h(r, e);
24818    }
24819
24820    #[simd_test(enable = "avx512fp16,avx512vl")]
24821    unsafe fn test_mm256_maskz_cvtepu64_ph() {
24822        let a = _mm256_set_epi64x(1, 2, 3, 4);
24823        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
24824        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24825        assert_eq_m128h(r, e);
24826    }
24827
24828    #[simd_test(enable = "avx512fp16")]
24829    unsafe fn test_mm512_cvtepu64_ph() {
24830        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24831        let r = _mm512_cvtepu64_ph(a);
24832        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24833        assert_eq_m128h(r, e);
24834    }
24835
24836    #[simd_test(enable = "avx512fp16")]
24837    unsafe fn test_mm512_mask_cvtepu64_ph() {
24838        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24839        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24840        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
24841        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24842        assert_eq_m128h(r, e);
24843    }
24844
24845    #[simd_test(enable = "avx512fp16")]
24846    unsafe fn test_mm512_maskz_cvtepu64_ph() {
24847        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24848        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
24849        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24850        assert_eq_m128h(r, e);
24851    }
24852
24853    #[simd_test(enable = "avx512fp16")]
24854    unsafe fn test_mm512_cvt_roundepu64_ph() {
24855        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24856        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24857        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24858        assert_eq_m128h(r, e);
24859    }
24860
24861    #[simd_test(enable = "avx512fp16")]
24862    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
24863        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24864        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24865        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24866            src, 0b01010101, a,
24867        );
24868        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24869        assert_eq_m128h(r, e);
24870    }
24871
24872    #[simd_test(enable = "avx512fp16")]
24873    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
24874        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24875        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24876            0b01010101, a,
24877        );
24878        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24879        assert_eq_m128h(r, e);
24880    }
24881
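    // f32 -> f16 conversions (_mm*_cvtxps_ph). Masked variants take the converted lane
    // where the mask bit is set and otherwise fall back to `src` (mask_) or zero (maskz_).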
24882    #[simd_test(enable = "avx512fp16,avx512vl")]
24883    unsafe fn test_mm_cvtxps_ph() {
24884        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24885        let r = _mm_cvtxps_ph(a);
24886        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24887        assert_eq_m128h(r, e);
24888    }
24889
24890    #[simd_test(enable = "avx512fp16,avx512vl")]
24891    unsafe fn test_mm_mask_cvtxps_ph() {
24892        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24893        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24894        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
24895        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
24896        assert_eq_m128h(r, e);
24897    }
24898
24899    #[simd_test(enable = "avx512fp16,avx512vl")]
24900    unsafe fn test_mm_maskz_cvtxps_ph() {
24901        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24902        let r = _mm_maskz_cvtxps_ph(0b0101, a);
24903        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24904        assert_eq_m128h(r, e);
24905    }
24906
24907    #[simd_test(enable = "avx512fp16,avx512vl")]
24908    unsafe fn test_mm256_cvtxps_ph() {
24909        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24910        let r = _mm256_cvtxps_ph(a);
24911        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24912        assert_eq_m128h(r, e);
24913    }
24914
24915    #[simd_test(enable = "avx512fp16,avx512vl")]
24916    unsafe fn test_mm256_mask_cvtxps_ph() {
24917        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24918        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24919        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
24920        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24921        assert_eq_m128h(r, e);
24922    }
24923
24924    #[simd_test(enable = "avx512fp16,avx512vl")]
24925    unsafe fn test_mm256_maskz_cvtxps_ph() {
24926        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24927        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
24928        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24929        assert_eq_m128h(r, e);
24930    }
24931
24932    #[simd_test(enable = "avx512fp16")]
24933    unsafe fn test_mm512_cvtxps_ph() {
24934        let a = _mm512_set_ps(
24935            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24936        );
24937        let r = _mm512_cvtxps_ph(a);
24938        let e = _mm256_set_ph(
24939            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24940        );
24941        assert_eq_m256h(r, e);
24942    }
24943
24944    #[simd_test(enable = "avx512fp16")]
24945    unsafe fn test_mm512_mask_cvtxps_ph() {
24946        let a = _mm512_set_ps(
24947            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24948        );
24949        let src = _mm256_set_ph(
24950            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24951        );
24952        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
24953        let e = _mm256_set_ph(
24954            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24955        );
24956        assert_eq_m256h(r, e);
24957    }
24958
24959    #[simd_test(enable = "avx512fp16")]
24960    unsafe fn test_mm512_maskz_cvtxps_ph() {
24961        let a = _mm512_set_ps(
24962            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24963        );
24964        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
24965        let e = _mm256_set_ph(
24966            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24967        );
24968        assert_eq_m256h(r, e);
24969    }
24970
24971    #[simd_test(enable = "avx512fp16")]
24972    unsafe fn test_mm512_cvtx_roundps_ph() {
24973        let a = _mm512_set_ps(
24974            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24975        );
24976        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24977        let e = _mm256_set_ph(
24978            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24979        );
24980        assert_eq_m256h(r, e);
24981    }
24982
24983    #[simd_test(enable = "avx512fp16")]
24984    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
24985        let a = _mm512_set_ps(
24986            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24987        );
24988        let src = _mm256_set_ph(
24989            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24990        );
24991        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24992            src,
24993            0b0101010101010101,
24994            a,
24995        );
24996        let e = _mm256_set_ph(
24997            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24998            16.0,
24999        );
25000        assert_eq_m256h(r, e);
25001    }
25002
25003    #[simd_test(enable = "avx512fp16")]
25004    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
25005        let a = _mm512_set_ps(
25006            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25007        );
25008        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25009            0b0101010101010101,
25010            a,
25011        );
25012        let e = _mm256_set_ph(
25013            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25014        );
25015        assert_eq_m256h(r, e);
25016    }
25017
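    // Scalar f32 -> f16 conversions: lane 0 comes from `b` (or from `src`/zero under the
    // mask), lanes 1..7 are copied from `a`.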
25018    #[simd_test(enable = "avx512fp16")]
25019    unsafe fn test_mm_cvtss_sh() {
25020        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25021        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25022        let r = _mm_cvtss_sh(a, b);
25023        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25024        assert_eq_m128h(r, e);
25025    }
25026
25027    #[simd_test(enable = "avx512fp16")]
25028    unsafe fn test_mm_mask_cvtss_sh() {
25029        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25030        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25031        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25032        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25033        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25034        assert_eq_m128h(r, e);
25035        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25036        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25037        assert_eq_m128h(r, e);
25038    }
25039
25040    #[simd_test(enable = "avx512fp16")]
25041    unsafe fn test_mm_maskz_cvtss_sh() {
25042        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25043        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25044        let r = _mm_maskz_cvtss_sh(0, a, b);
25045        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25046        assert_eq_m128h(r, e);
25047        let r = _mm_maskz_cvtss_sh(1, a, b);
25048        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25049        assert_eq_m128h(r, e);
25050    }
25051
25052    #[simd_test(enable = "avx512fp16")]
25053    unsafe fn test_mm_cvt_roundss_sh() {
25054        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25055        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25056        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25057        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25058        assert_eq_m128h(r, e);
25059    }
25060
25061    #[simd_test(enable = "avx512fp16")]
25062    unsafe fn test_mm_mask_cvt_roundss_sh() {
25063        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25064        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25065        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25066        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25067            src, 0, a, b,
25068        );
25069        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25070        assert_eq_m128h(r, e);
25071        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25072            src, 1, a, b,
25073        );
25074        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25075        assert_eq_m128h(r, e);
25076    }
25077
25078    #[simd_test(enable = "avx512fp16")]
25079    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25080        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25081        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25082        let r =
25083            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25084        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25085        assert_eq_m128h(r, e);
25086        let r =
25087            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25088        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25089        assert_eq_m128h(r, e);
25090    }
25091
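    // f64 -> f16 conversions (_mm*_cvtpd_ph); the narrower forms zero the f16 lanes that
    // have no corresponding f64 source element.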
25092    #[simd_test(enable = "avx512fp16,avx512vl")]
25093    unsafe fn test_mm_cvtpd_ph() {
25094        let a = _mm_set_pd(1.0, 2.0);
25095        let r = _mm_cvtpd_ph(a);
25096        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25097        assert_eq_m128h(r, e);
25098    }
25099
25100    #[simd_test(enable = "avx512fp16,avx512vl")]
25101    unsafe fn test_mm_mask_cvtpd_ph() {
25102        let a = _mm_set_pd(1.0, 2.0);
25103        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25104        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25105        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25106        assert_eq_m128h(r, e);
25107    }
25108
25109    #[simd_test(enable = "avx512fp16,avx512vl")]
25110    unsafe fn test_mm_maskz_cvtpd_ph() {
25111        let a = _mm_set_pd(1.0, 2.0);
25112        let r = _mm_maskz_cvtpd_ph(0b01, a);
25113        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25114        assert_eq_m128h(r, e);
25115    }
25116
25117    #[simd_test(enable = "avx512fp16,avx512vl")]
25118    unsafe fn test_mm256_cvtpd_ph() {
25119        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25120        let r = _mm256_cvtpd_ph(a);
25121        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25122        assert_eq_m128h(r, e);
25123    }
25124
25125    #[simd_test(enable = "avx512fp16,avx512vl")]
25126    unsafe fn test_mm256_mask_cvtpd_ph() {
25127        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25128        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25129        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25130        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25131        assert_eq_m128h(r, e);
25132    }
25133
25134    #[simd_test(enable = "avx512fp16,avx512vl")]
25135    unsafe fn test_mm256_maskz_cvtpd_ph() {
25136        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25137        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25138        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25139        assert_eq_m128h(r, e);
25140    }
25141
25142    #[simd_test(enable = "avx512fp16")]
25143    unsafe fn test_mm512_cvtpd_ph() {
25144        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25145        let r = _mm512_cvtpd_ph(a);
25146        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25147        assert_eq_m128h(r, e);
25148    }
25149
25150    #[simd_test(enable = "avx512fp16")]
25151    unsafe fn test_mm512_mask_cvtpd_ph() {
25152        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25153        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25154        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25155        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25156        assert_eq_m128h(r, e);
25157    }
25158
25159    #[simd_test(enable = "avx512fp16")]
25160    unsafe fn test_mm512_maskz_cvtpd_ph() {
25161        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25162        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25163        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25164        assert_eq_m128h(r, e);
25165    }
25166
25167    #[simd_test(enable = "avx512fp16")]
25168    unsafe fn test_mm512_cvt_roundpd_ph() {
25169        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25170        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25171        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25172        assert_eq_m128h(r, e);
25173    }
25174
25175    #[simd_test(enable = "avx512fp16")]
25176    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25177        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25178        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25179        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25180            src, 0b01010101, a,
25181        );
25182        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25183        assert_eq_m128h(r, e);
25184    }
25185
25186    #[simd_test(enable = "avx512fp16")]
25187    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25188        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25189        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25190            0b01010101, a,
25191        );
25192        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25193        assert_eq_m128h(r, e);
25194    }
25195
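    // Scalar f64 -> f16 conversions, analogous to the cvtss_sh tests above.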
25196    #[simd_test(enable = "avx512fp16")]
25197    unsafe fn test_mm_cvtsd_sh() {
25198        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25199        let b = _mm_setr_pd(1.0, 2.0);
25200        let r = _mm_cvtsd_sh(a, b);
25201        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25202        assert_eq_m128h(r, e);
25203    }
25204
25205    #[simd_test(enable = "avx512fp16")]
25206    unsafe fn test_mm_mask_cvtsd_sh() {
25207        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25208        let b = _mm_setr_pd(1.0, 2.0);
25209        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25210        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25211        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25212        assert_eq_m128h(r, e);
25213        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25214        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25215        assert_eq_m128h(r, e);
25216    }
25217
25218    #[simd_test(enable = "avx512fp16")]
25219    unsafe fn test_mm_maskz_cvtsd_sh() {
25220        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25221        let b = _mm_setr_pd(1.0, 2.0);
25222        let r = _mm_maskz_cvtsd_sh(0, a, b);
25223        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25224        assert_eq_m128h(r, e);
25225        let r = _mm_maskz_cvtsd_sh(1, a, b);
25226        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25227        assert_eq_m128h(r, e);
25228    }
25229
25230    #[simd_test(enable = "avx512fp16")]
25231    unsafe fn test_mm_cvt_roundsd_sh() {
25232        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25233        let b = _mm_setr_pd(1.0, 2.0);
25234        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25235        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25236        assert_eq_m128h(r, e);
25237    }
25238
25239    #[simd_test(enable = "avx512fp16")]
25240    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25241        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25242        let b = _mm_setr_pd(1.0, 2.0);
25243        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25244        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25245            src, 0, a, b,
25246        );
25247        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25248        assert_eq_m128h(r, e);
25249        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25250            src, 1, a, b,
25251        );
25252        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25253        assert_eq_m128h(r, e);
25254    }
25255
25256    #[simd_test(enable = "avx512fp16")]
25257    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25258        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25259        let b = _mm_setr_pd(1.0, 2.0);
25260        let r =
25261            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25262        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25263        assert_eq_m128h(r, e);
25264        let r =
25265            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25266        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25267        assert_eq_m128h(r, e);
25268    }
25269
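    // f16 -> signed 16-bit integer conversions (_mm*_cvtph_epi16), rounding according to
    // the current rounding mode.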
25270    #[simd_test(enable = "avx512fp16,avx512vl")]
25271    unsafe fn test_mm_cvtph_epi16() {
25272        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25273        let r = _mm_cvtph_epi16(a);
25274        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25275        assert_eq_m128i(r, e);
25276    }
25277
25278    #[simd_test(enable = "avx512fp16,avx512vl")]
25279    unsafe fn test_mm_mask_cvtph_epi16() {
25280        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25281        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25282        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
25283        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25284        assert_eq_m128i(r, e);
25285    }
25286
25287    #[simd_test(enable = "avx512fp16,avx512vl")]
25288    unsafe fn test_mm_maskz_cvtph_epi16() {
25289        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25290        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
25291        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25292        assert_eq_m128i(r, e);
25293    }
25294
25295    #[simd_test(enable = "avx512fp16,avx512vl")]
25296    unsafe fn test_mm256_cvtph_epi16() {
25297        let a = _mm256_set_ph(
25298            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25299        );
25300        let r = _mm256_cvtph_epi16(a);
25301        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25302        assert_eq_m256i(r, e);
25303    }
25304
25305    #[simd_test(enable = "avx512fp16,avx512vl")]
25306    unsafe fn test_mm256_mask_cvtph_epi16() {
25307        let a = _mm256_set_ph(
25308            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25309        );
25310        let src = _mm256_set_epi16(
25311            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25312        );
25313        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
25314        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25315        assert_eq_m256i(r, e);
25316    }
25317
25318    #[simd_test(enable = "avx512fp16,avx512vl")]
25319    unsafe fn test_mm256_maskz_cvtph_epi16() {
25320        let a = _mm256_set_ph(
25321            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25322        );
25323        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
25324        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25325        assert_eq_m256i(r, e);
25326    }
25327
25328    #[simd_test(enable = "avx512fp16")]
25329    unsafe fn test_mm512_cvtph_epi16() {
25330        let a = _mm512_set_ph(
25331            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25332            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25333            31.0, 32.0,
25334        );
25335        let r = _mm512_cvtph_epi16(a);
25336        let e = _mm512_set_epi16(
25337            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25338            25, 26, 27, 28, 29, 30, 31, 32,
25339        );
25340        assert_eq_m512i(r, e);
25341    }
25342
25343    #[simd_test(enable = "avx512fp16")]
25344    unsafe fn test_mm512_mask_cvtph_epi16() {
25345        let a = _mm512_set_ph(
25346            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25347            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25348            31.0, 32.0,
25349        );
25350        let src = _mm512_set_epi16(
25351            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25352            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25353        );
25354        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
25355        let e = _mm512_set_epi16(
25356            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25357            24, 34, 26, 36, 28, 38, 30, 40, 32,
25358        );
25359        assert_eq_m512i(r, e);
25360    }
25361
25362    #[simd_test(enable = "avx512fp16")]
25363    unsafe fn test_mm512_maskz_cvtph_epi16() {
25364        let a = _mm512_set_ph(
25365            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25366            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25367            31.0, 32.0,
25368        );
25369        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
25370        let e = _mm512_set_epi16(
25371            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25372            0, 28, 0, 30, 0, 32,
25373        );
25374        assert_eq_m512i(r, e);
25375    }
25376
25377    #[simd_test(enable = "avx512fp16")]
25378    unsafe fn test_mm512_cvt_roundph_epi16() {
25379        let a = _mm512_set_ph(
25380            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25381            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25382            31.0, 32.0,
25383        );
25384        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25385        let e = _mm512_set_epi16(
25386            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25387            25, 26, 27, 28, 29, 30, 31, 32,
25388        );
25389        assert_eq_m512i(r, e);
25390    }
25391
25392    #[simd_test(enable = "avx512fp16")]
25393    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25394        let a = _mm512_set_ph(
25395            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25396            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25397            31.0, 32.0,
25398        );
25399        let src = _mm512_set_epi16(
25400            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25401            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25402        );
25403        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25404            src,
25405            0b01010101010101010101010101010101,
25406            a,
25407        );
25408        let e = _mm512_set_epi16(
25409            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25410            24, 34, 26, 36, 28, 38, 30, 40, 32,
25411        );
25412        assert_eq_m512i(r, e);
25413    }
25414
25415    #[simd_test(enable = "avx512fp16")]
25416    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25417        let a = _mm512_set_ph(
25418            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25419            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25420            31.0, 32.0,
25421        );
25422        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25423            0b01010101010101010101010101010101,
25424            a,
25425        );
25426        let e = _mm512_set_epi16(
25427            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25428            0, 28, 0, 30, 0, 32,
25429        );
25430        assert_eq_m512i(r, e);
25431    }
25432
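    // f16 -> unsigned 16-bit integer conversions (_mm*_cvtph_epu16).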
25433    #[simd_test(enable = "avx512fp16,avx512vl")]
25434    unsafe fn test_mm_cvtph_epu16() {
25435        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25436        let r = _mm_cvtph_epu16(a);
25437        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25438        assert_eq_m128i(r, e);
25439    }
25440
25441    #[simd_test(enable = "avx512fp16,avx512vl")]
25442    unsafe fn test_mm_mask_cvtph_epu16() {
25443        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25444        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25445        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
25446        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25447        assert_eq_m128i(r, e);
25448    }
25449
25450    #[simd_test(enable = "avx512fp16,avx512vl")]
25451    unsafe fn test_mm_maskz_cvtph_epu16() {
25452        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25453        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
25454        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25455        assert_eq_m128i(r, e);
25456    }
25457
25458    #[simd_test(enable = "avx512fp16,avx512vl")]
25459    unsafe fn test_mm256_cvtph_epu16() {
25460        let a = _mm256_set_ph(
25461            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25462        );
25463        let r = _mm256_cvtph_epu16(a);
25464        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25465        assert_eq_m256i(r, e);
25466    }
25467
25468    #[simd_test(enable = "avx512fp16,avx512vl")]
25469    unsafe fn test_mm256_mask_cvtph_epu16() {
25470        let a = _mm256_set_ph(
25471            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25472        );
25473        let src = _mm256_set_epi16(
25474            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25475        );
25476        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
25477        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25478        assert_eq_m256i(r, e);
25479    }
25480
25481    #[simd_test(enable = "avx512fp16,avx512vl")]
25482    unsafe fn test_mm256_maskz_cvtph_epu16() {
25483        let a = _mm256_set_ph(
25484            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25485        );
25486        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
25487        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25488        assert_eq_m256i(r, e);
25489    }
25490
25491    #[simd_test(enable = "avx512fp16")]
25492    unsafe fn test_mm512_cvtph_epu16() {
25493        let a = _mm512_set_ph(
25494            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25495            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25496            31.0, 32.0,
25497        );
25498        let r = _mm512_cvtph_epu16(a);
25499        let e = _mm512_set_epi16(
25500            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25501            25, 26, 27, 28, 29, 30, 31, 32,
25502        );
25503        assert_eq_m512i(r, e);
25504    }
25505
25506    #[simd_test(enable = "avx512fp16")]
25507    unsafe fn test_mm512_mask_cvtph_epu16() {
25508        let a = _mm512_set_ph(
25509            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25510            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25511            31.0, 32.0,
25512        );
25513        let src = _mm512_set_epi16(
25514            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25515            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25516        );
25517        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
25518        let e = _mm512_set_epi16(
25519            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25520            24, 34, 26, 36, 28, 38, 30, 40, 32,
25521        );
25522        assert_eq_m512i(r, e);
25523    }
25524
25525    #[simd_test(enable = "avx512fp16")]
25526    unsafe fn test_mm512_maskz_cvtph_epu16() {
25527        let a = _mm512_set_ph(
25528            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25529            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25530            31.0, 32.0,
25531        );
25532        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
25533        let e = _mm512_set_epi16(
25534            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25535            0, 28, 0, 30, 0, 32,
25536        );
25537        assert_eq_m512i(r, e);
25538    }
25539
25540    #[simd_test(enable = "avx512fp16")]
25541    unsafe fn test_mm512_cvt_roundph_epu16() {
25542        let a = _mm512_set_ph(
25543            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25544            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25545            31.0, 32.0,
25546        );
25547        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25548        let e = _mm512_set_epi16(
25549            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25550            25, 26, 27, 28, 29, 30, 31, 32,
25551        );
25552        assert_eq_m512i(r, e);
25553    }
25554
25555    #[simd_test(enable = "avx512fp16")]
25556    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
25557        let a = _mm512_set_ph(
25558            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25559            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25560            31.0, 32.0,
25561        );
25562        let src = _mm512_set_epi16(
25563            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25564            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25565        );
25566        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25567            src,
25568            0b01010101010101010101010101010101,
25569            a,
25570        );
25571        let e = _mm512_set_epi16(
25572            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25573            24, 34, 26, 36, 28, 38, 30, 40, 32,
25574        );
25575        assert_eq_m512i(r, e);
25576    }
25577
25578    #[simd_test(enable = "avx512fp16")]
25579    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
25580        let a = _mm512_set_ph(
25581            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25582            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25583            31.0, 32.0,
25584        );
25585        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25586            0b01010101010101010101010101010101,
25587            a,
25588        );
25589        let e = _mm512_set_epi16(
25590            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25591            0, 28, 0, 30, 0, 32,
25592        );
25593        assert_eq_m512i(r, e);
25594    }
25595
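    // f16 -> signed 16-bit integer conversions with truncation (_mm*_cvttph_epi16).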
25596    #[simd_test(enable = "avx512fp16,avx512vl")]
25597    unsafe fn test_mm_cvttph_epi16() {
25598        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25599        let r = _mm_cvttph_epi16(a);
25600        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25601        assert_eq_m128i(r, e);
25602    }
25603
25604    #[simd_test(enable = "avx512fp16,avx512vl")]
25605    unsafe fn test_mm_mask_cvttph_epi16() {
25606        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25607        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25608        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25609        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25610        assert_eq_m128i(r, e);
25611    }
25612
25613    #[simd_test(enable = "avx512fp16,avx512vl")]
25614    unsafe fn test_mm_maskz_cvttph_epi16() {
25615        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25616        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25617        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25618        assert_eq_m128i(r, e);
25619    }
25620
25621    #[simd_test(enable = "avx512fp16,avx512vl")]
25622    unsafe fn test_mm256_cvttph_epi16() {
25623        let a = _mm256_set_ph(
25624            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25625        );
25626        let r = _mm256_cvttph_epi16(a);
25627        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25628        assert_eq_m256i(r, e);
25629    }
25630
25631    #[simd_test(enable = "avx512fp16,avx512vl")]
25632    unsafe fn test_mm256_mask_cvttph_epi16() {
25633        let a = _mm256_set_ph(
25634            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25635        );
25636        let src = _mm256_set_epi16(
25637            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25638        );
25639        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25640        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25641        assert_eq_m256i(r, e);
25642    }
25643
25644    #[simd_test(enable = "avx512fp16,avx512vl")]
25645    unsafe fn test_mm256_maskz_cvttph_epi16() {
25646        let a = _mm256_set_ph(
25647            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25648        );
25649        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25650        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25651        assert_eq_m256i(r, e);
25652    }
25653
25654    #[simd_test(enable = "avx512fp16")]
25655    unsafe fn test_mm512_cvttph_epi16() {
25656        let a = _mm512_set_ph(
25657            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25658            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25659            31.0, 32.0,
25660        );
25661        let r = _mm512_cvttph_epi16(a);
25662        let e = _mm512_set_epi16(
25663            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25664            25, 26, 27, 28, 29, 30, 31, 32,
25665        );
25666        assert_eq_m512i(r, e);
25667    }
25668
25669    #[simd_test(enable = "avx512fp16")]
25670    unsafe fn test_mm512_mask_cvttph_epi16() {
25671        let a = _mm512_set_ph(
25672            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25673            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25674            31.0, 32.0,
25675        );
25676        let src = _mm512_set_epi16(
25677            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25678            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25679        );
25680        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25681        let e = _mm512_set_epi16(
25682            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25683            24, 34, 26, 36, 28, 38, 30, 40, 32,
25684        );
25685        assert_eq_m512i(r, e);
25686    }
25687
25688    #[simd_test(enable = "avx512fp16")]
25689    unsafe fn test_mm512_maskz_cvttph_epi16() {
25690        let a = _mm512_set_ph(
25691            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25692            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25693            31.0, 32.0,
25694        );
25695        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25696        let e = _mm512_set_epi16(
25697            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25698            0, 28, 0, 30, 0, 32,
25699        );
25700        assert_eq_m512i(r, e);
25701    }
25702
25703    #[simd_test(enable = "avx512fp16")]
25704    unsafe fn test_mm512_cvtt_roundph_epi16() {
25705        let a = _mm512_set_ph(
25706            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25707            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25708            31.0, 32.0,
25709        );
25710        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25711        let e = _mm512_set_epi16(
25712            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25713            25, 26, 27, 28, 29, 30, 31, 32,
25714        );
25715        assert_eq_m512i(r, e);
25716    }
25717
25718    #[simd_test(enable = "avx512fp16")]
25719    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
25720        let a = _mm512_set_ph(
25721            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25722            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25723            31.0, 32.0,
25724        );
25725        let src = _mm512_set_epi16(
25726            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25727            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25728        );
25729        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25730            src,
25731            0b01010101010101010101010101010101,
25732            a,
25733        );
25734        let e = _mm512_set_epi16(
25735            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25736            24, 34, 26, 36, 28, 38, 30, 40, 32,
25737        );
25738        assert_eq_m512i(r, e);
25739    }
25740
25741    #[simd_test(enable = "avx512fp16")]
25742    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
25743        let a = _mm512_set_ph(
25744            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25745            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25746            31.0, 32.0,
25747        );
25748        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25749            0b01010101010101010101010101010101,
25750            a,
25751        );
25752        let e = _mm512_set_epi16(
25753            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25754            0, 28, 0, 30, 0, 32,
25755        );
25756        assert_eq_m512i(r, e);
25757    }
25758
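    // f16 -> unsigned 16-bit integer conversions with truncation (_mm*_cvttph_epu16).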
25759    #[simd_test(enable = "avx512fp16,avx512vl")]
25760    unsafe fn test_mm_cvttph_epu16() {
25761        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25762        let r = _mm_cvttph_epu16(a);
25763        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25764        assert_eq_m128i(r, e);
25765    }
25766
25767    #[simd_test(enable = "avx512fp16,avx512vl")]
25768    unsafe fn test_mm_mask_cvttph_epu16() {
25769        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25770        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25771        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25772        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25773        assert_eq_m128i(r, e);
25774    }
25775
25776    #[simd_test(enable = "avx512fp16,avx512vl")]
25777    unsafe fn test_mm_maskz_cvttph_epu16() {
25778        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25779        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25780        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25781        assert_eq_m128i(r, e);
25782    }
25783
25784    #[simd_test(enable = "avx512fp16,avx512vl")]
25785    unsafe fn test_mm256_cvttph_epu16() {
25786        let a = _mm256_set_ph(
25787            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25788        );
25789        let r = _mm256_cvttph_epu16(a);
25790        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25791        assert_eq_m256i(r, e);
25792    }
25793
25794    #[simd_test(enable = "avx512fp16,avx512vl")]
25795    unsafe fn test_mm256_mask_cvttph_epu16() {
25796        let a = _mm256_set_ph(
25797            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25798        );
25799        let src = _mm256_set_epi16(
25800            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25801        );
25802        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25803        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25804        assert_eq_m256i(r, e);
25805    }
25806
25807    #[simd_test(enable = "avx512fp16,avx512vl")]
25808    unsafe fn test_mm256_maskz_cvttph_epu16() {
25809        let a = _mm256_set_ph(
25810            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25811        );
25812        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25813        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25814        assert_eq_m256i(r, e);
25815    }
25816
25817    #[simd_test(enable = "avx512fp16")]
25818    unsafe fn test_mm512_cvttph_epu16() {
25819        let a = _mm512_set_ph(
25820            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25821            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25822            31.0, 32.0,
25823        );
25824        let r = _mm512_cvttph_epu16(a);
25825        let e = _mm512_set_epi16(
25826            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25827            25, 26, 27, 28, 29, 30, 31, 32,
25828        );
25829        assert_eq_m512i(r, e);
25830    }
25831
25832    #[simd_test(enable = "avx512fp16")]
25833    unsafe fn test_mm512_mask_cvttph_epu16() {
25834        let a = _mm512_set_ph(
25835            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25836            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25837            31.0, 32.0,
25838        );
25839        let src = _mm512_set_epi16(
25840            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25841            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25842        );
25843        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25844        let e = _mm512_set_epi16(
25845            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25846            24, 34, 26, 36, 28, 38, 30, 40, 32,
25847        );
25848        assert_eq_m512i(r, e);
25849    }
25850
25851    #[simd_test(enable = "avx512fp16")]
25852    unsafe fn test_mm512_maskz_cvttph_epu16() {
25853        let a = _mm512_set_ph(
25854            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25855            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25856            31.0, 32.0,
25857        );
25858        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25859        let e = _mm512_set_epi16(
25860            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25861            0, 28, 0, 30, 0, 32,
25862        );
25863        assert_eq_m512i(r, e);
25864    }
25865
25866    #[simd_test(enable = "avx512fp16")]
25867    unsafe fn test_mm512_cvtt_roundph_epu16() {
25868        let a = _mm512_set_ph(
25869            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25870            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25871            31.0, 32.0,
25872        );
25873        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
25874        let e = _mm512_set_epi16(
25875            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25876            25, 26, 27, 28, 29, 30, 31, 32,
25877        );
25878        assert_eq_m512i(r, e);
25879    }
25880
25881    #[simd_test(enable = "avx512fp16")]
25882    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
25883        let a = _mm512_set_ph(
25884            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25885            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25886            31.0, 32.0,
25887        );
25888        let src = _mm512_set_epi16(
25889            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25890            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25891        );
25892        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25893            src,
25894            0b01010101010101010101010101010101,
25895            a,
25896        );
25897        let e = _mm512_set_epi16(
25898            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25899            24, 34, 26, 36, 28, 38, 30, 40, 32,
25900        );
25901        assert_eq_m512i(r, e);
25902    }
25903
25904    #[simd_test(enable = "avx512fp16")]
25905    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
25906        let a = _mm512_set_ph(
25907            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25908            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25909            31.0, 32.0,
25910        );
25911        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25912            0b01010101010101010101010101010101,
25913            a,
25914        );
25915        let e = _mm512_set_epi16(
25916            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25917            0, 28, 0, 30, 0, 32,
25918        );
25919        assert_eq_m512i(r, e);
25920    }
25921
25922    #[simd_test(enable = "avx512fp16,avx512vl")]
25923    unsafe fn test_mm_cvtph_epi32() {
25924        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25925        let r = _mm_cvtph_epi32(a);
25926        let e = _mm_set_epi32(1, 2, 3, 4);
25927        assert_eq_m128i(r, e);
25928    }
25929
25930    #[simd_test(enable = "avx512fp16,avx512vl")]
25931    unsafe fn test_mm_mask_cvtph_epi32() {
25932        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25933        let src = _mm_set_epi32(10, 11, 12, 13);
25934        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
25935        let e = _mm_set_epi32(10, 2, 12, 4);
25936        assert_eq_m128i(r, e);
25937    }
25938
25939    #[simd_test(enable = "avx512fp16,avx512vl")]
25940    unsafe fn test_mm_maskz_cvtph_epi32() {
25941        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25942        let r = _mm_maskz_cvtph_epi32(0b0101, a);
25943        let e = _mm_set_epi32(0, 2, 0, 4);
25944        assert_eq_m128i(r, e);
25945    }
25946
25947    #[simd_test(enable = "avx512fp16,avx512vl")]
25948    unsafe fn test_mm256_cvtph_epi32() {
25949        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25950        let r = _mm256_cvtph_epi32(a);
25951        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
25952        assert_eq_m256i(r, e);
25953    }
25954
25955    #[simd_test(enable = "avx512fp16,avx512vl")]
25956    unsafe fn test_mm256_mask_cvtph_epi32() {
25957        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25958        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
25959        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
25960        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
25961        assert_eq_m256i(r, e);
25962    }
25963
25964    #[simd_test(enable = "avx512fp16,avx512vl")]
25965    unsafe fn test_mm256_maskz_cvtph_epi32() {
25966        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25967        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
25968        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
25969        assert_eq_m256i(r, e);
25970    }
25971
25972    #[simd_test(enable = "avx512fp16")]
25973    unsafe fn test_mm512_cvtph_epi32() {
25974        let a = _mm256_set_ph(
25975            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25976        );
25977        let r = _mm512_cvtph_epi32(a);
25978        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25979        assert_eq_m512i(r, e);
25980    }
25981
25982    #[simd_test(enable = "avx512fp16")]
25983    unsafe fn test_mm512_mask_cvtph_epi32() {
25984        let a = _mm256_set_ph(
25985            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25986        );
25987        let src = _mm512_set_epi32(
25988            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25989        );
25990        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
25991        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25992        assert_eq_m512i(r, e);
25993    }
25994
25995    #[simd_test(enable = "avx512fp16")]
25996    unsafe fn test_mm512_maskz_cvtph_epi32() {
25997        let a = _mm256_set_ph(
25998            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25999        );
26000        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26001        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26002        assert_eq_m512i(r, e);
26003    }
26004
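    // The `_round` variants take the rounding control as a const generic parameter. These
    // tests pass `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` (round to nearest even,
    // suppress exceptions); the braces are required because the const argument is an
    // expression rather than a single literal or path.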
26005    #[simd_test(enable = "avx512fp16")]
26006    unsafe fn test_mm512_cvt_roundph_epi32() {
26007        let a = _mm256_set_ph(
26008            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26009        );
26010        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26011        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26012        assert_eq_m512i(r, e);
26013    }
26014
26015    #[simd_test(enable = "avx512fp16")]
26016    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
26017        let a = _mm256_set_ph(
26018            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26019        );
26020        let src = _mm512_set_epi32(
26021            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26022        );
26023        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26024            src,
26025            0b0101010101010101,
26026            a,
26027        );
26028        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26029        assert_eq_m512i(r, e);
26030    }
26031
26032    #[simd_test(enable = "avx512fp16")]
26033    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
26034        let a = _mm256_set_ph(
26035            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26036        );
26037        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26038            0b0101010101010101,
26039            a,
26040        );
26041        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26042        assert_eq_m512i(r, e);
26043    }
26044
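    // The scalar conversions (`_mm_cvtsh_i32` and friends) read only the lowest
    // half-precision element. `_mm_setr_ph` lists elements in index order, so element 0 is
    // the first argument (1.0) and the expected result is 1.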
26045    #[simd_test(enable = "avx512fp16")]
26046    unsafe fn test_mm_cvtsh_i32() {
26047        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26048        let r = _mm_cvtsh_i32(a);
26049        assert_eq!(r, 1);
26050    }
26051
26052    #[simd_test(enable = "avx512fp16")]
26053    unsafe fn test_mm_cvt_roundsh_i32() {
26054        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26055        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26056        assert_eq!(r, 1);
26057    }
26058
26059    #[simd_test(enable = "avx512fp16,avx512vl")]
26060    unsafe fn test_mm_cvtph_epu32() {
26061        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26062        let r = _mm_cvtph_epu32(a);
26063        let e = _mm_set_epi32(1, 2, 3, 4);
26064        assert_eq_m128i(r, e);
26065    }
26066
26067    #[simd_test(enable = "avx512fp16,avx512vl")]
26068    unsafe fn test_mm_mask_cvtph_epu32() {
26069        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26070        let src = _mm_set_epi32(10, 11, 12, 13);
26071        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26072        let e = _mm_set_epi32(10, 2, 12, 4);
26073        assert_eq_m128i(r, e);
26074    }
26075
26076    #[simd_test(enable = "avx512fp16,avx512vl")]
26077    unsafe fn test_mm_maskz_cvtph_epu32() {
26078        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26079        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26080        let e = _mm_set_epi32(0, 2, 0, 4);
26081        assert_eq_m128i(r, e);
26082    }
26083
26084    #[simd_test(enable = "avx512fp16,avx512vl")]
26085    unsafe fn test_mm256_cvtph_epu32() {
26086        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26087        let r = _mm256_cvtph_epu32(a);
26088        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26089        assert_eq_m256i(r, e);
26090    }
26091
26092    #[simd_test(enable = "avx512fp16,avx512vl")]
26093    unsafe fn test_mm256_mask_cvtph_epu32() {
26094        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26095        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26096        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26097        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26098        assert_eq_m256i(r, e);
26099    }
26100
26101    #[simd_test(enable = "avx512fp16,avx512vl")]
26102    unsafe fn test_mm256_maskz_cvtph_epu32() {
26103        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26104        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26105        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26106        assert_eq_m256i(r, e);
26107    }
26108
26109    #[simd_test(enable = "avx512fp16")]
26110    unsafe fn test_mm512_cvtph_epu32() {
26111        let a = _mm256_set_ph(
26112            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26113        );
26114        let r = _mm512_cvtph_epu32(a);
26115        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26116        assert_eq_m512i(r, e);
26117    }
26118
26119    #[simd_test(enable = "avx512fp16")]
26120    unsafe fn test_mm512_mask_cvtph_epu32() {
26121        let a = _mm256_set_ph(
26122            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26123        );
26124        let src = _mm512_set_epi32(
26125            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26126        );
26127        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26128        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26129        assert_eq_m512i(r, e);
26130    }
26131
26132    #[simd_test(enable = "avx512fp16")]
26133    unsafe fn test_mm512_maskz_cvtph_epu32() {
26134        let a = _mm256_set_ph(
26135            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26136        );
26137        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26138        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26139        assert_eq_m512i(r, e);
26140    }
26141
26142    #[simd_test(enable = "avx512fp16")]
26143    unsafe fn test_mm512_cvt_roundph_epu32() {
26144        let a = _mm256_set_ph(
26145            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26146        );
26147        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26148        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26149        assert_eq_m512i(r, e);
26150    }
26151
26152    #[simd_test(enable = "avx512fp16")]
26153    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
26154        let a = _mm256_set_ph(
26155            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26156        );
26157        let src = _mm512_set_epi32(
26158            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26159        );
26160        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26161            src,
26162            0b0101010101010101,
26163            a,
26164        );
26165        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26166        assert_eq_m512i(r, e);
26167    }
26168
26169    #[simd_test(enable = "avx512fp16")]
26170    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
26171        let a = _mm256_set_ph(
26172            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26173        );
26174        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26175            0b0101010101010101,
26176            a,
26177        );
26178        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26179        assert_eq_m512i(r, e);
26180    }
26181
26182    #[simd_test(enable = "avx512fp16")]
26183    unsafe fn test_mm_cvtsh_u32() {
26184        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26185        let r = _mm_cvtsh_u32(a);
26186        assert_eq!(r, 1);
26187    }
26188
26189    #[simd_test(enable = "avx512fp16")]
26190    unsafe fn test_mm_cvt_roundsh_u32() {
26191        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26192        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26193        assert_eq!(r, 1);
26194    }
26195
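    // The `cvtt*` intrinsics truncate toward zero, so for the integral inputs used in these
    // tests they produce the same results as the rounding `cvt*` variants. Their `_round`
    // forms take only the SAE flag (`_MM_FROUND_NO_EXC`); no rounding mode is passed because
    // truncation is implied.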
26196    #[simd_test(enable = "avx512fp16,avx512vl")]
26197    unsafe fn test_mm_cvttph_epi32() {
26198        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26199        let r = _mm_cvttph_epi32(a);
26200        let e = _mm_set_epi32(1, 2, 3, 4);
26201        assert_eq_m128i(r, e);
26202    }
26203
26204    #[simd_test(enable = "avx512fp16,avx512vl")]
26205    unsafe fn test_mm_mask_cvttph_epi32() {
26206        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26207        let src = _mm_set_epi32(10, 11, 12, 13);
26208        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26209        let e = _mm_set_epi32(10, 2, 12, 4);
26210        assert_eq_m128i(r, e);
26211    }
26212
26213    #[simd_test(enable = "avx512fp16,avx512vl")]
26214    unsafe fn test_mm_maskz_cvttph_epi32() {
26215        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26216        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26217        let e = _mm_set_epi32(0, 2, 0, 4);
26218        assert_eq_m128i(r, e);
26219    }
26220
26221    #[simd_test(enable = "avx512fp16,avx512vl")]
26222    unsafe fn test_mm256_cvttph_epi32() {
26223        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26224        let r = _mm256_cvttph_epi32(a);
26225        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26226        assert_eq_m256i(r, e);
26227    }
26228
26229    #[simd_test(enable = "avx512fp16,avx512vl")]
26230    unsafe fn test_mm256_mask_cvttph_epi32() {
26231        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26232        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26233        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26234        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26235        assert_eq_m256i(r, e);
26236    }
26237
26238    #[simd_test(enable = "avx512fp16,avx512vl")]
26239    unsafe fn test_mm256_maskz_cvttph_epi32() {
26240        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26241        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26242        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26243        assert_eq_m256i(r, e);
26244    }
26245
26246    #[simd_test(enable = "avx512fp16")]
26247    unsafe fn test_mm512_cvttph_epi32() {
26248        let a = _mm256_set_ph(
26249            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26250        );
26251        let r = _mm512_cvttph_epi32(a);
26252        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26253        assert_eq_m512i(r, e);
26254    }
26255
26256    #[simd_test(enable = "avx512fp16")]
26257    unsafe fn test_mm512_mask_cvttph_epi32() {
26258        let a = _mm256_set_ph(
26259            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26260        );
26261        let src = _mm512_set_epi32(
26262            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26263        );
26264        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26265        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26266        assert_eq_m512i(r, e);
26267    }
26268
26269    #[simd_test(enable = "avx512fp16")]
26270    unsafe fn test_mm512_maskz_cvttph_epi32() {
26271        let a = _mm256_set_ph(
26272            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26273        );
26274        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26275        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26276        assert_eq_m512i(r, e);
26277    }
26278
26279    #[simd_test(enable = "avx512fp16")]
26280    unsafe fn test_mm512_cvtt_roundph_epi32() {
26281        let a = _mm256_set_ph(
26282            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26283        );
26284        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26285        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26286        assert_eq_m512i(r, e);
26287    }
26288
26289    #[simd_test(enable = "avx512fp16")]
26290    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
26291        let a = _mm256_set_ph(
26292            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26293        );
26294        let src = _mm512_set_epi32(
26295            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26296        );
26297        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26298        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26299        assert_eq_m512i(r, e);
26300    }
26301
26302    #[simd_test(enable = "avx512fp16")]
26303    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
26304        let a = _mm256_set_ph(
26305            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26306        );
26307        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26308        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26309        assert_eq_m512i(r, e);
26310    }
26311
26312    #[simd_test(enable = "avx512fp16")]
26313    unsafe fn test_mm_cvttsh_i32() {
26314        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26315        let r = _mm_cvttsh_i32(a);
26316        assert_eq!(r, 1);
26317    }
26318
26319    #[simd_test(enable = "avx512fp16")]
26320    unsafe fn test_mm_cvtt_roundsh_i32() {
26321        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26322        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26323        assert_eq!(r, 1);
26324    }
26325
26326    #[simd_test(enable = "avx512fp16,avx512vl")]
26327    unsafe fn test_mm_cvttph_epu32() {
26328        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26329        let r = _mm_cvttph_epu32(a);
26330        let e = _mm_set_epi32(1, 2, 3, 4);
26331        assert_eq_m128i(r, e);
26332    }
26333
26334    #[simd_test(enable = "avx512fp16,avx512vl")]
26335    unsafe fn test_mm_mask_cvttph_epu32() {
26336        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26337        let src = _mm_set_epi32(10, 11, 12, 13);
26338        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26339        let e = _mm_set_epi32(10, 2, 12, 4);
26340        assert_eq_m128i(r, e);
26341    }
26342
26343    #[simd_test(enable = "avx512fp16,avx512vl")]
26344    unsafe fn test_mm_maskz_cvttph_epu32() {
26345        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26346        let r = _mm_maskz_cvttph_epu32(0b0101, a);
26347        let e = _mm_set_epi32(0, 2, 0, 4);
26348        assert_eq_m128i(r, e);
26349    }
26350
26351    #[simd_test(enable = "avx512fp16,avx512vl")]
26352    unsafe fn test_mm256_cvttph_epu32() {
26353        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26354        let r = _mm256_cvttph_epu32(a);
26355        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26356        assert_eq_m256i(r, e);
26357    }
26358
26359    #[simd_test(enable = "avx512fp16,avx512vl")]
26360    unsafe fn test_mm256_mask_cvttph_epu32() {
26361        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26362        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26363        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
26364        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26365        assert_eq_m256i(r, e);
26366    }
26367
26368    #[simd_test(enable = "avx512fp16,avx512vl")]
26369    unsafe fn test_mm256_maskz_cvttph_epu32() {
26370        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26371        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
26372        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26373        assert_eq_m256i(r, e);
26374    }
26375
26376    #[simd_test(enable = "avx512fp16")]
26377    unsafe fn test_mm512_cvttph_epu32() {
26378        let a = _mm256_set_ph(
26379            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26380        );
26381        let r = _mm512_cvttph_epu32(a);
26382        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26383        assert_eq_m512i(r, e);
26384    }
26385
26386    #[simd_test(enable = "avx512fp16")]
26387    unsafe fn test_mm512_mask_cvttph_epu32() {
26388        let a = _mm256_set_ph(
26389            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26390        );
26391        let src = _mm512_set_epi32(
26392            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26393        );
26394        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
26395        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26396        assert_eq_m512i(r, e);
26397    }
26398
26399    #[simd_test(enable = "avx512fp16")]
26400    unsafe fn test_mm512_maskz_cvttph_epu32() {
26401        let a = _mm256_set_ph(
26402            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26403        );
26404        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
26405        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26406        assert_eq_m512i(r, e);
26407    }
26408
26409    #[simd_test(enable = "avx512fp16")]
26410    unsafe fn test_mm512_cvtt_roundph_epu32() {
26411        let a = _mm256_set_ph(
26412            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26413        );
26414        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
26415        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26416        assert_eq_m512i(r, e);
26417    }
26418
26419    #[simd_test(enable = "avx512fp16")]
26420    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
26421        let a = _mm256_set_ph(
26422            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26423        );
26424        let src = _mm512_set_epi32(
26425            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26426        );
26427        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26428        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26429        assert_eq_m512i(r, e);
26430    }
26431
26432    #[simd_test(enable = "avx512fp16")]
26433    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
26434        let a = _mm256_set_ph(
26435            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26436        );
26437        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26438        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26439        assert_eq_m512i(r, e);
26440    }
26441
26442    #[simd_test(enable = "avx512fp16")]
26443    unsafe fn test_mm_cvttsh_u32() {
26444        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26445        let r = _mm_cvttsh_u32(a);
26446        assert_eq!(r, 1);
26447    }
26448
26449    #[simd_test(enable = "avx512fp16")]
26450    unsafe fn test_mm_cvtt_roundsh_u32() {
26451        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26452        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26453        assert_eq!(r, 1);
26454    }
26455
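    // The f16 -> 64-bit integer conversions consume only the low elements of the `__m128h`
    // source: 2 for the 128-bit forms, 4 for the 256-bit forms, and 8 for the 512-bit forms.
    // The remaining input lanes are set to 0.0 here and do not affect the result.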
26456    #[simd_test(enable = "avx512fp16,avx512vl")]
26457    unsafe fn test_mm_cvtph_epi64() {
26458        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26459        let r = _mm_cvtph_epi64(a);
26460        let e = _mm_set_epi64x(1, 2);
26461        assert_eq_m128i(r, e);
26462    }
26463
26464    #[simd_test(enable = "avx512fp16,avx512vl")]
26465    unsafe fn test_mm_mask_cvtph_epi64() {
26466        let src = _mm_set_epi64x(3, 4);
26467        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26468        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
26469        let e = _mm_set_epi64x(3, 2);
26470        assert_eq_m128i(r, e);
26471    }
26472
26473    #[simd_test(enable = "avx512fp16,avx512vl")]
26474    unsafe fn test_mm_maskz_cvtph_epi64() {
26475        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26476        let r = _mm_maskz_cvtph_epi64(0b01, a);
26477        let e = _mm_set_epi64x(0, 2);
26478        assert_eq_m128i(r, e);
26479    }
26480
26481    #[simd_test(enable = "avx512fp16,avx512vl")]
26482    unsafe fn test_mm256_cvtph_epi64() {
26483        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26484        let r = _mm256_cvtph_epi64(a);
26485        let e = _mm256_set_epi64x(1, 2, 3, 4);
26486        assert_eq_m256i(r, e);
26487    }
26488
26489    #[simd_test(enable = "avx512fp16,avx512vl")]
26490    unsafe fn test_mm256_mask_cvtph_epi64() {
26491        let src = _mm256_set_epi64x(5, 6, 7, 8);
26492        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26493        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
26494        let e = _mm256_set_epi64x(5, 2, 7, 4);
26495        assert_eq_m256i(r, e);
26496    }
26497
26498    #[simd_test(enable = "avx512fp16,avx512vl")]
26499    unsafe fn test_mm256_maskz_cvtph_epi64() {
26500        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26501        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
26502        let e = _mm256_set_epi64x(0, 2, 0, 4);
26503        assert_eq_m256i(r, e);
26504    }
26505
26506    #[simd_test(enable = "avx512fp16")]
26507    unsafe fn test_mm512_cvtph_epi64() {
26508        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26509        let r = _mm512_cvtph_epi64(a);
26510        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26511        assert_eq_m512i(r, e);
26512    }
26513
26514    #[simd_test(enable = "avx512fp16")]
26515    unsafe fn test_mm512_mask_cvtph_epi64() {
26516        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26517        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26518        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
26519        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26520        assert_eq_m512i(r, e);
26521    }
26522
26523    #[simd_test(enable = "avx512fp16")]
26524    unsafe fn test_mm512_maskz_cvtph_epi64() {
26525        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26526        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
26527        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26528        assert_eq_m512i(r, e);
26529    }
26530
26531    #[simd_test(enable = "avx512fp16")]
26532    unsafe fn test_mm512_cvt_roundph_epi64() {
26533        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26534        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26535        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26536        assert_eq_m512i(r, e);
26537    }
26538
26539    #[simd_test(enable = "avx512fp16")]
26540    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
26541        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26542        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26543        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26544            src, 0b01010101, a,
26545        );
26546        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26547        assert_eq_m512i(r, e);
26548    }
26549
26550    #[simd_test(enable = "avx512fp16")]
26551    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
26552        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26553        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26554            0b01010101, a,
26555        );
26556        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26557        assert_eq_m512i(r, e);
26558    }
26559
26560    #[simd_test(enable = "avx512fp16,avx512vl")]
26561    unsafe fn test_mm_cvtph_epu64() {
26562        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26563        let r = _mm_cvtph_epu64(a);
26564        let e = _mm_set_epi64x(1, 2);
26565        assert_eq_m128i(r, e);
26566    }
26567
26568    #[simd_test(enable = "avx512fp16,avx512vl")]
26569    unsafe fn test_mm_mask_cvtph_epu64() {
26570        let src = _mm_set_epi64x(3, 4);
26571        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26572        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
26573        let e = _mm_set_epi64x(3, 2);
26574        assert_eq_m128i(r, e);
26575    }
26576
26577    #[simd_test(enable = "avx512fp16,avx512vl")]
26578    unsafe fn test_mm_maskz_cvtph_epu64() {
26579        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26580        let r = _mm_maskz_cvtph_epu64(0b01, a);
26581        let e = _mm_set_epi64x(0, 2);
26582        assert_eq_m128i(r, e);
26583    }
26584
26585    #[simd_test(enable = "avx512fp16,avx512vl")]
26586    unsafe fn test_mm256_cvtph_epu64() {
26587        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26588        let r = _mm256_cvtph_epu64(a);
26589        let e = _mm256_set_epi64x(1, 2, 3, 4);
26590        assert_eq_m256i(r, e);
26591    }
26592
26593    #[simd_test(enable = "avx512fp16,avx512vl")]
26594    unsafe fn test_mm256_mask_cvtph_epu64() {
26595        let src = _mm256_set_epi64x(5, 6, 7, 8);
26596        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26597        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
26598        let e = _mm256_set_epi64x(5, 2, 7, 4);
26599        assert_eq_m256i(r, e);
26600    }
26601
26602    #[simd_test(enable = "avx512fp16,avx512vl")]
26603    unsafe fn test_mm256_maskz_cvtph_epu64() {
26604        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26605        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
26606        let e = _mm256_set_epi64x(0, 2, 0, 4);
26607        assert_eq_m256i(r, e);
26608    }
26609
26610    #[simd_test(enable = "avx512fp16")]
26611    unsafe fn test_mm512_cvtph_epu64() {
26612        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26613        let r = _mm512_cvtph_epu64(a);
26614        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26615        assert_eq_m512i(r, e);
26616    }
26617
26618    #[simd_test(enable = "avx512fp16")]
26619    unsafe fn test_mm512_mask_cvtph_epu64() {
26620        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26621        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26622        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
26623        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26624        assert_eq_m512i(r, e);
26625    }
26626
26627    #[simd_test(enable = "avx512fp16")]
26628    unsafe fn test_mm512_maskz_cvtph_epu64() {
26629        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26630        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
26631        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26632        assert_eq_m512i(r, e);
26633    }
26634
26635    #[simd_test(enable = "avx512fp16")]
26636    unsafe fn test_mm512_cvt_roundph_epu64() {
26637        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26638        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26639        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26640        assert_eq_m512i(r, e);
26641    }
26642
26643    #[simd_test(enable = "avx512fp16")]
26644    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
26645        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26646        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26647        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26648            src, 0b01010101, a,
26649        );
26650        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26651        assert_eq_m512i(r, e);
26652    }
26653
26654    #[simd_test(enable = "avx512fp16")]
26655    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
26656        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26657        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26658            0b01010101, a,
26659        );
26660        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26661        assert_eq_m512i(r, e);
26662    }
26663
26664    #[simd_test(enable = "avx512fp16,avx512vl")]
26665    unsafe fn test_mm_cvttph_epi64() {
26666        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26667        let r = _mm_cvttph_epi64(a);
26668        let e = _mm_set_epi64x(1, 2);
26669        assert_eq_m128i(r, e);
26670    }
26671
26672    #[simd_test(enable = "avx512fp16,avx512vl")]
26673    unsafe fn test_mm_mask_cvttph_epi64() {
26674        let src = _mm_set_epi64x(3, 4);
26675        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26676        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
26677        let e = _mm_set_epi64x(3, 2);
26678        assert_eq_m128i(r, e);
26679    }
26680
26681    #[simd_test(enable = "avx512fp16,avx512vl")]
26682    unsafe fn test_mm_maskz_cvttph_epi64() {
26683        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26684        let r = _mm_maskz_cvttph_epi64(0b01, a);
26685        let e = _mm_set_epi64x(0, 2);
26686        assert_eq_m128i(r, e);
26687    }
26688
26689    #[simd_test(enable = "avx512fp16,avx512vl")]
26690    unsafe fn test_mm256_cvttph_epi64() {
26691        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26692        let r = _mm256_cvttph_epi64(a);
26693        let e = _mm256_set_epi64x(1, 2, 3, 4);
26694        assert_eq_m256i(r, e);
26695    }
26696
26697    #[simd_test(enable = "avx512fp16,avx512vl")]
26698    unsafe fn test_mm256_mask_cvttph_epi64() {
26699        let src = _mm256_set_epi64x(5, 6, 7, 8);
26700        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26701        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
26702        let e = _mm256_set_epi64x(5, 2, 7, 4);
26703        assert_eq_m256i(r, e);
26704    }
26705
26706    #[simd_test(enable = "avx512fp16,avx512vl")]
26707    unsafe fn test_mm256_maskz_cvttph_epi64() {
26708        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26709        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
26710        let e = _mm256_set_epi64x(0, 2, 0, 4);
26711        assert_eq_m256i(r, e);
26712    }
26713
26714    #[simd_test(enable = "avx512fp16")]
26715    unsafe fn test_mm512_cvttph_epi64() {
26716        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26717        let r = _mm512_cvttph_epi64(a);
26718        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26719        assert_eq_m512i(r, e);
26720    }
26721
26722    #[simd_test(enable = "avx512fp16")]
26723    unsafe fn test_mm512_mask_cvttph_epi64() {
26724        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26725        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26726        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
26727        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26728        assert_eq_m512i(r, e);
26729    }
26730
26731    #[simd_test(enable = "avx512fp16")]
26732    unsafe fn test_mm512_maskz_cvttph_epi64() {
26733        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26734        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
26735        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26736        assert_eq_m512i(r, e);
26737    }
26738
26739    #[simd_test(enable = "avx512fp16")]
26740    unsafe fn test_mm512_cvtt_roundph_epi64() {
26741        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26742        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
26743        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26744        assert_eq_m512i(r, e);
26745    }
26746
26747    #[simd_test(enable = "avx512fp16")]
26748    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
26749        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26750        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26751        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26752        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26753        assert_eq_m512i(r, e);
26754    }
26755
26756    #[simd_test(enable = "avx512fp16")]
26757    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
26758        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26759        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26760        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26761        assert_eq_m512i(r, e);
26762    }
26763
26764    #[simd_test(enable = "avx512fp16,avx512vl")]
26765    unsafe fn test_mm_cvttph_epu64() {
26766        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26767        let r = _mm_cvttph_epu64(a);
26768        let e = _mm_set_epi64x(1, 2);
26769        assert_eq_m128i(r, e);
26770    }
26771
26772    #[simd_test(enable = "avx512fp16,avx512vl")]
26773    unsafe fn test_mm_mask_cvttph_epu64() {
26774        let src = _mm_set_epi64x(3, 4);
26775        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26776        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
26777        let e = _mm_set_epi64x(3, 2);
26778        assert_eq_m128i(r, e);
26779    }
26780
26781    #[simd_test(enable = "avx512fp16,avx512vl")]
26782    unsafe fn test_mm_maskz_cvttph_epu64() {
26783        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26784        let r = _mm_maskz_cvttph_epu64(0b01, a);
26785        let e = _mm_set_epi64x(0, 2);
26786        assert_eq_m128i(r, e);
26787    }
26788
26789    #[simd_test(enable = "avx512fp16,avx512vl")]
26790    unsafe fn test_mm256_cvttph_epu64() {
26791        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26792        let r = _mm256_cvttph_epu64(a);
26793        let e = _mm256_set_epi64x(1, 2, 3, 4);
26794        assert_eq_m256i(r, e);
26795    }
26796
26797    #[simd_test(enable = "avx512fp16,avx512vl")]
26798    unsafe fn test_mm256_mask_cvttph_epu64() {
26799        let src = _mm256_set_epi64x(5, 6, 7, 8);
26800        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26801        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
26802        let e = _mm256_set_epi64x(5, 2, 7, 4);
26803        assert_eq_m256i(r, e);
26804    }
26805
26806    #[simd_test(enable = "avx512fp16,avx512vl")]
26807    unsafe fn test_mm256_maskz_cvttph_epu64() {
26808        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26809        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
26810        let e = _mm256_set_epi64x(0, 2, 0, 4);
26811        assert_eq_m256i(r, e);
26812    }
26813
26814    #[simd_test(enable = "avx512fp16")]
26815    unsafe fn test_mm512_cvttph_epu64() {
26816        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26817        let r = _mm512_cvttph_epu64(a);
26818        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26819        assert_eq_m512i(r, e);
26820    }
26821
26822    #[simd_test(enable = "avx512fp16")]
26823    unsafe fn test_mm512_mask_cvttph_epu64() {
26824        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26825        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26826        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
26827        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26828        assert_eq_m512i(r, e);
26829    }
26830
26831    #[simd_test(enable = "avx512fp16")]
26832    unsafe fn test_mm512_maskz_cvttph_epu64() {
26833        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26834        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
26835        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26836        assert_eq_m512i(r, e);
26837    }
26838
26839    #[simd_test(enable = "avx512fp16")]
26840    unsafe fn test_mm512_cvtt_roundph_epu64() {
26841        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26842        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
26843        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26844        assert_eq_m512i(r, e);
26845    }
26846
26847    #[simd_test(enable = "avx512fp16")]
26848    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
26849        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26850        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26851        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26852        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26853        assert_eq_m512i(r, e);
26854    }
26855
26856    #[simd_test(enable = "avx512fp16")]
26857    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
26858        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26859        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26860        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26861        assert_eq_m512i(r, e);
26862    }
26863
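    // `cvtxph_ps` widens packed f16 to f32; the `x` marks the AVX512-FP16 form, which takes
    // an `__m128h`, as opposed to the older F16C `_mm_cvtph_ps` that takes an `__m128i` of
    // raw half-precision bits.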
26864    #[simd_test(enable = "avx512fp16,avx512vl")]
26865    unsafe fn test_mm_cvtxph_ps() {
26866        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26867        let r = _mm_cvtxph_ps(a);
26868        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
26869        assert_eq_m128(r, e);
26870    }
26871
26872    #[simd_test(enable = "avx512fp16,avx512vl")]
26873    unsafe fn test_mm_mask_cvtxph_ps() {
26874        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
26875        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26876        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
26877        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
26878        assert_eq_m128(r, e);
26879    }
26880
26881    #[simd_test(enable = "avx512fp16,avx512vl")]
26882    unsafe fn test_mm_maskz_cvtxph_ps() {
26883        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26884        let r = _mm_maskz_cvtxph_ps(0b0101, a);
26885        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
26886        assert_eq_m128(r, e);
26887    }
26888
26889    #[simd_test(enable = "avx512fp16,avx512vl")]
26890    unsafe fn test_mm256_cvtxph_ps() {
26891        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26892        let r = _mm256_cvtxph_ps(a);
26893        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26894        assert_eq_m256(r, e);
26895    }
26896
26897    #[simd_test(enable = "avx512fp16,avx512vl")]
26898    unsafe fn test_mm256_mask_cvtxph_ps() {
26899        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
26900        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26901        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
26902        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
26903        assert_eq_m256(r, e);
26904    }
26905
26906    #[simd_test(enable = "avx512fp16,avx512vl")]
26907    unsafe fn test_mm256_maskz_cvtxph_ps() {
26908        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26909        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
26910        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
26911        assert_eq_m256(r, e);
26912    }
26913
26914    #[simd_test(enable = "avx512fp16")]
26915    unsafe fn test_mm512_cvtxph_ps() {
26916        let a = _mm256_set_ph(
26917            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26918        );
26919        let r = _mm512_cvtxph_ps(a);
26920        let e = _mm512_set_ps(
26921            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26922        );
26923        assert_eq_m512(r, e);
26924    }
26925
26926    #[simd_test(enable = "avx512fp16")]
26927    unsafe fn test_mm512_mask_cvtxph_ps() {
26928        let src = _mm512_set_ps(
26929            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26930            24.0, 25.0,
26931        );
26932        let a = _mm256_set_ph(
26933            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26934        );
26935        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
26936        let e = _mm512_set_ps(
26937            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26938            16.0,
26939        );
26940        assert_eq_m512(r, e);
26941    }
26942
26943    #[simd_test(enable = "avx512fp16")]
26944    unsafe fn test_mm512_maskz_cvtxph_ps() {
26945        let a = _mm256_set_ph(
26946            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26947        );
26948        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
26949        let e = _mm512_set_ps(
26950            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26951        );
26952        assert_eq_m512(r, e);
26953    }
26954
26955    #[simd_test(enable = "avx512fp16")]
26956    unsafe fn test_mm512_cvtx_roundph_ps() {
26957        let a = _mm256_set_ph(
26958            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26959        );
26960        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
26961        let e = _mm512_set_ps(
26962            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26963        );
26964        assert_eq_m512(r, e);
26965    }
26966
26967    #[simd_test(enable = "avx512fp16")]
26968    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
26969        let src = _mm512_set_ps(
26970            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
26971            24.0, 25.0,
26972        );
26973        let a = _mm256_set_ph(
26974            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26975        );
26976        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26977        let e = _mm512_set_ps(
26978            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
26979            16.0,
26980        );
26981        assert_eq_m512(r, e);
26982    }
26983
26984    #[simd_test(enable = "avx512fp16")]
26985    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
26986        let a = _mm256_set_ph(
26987            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26988        );
26989        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26990        let e = _mm512_set_ps(
26991            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
26992        );
26993        assert_eq_m512(r, e);
26994    }
26995
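    // For the scalar f16 -> f32 conversions, element 0 of the result is the converted lowest
    // half-precision element of `b`; elements 1..3 are copied from `a`. The masked variants
    // are exercised with both mask values: 0 keeps `src` (or zero), 1 takes the converted
    // value.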
26996    #[simd_test(enable = "avx512fp16")]
26997    unsafe fn test_mm_cvtsh_ss() {
26998        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
26999        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27000        let r = _mm_cvtsh_ss(a, b);
27001        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27002        assert_eq_m128(r, e);
27003    }
27004
27005    #[simd_test(enable = "avx512fp16")]
27006    unsafe fn test_mm_mask_cvtsh_ss() {
27007        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27008        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27009        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27010        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27011        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27012        assert_eq_m128(r, e);
27013        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27014        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27015        assert_eq_m128(r, e);
27016    }
27017
27018    #[simd_test(enable = "avx512fp16")]
27019    unsafe fn test_mm_maskz_cvtsh_ss() {
27020        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27021        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27022        let r = _mm_maskz_cvtsh_ss(0, a, b);
27023        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27024        assert_eq_m128(r, e);
27025        let r = _mm_maskz_cvtsh_ss(1, a, b);
27026        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27027        assert_eq_m128(r, e);
27028    }
27029
27030    #[simd_test(enable = "avx512fp16")]
27031    unsafe fn test_mm_cvt_roundsh_ss() {
27032        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27033        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27034        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27035        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27036        assert_eq_m128(r, e);
27037    }
27038
27039    #[simd_test(enable = "avx512fp16")]
27040    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27041        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27042        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27043        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27044        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27045        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27046        assert_eq_m128(r, e);
27047        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27048        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27049        assert_eq_m128(r, e);
27050    }
27051
27052    #[simd_test(enable = "avx512fp16")]
27053    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27054        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27055        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27056        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27057        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27058        assert_eq_m128(r, e);
27059        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27060        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27061        assert_eq_m128(r, e);
27062    }
27063
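    // The packed f16 -> f64 conversions follow the same low-element layout as the 64-bit
    // integer conversions above (2, 4, or 8 source elements), producing `__m128d`, `__m256d`,
    // and `__m512d` results respectively.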
27064    #[simd_test(enable = "avx512fp16,avx512vl")]
27065    unsafe fn test_mm_cvtph_pd() {
27066        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27067        let r = _mm_cvtph_pd(a);
27068        let e = _mm_set_pd(1.0, 2.0);
27069        assert_eq_m128d(r, e);
27070    }
27071
27072    #[simd_test(enable = "avx512fp16,avx512vl")]
27073    unsafe fn test_mm_mask_cvtph_pd() {
27074        let src = _mm_set_pd(10.0, 11.0);
27075        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27076        let r = _mm_mask_cvtph_pd(src, 0b01, a);
27077        let e = _mm_set_pd(10.0, 2.0);
27078        assert_eq_m128d(r, e);
27079    }
27080
27081    #[simd_test(enable = "avx512fp16,avx512vl")]
27082    unsafe fn test_mm_maskz_cvtph_pd() {
27083        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
27084        let r = _mm_maskz_cvtph_pd(0b01, a);
27085        let e = _mm_set_pd(0.0, 2.0);
27086        assert_eq_m128d(r, e);
27087    }
27088
27089    #[simd_test(enable = "avx512fp16,avx512vl")]
27090    unsafe fn test_mm256_cvtph_pd() {
27091        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27092        let r = _mm256_cvtph_pd(a);
27093        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
27094        assert_eq_m256d(r, e);
27095    }
27096
27097    #[simd_test(enable = "avx512fp16,avx512vl")]
27098    unsafe fn test_mm256_mask_cvtph_pd() {
27099        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
27100        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27101        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
27102        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
27103        assert_eq_m256d(r, e);
27104    }
27105
27106    #[simd_test(enable = "avx512fp16,avx512vl")]
27107    unsafe fn test_mm256_maskz_cvtph_pd() {
27108        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
27109        let r = _mm256_maskz_cvtph_pd(0b0101, a);
27110        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
27111        assert_eq_m256d(r, e);
27112    }
27113
27114    #[simd_test(enable = "avx512fp16")]
27115    unsafe fn test_mm512_cvtph_pd() {
27116        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27117        let r = _mm512_cvtph_pd(a);
27118        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27119        assert_eq_m512d(r, e);
27120    }
27121
27122    #[simd_test(enable = "avx512fp16")]
27123    unsafe fn test_mm512_mask_cvtph_pd() {
27124        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27125        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27126        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
27127        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27128        assert_eq_m512d(r, e);
27129    }
27130
27131    #[simd_test(enable = "avx512fp16")]
27132    unsafe fn test_mm512_maskz_cvtph_pd() {
27133        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27134        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
27135        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27136        assert_eq_m512d(r, e);
27137    }
27138
27139    #[simd_test(enable = "avx512fp16")]
27140    unsafe fn test_mm512_cvt_roundph_pd() {
27141        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27142        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
27143        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27144        assert_eq_m512d(r, e);
27145    }
27146
27147    #[simd_test(enable = "avx512fp16")]
27148    unsafe fn test_mm512_mask_cvt_roundph_pd() {
27149        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
27150        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27151        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
27152        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
27153        assert_eq_m512d(r, e);
27154    }
27155
27156    #[simd_test(enable = "avx512fp16")]
27157    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
27158        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
27159        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
27160        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
27161        assert_eq_m512d(r, e);
27162    }
27163
27164    #[simd_test(enable = "avx512fp16")]
27165    unsafe fn test_mm_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_sd(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_sd() {
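        // Only mask bit 0 matters for the scalar form: when it is clear the low
        // lane comes from src, when it is set the converted value is written.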
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_sd(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvtsh_sd(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_h() {
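        // _mm_cvtsh_h extracts the lowest f16 lane as a scalar; with setr,
        // lane 0 is the first argument (1.0).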
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm256_cvtsh_h() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtsh_h() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi128_si16() {
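        // _mm_cvtsi128_si16 returns the low 16-bit integer lane of the vector.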
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtsi128_si16(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128() {
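        // _mm_cvtsi16_si128 copies the 16-bit integer into lane 0 and zeroes the
        // remaining lanes.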
        let a = 1;
        let r = _mm_cvtsi16_si128(a);
        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
}