core/stdarch/crates/core_arch/src/x86/
avxneconvert.rs

1use crate::arch::asm;
2use crate::core_arch::x86::*;
3
4#[cfg(test)]
5use stdarch_test::assert_instr;
6
7/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
8/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
9/// floating-point elements, and store the results in dst.
10///
11/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
12#[inline]
13#[target_feature(enable = "avxneconvert")]
14#[cfg_attr(
15    all(test, any(target_os = "linux", target_env = "msvc")),
16    assert_instr(vbcstnebf162ps)
17)]
18#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
19pub unsafe fn _mm_bcstnebf16_ps(a: *const bf16) -> __m128 {
20    bcstnebf162ps_128(a)
21}
22
23/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
24/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
25/// elements, and store the results in dst.
26///
27/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
28#[inline]
29#[target_feature(enable = "avxneconvert")]
30#[cfg_attr(
31    all(test, any(target_os = "linux", target_env = "msvc")),
32    assert_instr(vbcstnebf162ps)
33)]
34#[unstable(feature = "stdarch_x86_avx512_bf16", issue = "127356")]
35pub unsafe fn _mm256_bcstnebf16_ps(a: *const bf16) -> __m256 {
36    bcstnebf162ps_256(a)
37}
38
39/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
40/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
41/// (32-bit) floating-point elements, and store the results in dst.
42///
43/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
44#[inline]
45#[target_feature(enable = "avxneconvert")]
46#[cfg_attr(
47    all(test, any(target_os = "linux", target_env = "msvc")),
48    assert_instr(vbcstnesh2ps)
49)]
50#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
51pub unsafe fn _mm_bcstnesh_ps(a: *const f16) -> __m128 {
52    bcstnesh2ps_128(a)
53}
54
55/// Convert scalar half-precision (16-bit) floating-point element stored at memory locations starting
56/// at location a to a single-precision (32-bit) floating-point, broadcast it to packed single-precision
57/// (32-bit) floating-point elements, and store the results in dst.
58///
59/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
60#[inline]
61#[target_feature(enable = "avxneconvert")]
62#[cfg_attr(
63    all(test, any(target_os = "linux", target_env = "msvc")),
64    assert_instr(vbcstnesh2ps)
65)]
66#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
67pub unsafe fn _mm256_bcstnesh_ps(a: *const f16) -> __m256 {
68    bcstnesh2ps_256(a)
69}
70
71/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
72/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
73///
74/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
75#[inline]
76#[target_feature(enable = "avxneconvert")]
77#[cfg_attr(
78    all(test, any(target_os = "linux", target_env = "msvc")),
79    assert_instr(vcvtneebf162ps)
80)]
81#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
82pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
83    transmute(cvtneebf162ps_128(a))
84}
85
86/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
87/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
88///
89/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
90#[inline]
91#[target_feature(enable = "avxneconvert")]
92#[cfg_attr(
93    all(test, any(target_os = "linux", target_env = "msvc")),
94    assert_instr(vcvtneebf162ps)
95)]
96#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
97pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
98    transmute(cvtneebf162ps_256(a))
99}
100
101/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
102/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
103///
104/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
105#[inline]
106#[target_feature(enable = "avxneconvert")]
107#[cfg_attr(
108    all(test, any(target_os = "linux", target_env = "msvc")),
109    assert_instr(vcvtneeph2ps)
110)]
111#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
112pub unsafe fn _mm_cvtneeph_ps(a: *const __m128h) -> __m128 {
113    transmute(cvtneeph2ps_128(a))
114}
115
116/// Convert packed half-precision (16-bit) floating-point even-indexed elements stored at memory locations starting at
117/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
118///
119/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
120#[inline]
121#[target_feature(enable = "avxneconvert")]
122#[cfg_attr(
123    all(test, any(target_os = "linux", target_env = "msvc")),
124    assert_instr(vcvtneeph2ps)
125)]
126#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
127pub unsafe fn _mm256_cvtneeph_ps(a: *const __m256h) -> __m256 {
128    transmute(cvtneeph2ps_256(a))
129}
130
131/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
132/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
133///
134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
135#[inline]
136#[target_feature(enable = "avxneconvert")]
137#[cfg_attr(
138    all(test, any(target_os = "linux", target_env = "msvc")),
139    assert_instr(vcvtneobf162ps)
140)]
141#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
142pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
143    transmute(cvtneobf162ps_128(a))
144}
145
146/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
147/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
148///
149/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
150#[inline]
151#[target_feature(enable = "avxneconvert")]
152#[cfg_attr(
153    all(test, any(target_os = "linux", target_env = "msvc")),
154    assert_instr(vcvtneobf162ps)
155)]
156#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
157pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
158    transmute(cvtneobf162ps_256(a))
159}
160
161/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
162/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
163///
164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
165#[inline]
166#[target_feature(enable = "avxneconvert")]
167#[cfg_attr(
168    all(test, any(target_os = "linux", target_env = "msvc")),
169    assert_instr(vcvtneoph2ps)
170)]
171#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
172pub unsafe fn _mm_cvtneoph_ps(a: *const __m128h) -> __m128 {
173    transmute(cvtneoph2ps_128(a))
174}
175
176/// Convert packed half-precision (16-bit) floating-point odd-indexed elements stored at memory locations starting at
177/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
178///
179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
180#[inline]
181#[target_feature(enable = "avxneconvert")]
182#[cfg_attr(
183    all(test, any(target_os = "linux", target_env = "msvc")),
184    assert_instr(vcvtneoph2ps)
185)]
186#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
187pub unsafe fn _mm256_cvtneoph_ps(a: *const __m256h) -> __m256 {
188    transmute(cvtneoph2ps_256(a))
189}
190
/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// The four `f32` elements of `a` produce four BF16 values in the low elements of the returned
/// `__m128bh`. The operand is passed by value, so there is no pointer contract for the caller.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vcvtneps2bf16)
)]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub fn _mm_cvtneps_avx_pbh(a: __m128) -> __m128bh {
    // SAFETY: the asm only reads the `src` register and writes the `dst` register; it is
    // declared below as touching neither memory, the stack, nor the flags.
    unsafe {
        // Assigned exactly once, by the `lateout` operand of the asm block.
        let mut dst: __m128bh;
        asm!(
            // `{{vex}}` is the escaped form of a literal `{vex}` in the emitted assembly.
            // NOTE(review): presumably this requests the VEX (AVX-NE-CONVERT) encoding rather
            // than an EVEX one — confirm against the assembler documentation.
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            // `lateout`: the output register may reuse an input register.
            dst = lateout(xmm_reg) dst,
            src = in(xmm_reg) a,
            // pure + nomem: the result depends only on `src` and no memory is accessed, so the
            // compiler may deduplicate or remove the asm when the result is unused.
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}
214
/// Convert packed single precision (32-bit) floating-point elements in a to packed BF16 (16-bit) floating-point
/// elements, and store the results in dst.
///
/// The eight `f32` elements of `a` produce the eight BF16 elements of the returned `__m128bh`.
/// The operand is passed by value, so there is no pointer contract for the caller.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
#[inline]
#[target_feature(enable = "avxneconvert")]
#[cfg_attr(
    all(test, any(target_os = "linux", target_env = "msvc")),
    assert_instr(vcvtneps2bf16)
)]
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
pub fn _mm256_cvtneps_avx_pbh(a: __m256) -> __m128bh {
    // SAFETY: the asm only reads the `src` register and writes the `dst` register; it is
    // declared below as touching neither memory, the stack, nor the flags.
    unsafe {
        // Assigned exactly once, by the `lateout` operand of the asm block.
        let mut dst: __m128bh;
        asm!(
            // `{{vex}}` is the escaped form of a literal `{vex}` in the emitted assembly.
            // NOTE(review): presumably this requests the VEX (AVX-NE-CONVERT) encoding rather
            // than an EVEX one — confirm against the assembler documentation.
            "{{vex}}vcvtneps2bf16 {dst},{src}",
            // The destination is a 128-bit register while the source is a 256-bit register:
            // eight 32-bit inputs narrow to eight 16-bit outputs.
            dst = lateout(xmm_reg) dst,
            src = in(ymm_reg) a,
            // pure + nomem: the result depends only on `src` and no memory is accessed, so the
            // compiler may deduplicate or remove the asm when the result is unused.
            options(pure, nomem, nostack, preserves_flags)
        );
        dst
    }
}
238
// Bindings to the LLVM intrinsics that back the public functions in this file. Each
// `link_name` selects an `llvm.x86.*` intrinsic rather than a real C symbol; the lint is
// silenced presumably because the SIMD vector types in these signatures are not considered
// FFI-safe by `improper_ctypes`.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Scalar load + convert + broadcast (BF16 and half-precision sources).
    #[link_name = "llvm.x86.vbcstnebf162ps128"]
    fn bcstnebf162ps_128(a: *const bf16) -> __m128;
    #[link_name = "llvm.x86.vbcstnebf162ps256"]
    fn bcstnebf162ps_256(a: *const bf16) -> __m256;
    #[link_name = "llvm.x86.vbcstnesh2ps128"]
    fn bcstnesh2ps_128(a: *const f16) -> __m128;
    #[link_name = "llvm.x86.vbcstnesh2ps256"]
    fn bcstnesh2ps_256(a: *const f16) -> __m256;

    // Even-indexed element conversions.
    #[link_name = "llvm.x86.vcvtneebf162ps128"]
    fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneebf162ps256"]
    fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneeph2ps128"]
    fn cvtneeph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneeph2ps256"]
    fn cvtneeph2ps_256(a: *const __m256h) -> __m256;

    // Odd-indexed element conversions.
    #[link_name = "llvm.x86.vcvtneobf162ps128"]
    fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
    #[link_name = "llvm.x86.vcvtneobf162ps256"]
    fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
    #[link_name = "llvm.x86.vcvtneoph2ps128"]
    fn cvtneoph2ps_128(a: *const __m128h) -> __m128;
    #[link_name = "llvm.x86.vcvtneoph2ps256"]
    fn cvtneoph2ps_256(a: *const __m256h) -> __m256;
}
268
#[cfg(test)]
mod tests {
    use crate::core_arch::simd::{u16x4, u16x8};
    use crate::core_arch::x86::*;
    use crate::mem::transmute_copy;
    use std::ptr::addr_of;
    use stdarch_test::simd_test;

    // Raw BF16 bit patterns for the integers 1 through 8, written as
    // sign (1 bit) _ exponent (8 bits) _ mantissa (7 bits).
    const BF16_ONE: u16 = 0b0_01111111_0000000;
    const BF16_TWO: u16 = 0b0_10000000_0000000;
    const BF16_THREE: u16 = 0b0_10000000_1000000;
    const BF16_FOUR: u16 = 0b0_10000001_0000000;
    const BF16_FIVE: u16 = 0b0_10000001_0100000;
    const BF16_SIX: u16 = 0b0_10000001_1000000;
    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
    const BF16_EIGHT: u16 = 0b0_10000010_0000000;

    // --- scalar broadcast -------------------------------------------------------------------

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnebf16_ps() {
        let scalar = bf16::from_bits(BF16_ONE);
        let actual = _mm_bcstnebf16_ps(addr_of!(scalar));
        let expected = _mm_set1_ps(1.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnebf16_ps() {
        let scalar = bf16::from_bits(BF16_ONE);
        let actual = _mm256_bcstnebf16_ps(addr_of!(scalar));
        let expected = _mm256_set1_ps(1.);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_bcstnesh_ps() {
        let scalar = 1.0_f16;
        let actual = _mm_bcstnesh_ps(addr_of!(scalar));
        let expected = _mm_set1_ps(1.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_bcstnesh_ps() {
        let scalar = 1.0_f16;
        let actual = _mm256_bcstnesh_ps(addr_of!(scalar));
        let expected = _mm256_set1_ps(1.);
        assert_eq_m256(actual, expected);
    }

    // --- even-indexed conversions -----------------------------------------------------------

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneebf16_ps() {
        let packed = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let actual = _mm_cvtneebf16_ps(addr_of!(packed));
        let expected = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneebf16_ps() {
        let packed = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let actual = _mm256_cvtneebf16_ps(addr_of!(packed));
        let expected = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneeph_ps() {
        let packed = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let actual = _mm_cvtneeph_ps(addr_of!(packed));
        let expected = _mm_setr_ps(1., 3., 5., 7.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneeph_ps() {
        let packed = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let actual = _mm256_cvtneeph_ps(addr_of!(packed));
        let expected = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
        assert_eq_m256(actual, expected);
    }

    // --- odd-indexed conversions ------------------------------------------------------------

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneobf16_ps() {
        let packed = __m128bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let actual = _mm_cvtneobf16_ps(addr_of!(packed));
        let expected = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneobf16_ps() {
        let packed = __m256bh([
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        ]);
        let actual = _mm256_cvtneobf16_ps(addr_of!(packed));
        let expected = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
        assert_eq_m256(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneoph_ps() {
        let packed = __m128h([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
        let actual = _mm_cvtneoph_ps(addr_of!(packed));
        let expected = _mm_setr_ps(2., 4., 6., 8.);
        assert_eq_m128(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneoph_ps() {
        let packed = __m256h([
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ]);
        let actual = _mm256_cvtneoph_ps(addr_of!(packed));
        let expected = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
        assert_eq_m256(actual, expected);
    }

    // --- f32 -> BF16 narrowing --------------------------------------------------------------

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm_cvtneps_avx_pbh() {
        let floats = _mm_setr_ps(1., 2., 3., 4.);
        let narrowed = _mm_cvtneps_avx_pbh(floats);
        // Only the first four 16-bit elements of the result are inspected, hence the
        // size-reducing `transmute_copy` into a `u16x4`.
        let actual: u16x4 = transmute_copy(&narrowed);
        let expected = u16x4::new(BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR);
        assert_eq!(actual, expected);
    }

    #[simd_test(enable = "avxneconvert")]
    unsafe fn test_mm256_cvtneps_avx_pbh() {
        let floats = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let narrowed = _mm256_cvtneps_avx_pbh(floats);
        let actual: u16x8 = transmute(narrowed);
        let expected = u16x8::new(
            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
        );
        assert_eq!(actual, expected);
    }
}