From WikiChip
Editing x86/avx512 bf16

Warning: You are not logged in. Your IP address will be publicly visible if you make any edits. If you log in or create an account, your edits will be attributed to your username, along with other benefits.

The edit can be undone. Please check the comparison below to verify that this is what you want to do, and then save the changes below to finish undoing the edit.

This page supports semantic in-text annotations (e.g. "[[Is specified as::World Heritage Site]]") to build structured and queryable content provided by Semantic MediaWiki. For a comprehensive description of how to use annotations or the #ask parser function, please have a look at the getting started, in-text annotation, or inline queries help pages.

Latest revision Your text
Line 6: Line 6:
  
 
* <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating-point values to [[bfloat16]] packed in a single register.
 
* <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating-point values to [[bfloat16]] packed in a single register.
* <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register.
+
* <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register.  
 
* <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packed single-precision register.
 
* <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packed single-precision register.
  
Line 13: Line 13:
  
 
== Detection ==
 
== Detection ==
Support for these instructions is indicated by the AVX512_BF16 feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well.
 
 
 
{| class="wikitable"
 
{| class="wikitable"
! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set
+
! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set  
 
|-
 
|-
 
! Input !! Output
 
! Input !! Output
 
|-
 
|-
| EAX=07H, ECX=0 || EBX[bit 31] || AVX512VL
+
| EAX=07H, ECX=1 || EAX[bit 05] || AVX512_BF16
|-
 
| EAX=07H, ECX=1 || EAX[bit 05] || AVX512_BF16
 
 
|}
 
|}
  
 
== Microarchitecture support ==
 
== Microarchitecture support ==
<!-- Wrong/incomplete? Visit https://en.wikichip.org/wiki/Template:avx512_support_matrix -->
+
{| class="wikitable"
{{avx512 support matrix|em=VL+BF16}}
+
|-
 +
! Instructions !! Introduction
 +
|-
 +
| AVX512_BF16 || {{intel|Cooper Lake|l=arch}} (server)<br>{{intel|Sapphire Rapids|l=arch}}
 +
|}
  
 
== Intrinsic functions ==
 
== Intrinsic functions ==
<source lang=c>
+
<source lang=c>
// VCVTNE2PS2BF16
+
# vcvtne2ps2bf16
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b);
+
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b)
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b);
+
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b)
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b);
+
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b)
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b);
+
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b)
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b);
+
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b)
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b);
+
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b)
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b);
+
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b)
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b);
+
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b)
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b);
+
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b)
// VCVTNEPS2BF16
+
# vcvtneps2bf16
__m128bh _mm_cvtneps_pbh (__m128 a);
+
__m128bh _mm_cvtneps_pbh (__m128 a)
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a);
+
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a)
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a);
+
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a)
__m128bh _mm256_cvtneps_pbh (__m256 a);
+
__m128bh _mm256_cvtneps_pbh (__m256 a)
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a);
+
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a)
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a);
+
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a)
__m256bh _mm512_cvtneps_pbh (__m512 a);
+
__m256bh _mm512_cvtneps_pbh (__m512 a)
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a);
+
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a)
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a);
+
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a)
// VDPBF16PS
+
# vdpbf16ps
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b);
+
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b)
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b);
+
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b)
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b);
+
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b)
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b);
+
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b)
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b);
+
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b)
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b);
+
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b)
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b);
+
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b)
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b);
+
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b)
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b);
+
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b)
 
</source>
 
</source>
  
Line 68: Line 68:
  
 
== Bibliography ==
 
== Bibliography ==
* ''Intel Architecture Instruction Set Extensions and Future Features Programming Reference'', Revision 36. (Ref #319433-036)
+
* ''Intel Architecture Instruction Set Extensions and Future Features Programming Reference'', Revision 36. (Ref #319433-036)
  
 
[[Category:x86_extensions]]
 
[[Category:x86_extensions]]

Please note that all contributions to WikiChip may be edited, altered, or removed by other contributors. If you do not want your writing to be edited mercilessly, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see WikiChip:Copyrights for details). Do not submit copyrighted work without permission!

Cancel | Editing help (opens in new window)