From WikiChip
Difference between revisions of "x86/avx512 bf16"
< x86

(Replaced support matrix.)
 
Line 6: Line 6:
  
 
* <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating point values to [[bfloat16]] packed in a single register.
 
* <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating point values to [[bfloat16]] packed in a single register.
* <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register.  
+
* <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register.
 
* <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packaged single-precision register.
 
* <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packaged single-precision register.
  
Line 13: Line 13:
  
 
== Detection ==
 
== Detection ==
 +
Support for these instructions is indicated by the AVX512_BF16 feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well.
 +
 
{| class="wikitable"
 
{| class="wikitable"
! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set  
+
! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set
 
|-
 
|-
 
! Input !! Output
 
! Input !! Output
 
|-
 
|-
| rowspan="14" | EAX=07H, ECX=1 || EAX[bit 05] || AVX512_BF16
+
| EAX=07H, ECX=0 || EBX[bit 31] || AVX512VL
 +
|-
 +
| EAX=07H, ECX=1 || EAX[bit 05] || AVX512_BF16
 
|}
 
|}
  
 
== Microarchitecture support ==
 
== Microarchitecture support ==
{| class="wikitable"
+
<!-- Wrong/incomplete? Visit https://en.wikichip.org/wiki/Template:avx512_support_matrix -->
|-
+
{{avx512 support matrix|em=VL+BF16}}
! Instructions !! Introduction
 
|-
 
| AVX512_BF16 || {{intel|Cooper Lake|l=arch}} (server)<br>{{intel|Sapphire Rapids|l=arch}}
 
|}
 
  
 
== Intrinsic functions ==
 
== Intrinsic functions ==
<source lang=asm>
+
<source lang=c>
# vcvtne2ps2bf16
+
// VCVTNE2PS2BF16
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b)
+
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b);
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b)
+
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b);
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b)
+
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b);
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b)
+
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b);
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b)
+
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b);
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b)
+
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b);
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b)
+
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b);
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b)
+
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b);
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b)
+
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b);
# vcvtneps2bf16
+
// VCVTNEPS2BF16
__m128bh _mm_cvtneps_pbh (__m128 a)
+
__m128bh _mm_cvtneps_pbh (__m128 a);
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a)
+
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a);
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a)
+
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a);
__m128bh _mm256_cvtneps_pbh (__m256 a)
+
__m128bh _mm256_cvtneps_pbh (__m256 a);
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a)
+
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a);
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a)
+
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a);
__m256bh _mm512_cvtneps_pbh (__m512 a)
+
__m256bh _mm512_cvtneps_pbh (__m512 a);
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a)
+
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a);
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a)
+
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a);
# vdpbf16ps
+
// VDPBF16PS
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b)
+
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b);
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b)
+
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b);
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b)
+
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b);
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b)
+
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b);
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b)
+
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b);
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b)
+
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b);
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b)
+
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b);
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b)
+
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b);
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b)
+
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b);
 
</source>
 
</source>
  

Latest revision as of 15:50, 15 March 2023

AVX-512 BFloat16 Instructions (AVX512_BF16) is an x86 extension, part of AVX-512, designed to accelerate neural network-based algorithms by performing dot-product on bfloat16.

Overview[edit]

The AVX512 BF16 x86 extension extends AVX-512 Foundation by introducing three new instructions for converting and operating on bfloat16.

  • VCVTNE2PS2BF16 - Convert two SIMD registers with packed single-precision floating point values to bfloat16 packed in a single register.
  • VCVTNEPS2BF16 - Convert one SIMD register with packed single-precision floating-point values to bfloat16 packed in a single register.
  • VDPBF16PS - Performs a SIMD dot-product on bfloat16 pairs and accumulates the results into a packaged single-precision register.

Motivation[edit]

See bfloat16 § Motivation.

Detection[edit]

Support for these instructions is indicated by the AVX512_BF16 feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well.

CPUID Instruction Set
Input Output
EAX=07H, ECX=0 EBX[bit 31] AVX512VL
EAX=07H, ECX=1 EAX[bit 05] AVX512_BF16

Microarchitecture support[edit]

Designer Microarchitecture Year Support Level
F CD ER PF BW DQ VL FP16 IFMA VBMI VBMI2 BITALG VPOPCNTDQ VP2INTERSECT 4VNNIW 4FMAPS VNNI BF16
Intel Knights Landing 2016
Knights Mill 2017
Skylake (server) 2017
Cannon Lake 2018
Cascade Lake 2019
Cooper Lake 2020
Tiger Lake 2020
Rocket Lake 2021
Alder Lake 2021
Ice Lake (server) 2021
Sapphire Rapids 2023
AMD Zen 4 2022
Centaur CHA

Intrinsic functions[edit]

// VCVTNE2PS2BF16
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b);
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b);
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b);
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b);
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b);
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b);
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b);
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b);
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b);
// VCVTNEPS2BF16
__m128bh _mm_cvtneps_pbh (__m128 a);
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a);
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a);
__m128bh _mm256_cvtneps_pbh (__m256 a);
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a);
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a);
__m256bh _mm512_cvtneps_pbh (__m512 a);
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a);
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a);
// VDPBF16PS
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b);
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b);
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b);
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b);
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b);
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b);
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b);
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b);
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b);

See also[edit]

Bibliography[edit]

  • Intel Architecture Instruction Set Extensions and Future Features Programming Reference, Revision 36. (Ref #319433-036)