From WikiChip
Difference between revisions of "x86/avx512 bf16"
(→Detection) |
(Replaced support matrix.) |
||
(2 intermediate revisions by one other user not shown) | |||
Line 6: | Line 6: | ||
* <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating point values to [[bfloat16]] packed in a single register. | * <code>VCVTNE2PS2BF16</code> - Convert two SIMD registers with packed single-precision floating point values to [[bfloat16]] packed in a single register. | ||
− | * <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register. | + | * <code>VCVTNEPS2BF16</code> - Convert one SIMD register with packed single-precision floating-point values to [[bfloat16]] packed in a single register. |
* <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packaged single-precision register. | * <code>VDPBF16PS</code> - Performs a SIMD dot-product on [[bfloat16]] pairs and accumulates the results into a packaged single-precision register. | ||
Line 13: | Line 13: | ||
== Detection == | == Detection == | ||
+ | Support for these instructions is indicated by the AVX512_BF16 feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well. | ||
+ | |||
{| class="wikitable" | {| class="wikitable" | ||
− | ! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set | + | ! colspan="2" | {{x86|CPUID}} !! rowspan="2" | Instruction Set |
|- | |- | ||
! Input !! Output | ! Input !! Output | ||
|- | |- | ||
− | | | + | | EAX=07H, ECX=0 || EBX[bit 31] || AVX512VL |
+ | |- | ||
+ | | EAX=07H, ECX=1 || EAX[bit 05] || AVX512_BF16 | ||
|} | |} | ||
== Microarchitecture support == | == Microarchitecture support == | ||
− | + | <!-- Wrong/incomplete? Visit https://en.wikichip.org/wiki/Template:avx512_support_matrix --> | |
− | + | {{avx512 support matrix|em=VL+BF16}} | |
− | |||
− | |||
− | |||
− | |||
== Intrinsic functions == | == Intrinsic functions == | ||
− | <source lang= | + | <source lang=c> |
− | + | // VCVTNE2PS2BF16 | |
− | __m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b) | + | __m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b); |
− | __m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b) | + | __m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b); |
− | __m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b) | + | __m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b); |
− | __m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b) | + | __m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b); |
− | __m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b) | + | __m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b); |
− | __m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b) | + | __m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b); |
− | __m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b) | + | __m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b); |
− | __m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b) | + | __m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b); |
− | __m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b) | + | __m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b); |
− | + | // VCVTNEPS2BF16 | |
− | __m128bh _mm_cvtneps_pbh (__m128 a) | + | __m128bh _mm_cvtneps_pbh (__m128 a); |
− | __m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a) | + | __m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a); |
− | __m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a) | + | __m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a); |
− | __m128bh _mm256_cvtneps_pbh (__m256 a) | + | __m128bh _mm256_cvtneps_pbh (__m256 a); |
− | __m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a) | + | __m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a); |
− | __m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a) | + | __m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a); |
− | __m256bh _mm512_cvtneps_pbh (__m512 a) | + | __m256bh _mm512_cvtneps_pbh (__m512 a); |
− | __m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a) | + | __m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a); |
− | __m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a) | + | __m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a); |
− | + | // VDPBF16PS | |
− | __m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b) | + | __m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b); |
− | __m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b) | + | __m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b); |
− | __m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b) | + | __m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b); |
− | __m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b) | + | __m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b); |
− | __m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b) | + | __m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b); |
− | __m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b) | + | __m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b); |
− | __m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b) | + | __m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b); |
− | __m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b) | + | __m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b); |
− | __m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b) | + | __m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b); |
</source> | </source> | ||
+ | |||
+ | == See also == | ||
+ | * [[DL Boost]] | ||
+ | * [[AVX512_VNNI]] | ||
== Bibliography == | == Bibliography == | ||
− | * ''Intel Architecture Instruction Set Extensions and Future Features Programming Reference'', Revision 36. (Ref #319433- | + | * ''Intel Architecture Instruction Set Extensions and Future Features Programming Reference'', Revision 36. (Ref #319433-036) |
[[Category:x86_extensions]] | [[Category:x86_extensions]] |
Latest revision as of 15:50, 15 March 2023
x86
Instruction Set Architecture
Instruction Set Architecture
General
Variants
Topics
- Instructions
- Addressing Modes
- Registers
- Model-Specific Register
- Assembly
- Interrupts
- Micro-Ops
- Timer
- Calling Convention
- Microarchitectures
- CPUID
CPUIDs
Modes
Extensions(all)
AVX-512 BFloat16 Instructions (AVX512_BF16) is an x86 extension, part of AVX-512, designed to accelerate neural network-based algorithms by performing dot-product on bfloat16.
Contents
Overview[edit]
The AVX512 BF16 x86 extension extends AVX-512 Foundation by introducing three new instructions for converting and operating on bfloat16.
-
VCVTNE2PS2BF16
- Convert two SIMD registers with packed single-precision floating point values to bfloat16 packed in a single register. -
VCVTNEPS2BF16
- Convert one SIMD register with packed single-precision floating-point values to bfloat16 packed in a single register. -
VDPBF16PS
- Performs a SIMD dot-product on bfloat16 pairs and accumulates the results into a packaged single-precision register.
Motivation[edit]
Detection[edit]
Support for these instructions is indicated by the AVX512_BF16 feature flag. 128- and 256-bit vectors are supported if the AVX512VL flag is set as well.
CPUID | Instruction Set | |
---|---|---|
Input | Output | |
EAX=07H, ECX=0 | EBX[bit 31] | AVX512VL |
EAX=07H, ECX=1 | EAX[bit 05] | AVX512_BF16 |
Microarchitecture support[edit]
Designer | Microarchitecture | Year | Support Level | ||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
F | CD | ER | PF | BW | DQ | VL | FP16 | IFMA | VBMI | VBMI2 | BITALG | VPOPCNTDQ | VP2INTERSECT | 4VNNIW | 4FMAPS | VNNI | BF16 | ||||
Intel | Knights Landing | 2016 | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | |
Knights Mill | 2017 | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✔ | ✘ | ✔ | ✔ | ✘ | ✘ | ||
Skylake (server) | 2017 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ||
Cannon Lake | 2018 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ||
Cascade Lake | 2019 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✔ | ✘ | ||
Cooper Lake | 2020 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✔ | ✔ | ||
Tiger Lake | 2020 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✔ | ✘ | ||
Rocket Lake | 2021 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✔ | ✘ | ||
Alder Lake | 2021 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ||
Ice Lake (server) | 2021 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✔ | ✘ | ||
Sapphire Rapids | 2023 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✔ | ✔ | ||
AMD | Zen 4 | 2022 | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✔ | ✔ | ✔ | ✘ | ✘ | ✘ | ✔ | ✔ | |
Centaur | CHA | ✔ | ✔ | ✘ | ✘ | ✔ | ✔ | ✔ | ✘ | ✔ | ✔ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ | ✘ |
Intrinsic functions[edit]
// VCVTNE2PS2BF16
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b);
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b);
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b);
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b);
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b);
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b);
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b);
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b);
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b);
// VCVTNEPS2BF16
__m128bh _mm_cvtneps_pbh (__m128 a);
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a);
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a);
__m128bh _mm256_cvtneps_pbh (__m256 a);
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a);
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a);
__m256bh _mm512_cvtneps_pbh (__m512 a);
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a);
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a);
// VDPBF16PS
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b);
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b);
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b);
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b);
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b);
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b);
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b);
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b);
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b);
See also[edit]
Bibliography[edit]
- Intel Architecture Instruction Set Extensions and Future Features Programming Reference, Revision 36. (Ref #319433-036)