From WikiChip
Difference between revisions of "x86/avx512 bf16"
(→Detection) |
|||
Line 62: | Line 62: | ||
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b) | __m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b) | ||
</source> | </source> | ||
+ | |||
+ | == See also == | ||
+ | * [[DL Boost]] | ||
+ | * [[AVX512_VNNI]] | ||
== Bibliography == | == Bibliography == |
Revision as of 21:56, 24 June 2020
x86
Instruction Set Architecture
Instruction Set Architecture
General
Variants
Topics
- Instructions
- Addressing Modes
- Registers
- Model-Specific Register
- Assembly
- Interrupts
- Micro-Ops
- Timer
- Calling Convention
- Microarchitectures
- CPUID
CPUIDs
Modes
Extensions(all)
AVX-512 BFloat16 Instructions (AVX512_BF16) is an x86 extension, part of AVX-512, designed to accelerate neural network-based algorithms by performing dot-product on bfloat16.
Contents
Overview
The AVX512 BF16 x86 extension extends AVX-512 Foundation by introducing three new instructions for converting and operating on bfloat16.
-
VCVTNE2PS2BF16
- Convert two SIMD registers with packed single-precision floating point values to bfloat16 packed in a single register. -
VCVTNEPS2BF16
- Convert one SIMD register with packed single-precision floating-point values to bfloat16 packed in a single register. -
VDPBF16PS
- Performs a SIMD dot-product on bfloat16 pairs and accumulates the results into a packaged single-precision register.
Motivation
Detection
CPUID | Instruction Set | |
---|---|---|
Input | Output | |
EAX=07H, ECX=1 | EAX[bit 05] | AVX512_BF16 |
Microarchitecture support
Instructions | Introduction |
---|---|
AVX512_BF16 | Cooper Lake (server) Sapphire Rapids |
Intrinsic functions
# vcvtne2ps2bf16
__m128bh _mm_cvtne2ps_pbh (__m128 a, __m128 b)
__m128bh _mm_mask_cvtne2ps_pbh (__m128bh src, __mmask8 k, __m128 a, __m128 b)
__m128bh _mm_maskz_cvtne2ps_pbh (__mmask8 k, __m128 a, __m128 b)
__m256bh _mm256_cvtne2ps_pbh (__m256 a, __m256 b)
__m256bh _mm256_mask_cvtne2ps_pbh (__m256bh src, __mmask16 k, __m256 a, __m256 b)
__m256bh _mm256_maskz_cvtne2ps_pbh (__mmask16 k, __m256 a, __m256 b)
__m512bh _mm512_cvtne2ps_pbh (__m512 a, __m512 b)
__m512bh _mm512_mask_cvtne2ps_pbh (__m512bh src, __mmask32 k, __m512 a, __m512 b)
__m512bh _mm512_maskz_cvtne2ps_pbh (__mmask32 k, __m512 a, __m512 b)
# vcvtneps2bf16
__m128bh _mm_cvtneps_pbh (__m128 a)
__m128bh _mm_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m128 a)
__m128bh _mm_maskz_cvtneps_pbh (__mmask8 k, __m128 a)
__m128bh _mm256_cvtneps_pbh (__m256 a)
__m128bh _mm256_mask_cvtneps_pbh (__m128bh src, __mmask8 k, __m256 a)
__m128bh _mm256_maskz_cvtneps_pbh (__mmask8 k, __m256 a)
__m256bh _mm512_cvtneps_pbh (__m512 a)
__m256bh _mm512_mask_cvtneps_pbh (__m256bh src, __mmask16 k, __m512 a)
__m256bh _mm512_maskz_cvtneps_pbh (__mmask16 k, __m512 a)
# vdpbf16ps
__m128 _mm_dpbf16_ps (__m128 src, __m128bh a, __m128bh b)
__m128 _mm_mask_dpbf16_ps (__m128 src, __mmask8 k, __m128bh a, __m128bh b)
__m128 _mm_maskz_dpbf16_ps (__mmask8 k, __m128 src, __m128bh a, __m128bh b)
__m256 _mm256_dpbf16_ps (__m256 src, __m256bh a, __m256bh b)
__m256 _mm256_mask_dpbf16_ps (__m256 src, __mmask8 k, __m256bh a, __m256bh b)
__m256 _mm256_maskz_dpbf16_ps (__mmask8 k, __m256 src, __m256bh a, __m256bh b)
__m512 _mm512_dpbf16_ps (__m512 src, __m512bh a, __m512bh b)
__m512 _mm512_mask_dpbf16_ps (__m512 src, __mmask16 k, __m512bh a, __m512bh b)
__m512 _mm512_maskz_dpbf16_ps (__mmask16 k, __m512 src, __m512bh a, __m512bh b)
See also
Bibliography
- Intel Architecture Instruction Set Extensions and Future Features Programming Reference, Revision 36. (Ref #319433-039)