(→Added instructions) |
|||
Line 236: | Line 236: | ||
* [[22 nm process]] | * [[22 nm process]] | ||
== Added instructions == | == Added instructions == | ||
+ | '''{{x86|AVX2}}''' - Integer data types were extended to 256-bit SIMD. | ||
+ | |||
+ | {{collist | ||
+ | | count = 4 | ||
+ | | width = 650px | ||
+ | | | ||
+ | * {{x86|VBROADCASTI128}} | ||
+ | * {{x86|VBROADCASTSD}} | ||
+ | * {{x86|VBROADCASTSS}} | ||
+ | * {{x86|VEXTRACTI128}} | ||
+ | * {{x86|VGATHERDPD}} | ||
+ | * {{x86|VGATHERDPS}} | ||
+ | * {{x86|VGATHERQPD}} | ||
+ | * {{x86|VGATHERQPS}} | ||
+ | * {{x86|VINSERTI128}} | ||
+ | * {{x86|VMOVNTDQA}} | ||
+ | * {{x86|VMPSADBW}} | ||
+ | * {{x86|VPABSB}} | ||
+ | * {{x86|VPABSD}} | ||
+ | * {{x86|VPABSW}} | ||
+ | * {{x86|VPACKSSDW}} | ||
+ | * {{x86|VPACKSSWB}} | ||
+ | * {{x86|VPACKUSDW}} | ||
+ | * {{x86|VPACKUSWB}} | ||
+ | * {{x86|VPADDB}} | ||
+ | * {{x86|VPADDD}} | ||
+ | * {{x86|VPADDQ}} | ||
+ | * {{x86|VPADDSB}} | ||
+ | * {{x86|VPADDSW}} | ||
+ | * {{x86|VPADDUSB}} | ||
+ | * {{x86|VPADDUSW}} | ||
+ | * {{x86|VPADDW}} | ||
+ | * {{x86|VPALIGNR}} | ||
+ | * {{x86|VPAND}} | ||
+ | * {{x86|VPANDN}} | ||
+ | * {{x86|VPAVGB}} | ||
+ | * {{x86|VPAVGW}} | ||
+ | * {{x86|VPBLENDD}} | ||
+ | * {{x86|VPBLENDVB}} | ||
+ | * {{x86|VPBLENDW}} | ||
+ | * {{x86|VPBROADCASTB}} | ||
+ | * {{x86|VPBROADCASTD}} | ||
+ | * {{x86|VPBROADCASTQ}} | ||
+ | * {{x86|VPBROADCASTW}} | ||
+ | * {{x86|VPCMPEQB}} | ||
+ | * {{x86|VPCMPEQD}} | ||
+ | * {{x86|VPCMPEQQ}} | ||
+ | * {{x86|VPCMPEQW}} | ||
+ | * {{x86|VPCMPGTB}} | ||
+ | * {{x86|VPCMPGTD}} | ||
+ | * {{x86|VPCMPGTQ}} | ||
+ | * {{x86|VPCMPGTW}} | ||
+ | * {{x86|VPERM2I128}} | ||
+ | * {{x86|VPERMD}} | ||
+ | * {{x86|VPERMPD}} | ||
+ | * {{x86|VPERMPS}} | ||
+ | * {{x86|VPERMQ}} | ||
+ | * {{x86|VPGATHERDD}} | ||
+ | * {{x86|VPGATHERDQ}} | ||
+ | * {{x86|VPGATHERQD}} | ||
+ | * {{x86|VPGATHERQQ}} | ||
+ | * {{x86|VPHADDD}} | ||
+ | * {{x86|VPHADDSW}} | ||
+ | * {{x86|VPHADDW}} | ||
+ | * {{x86|VPHSUBD}} | ||
+ | * {{x86|VPHSUBSW}} | ||
+ | * {{x86|VPHSUBW}} | ||
+ | * {{x86|VPMADDUBSW}} | ||
+ | * {{x86|VPMADDWD}} | ||
+ | * {{x86|VPMASKMOVD}} | ||
+ | * {{x86|VPMASKMOVQ}} | ||
+ | * {{x86|VPMAXSB}} | ||
+ | * {{x86|VPMAXSD}} | ||
+ | * {{x86|VPMAXSW}} | ||
+ | * {{x86|VPMAXUB}} | ||
+ | * {{x86|VPMAXUD}} | ||
+ | * {{x86|VPMAXUW}} | ||
+ | * {{x86|VPMINSB}} | ||
+ | * {{x86|VPMINSD}} | ||
+ | * {{x86|VPMINSW}} | ||
+ | * {{x86|VPMINUB}} | ||
+ | * {{x86|VPMINUD}} | ||
+ | * {{x86|VPMINUW}} | ||
+ | * {{x86|VPMOVMSKB}} | ||
+ | * {{x86|VPMOVSXBD}} | ||
+ | * {{x86|VPMOVSXBQ}} | ||
+ | * {{x86|VPMOVSXBW}} | ||
+ | * {{x86|VPMOVSXDQ}} | ||
+ | * {{x86|VPMOVSXWD}} | ||
+ | * {{x86|VPMOVSXWQ}} | ||
+ | * {{x86|VPMOVZXBD}} | ||
+ | * {{x86|VPMOVZXBQ}} | ||
+ | * {{x86|VPMOVZXBW}} | ||
+ | * {{x86|VPMOVZXDQ}} | ||
+ | * {{x86|VPMOVZXWD}} | ||
+ | * {{x86|VPMOVZXWQ}} | ||
+ | * {{x86|VPMULDQ}} | ||
+ | * {{x86|VPMULHRSW}} | ||
+ | * {{x86|VPMULHUW}} | ||
+ | * {{x86|VPMULHW}} | ||
+ | * {{x86|VPMULLD}} | ||
+ | * {{x86|VPMULLW}} | ||
+ | * {{x86|VPMULUDQ}} | ||
+ | * {{x86|VPOR}} | ||
+ | * {{x86|VPSADBW}} | ||
+ | * {{x86|VPSHUFB}} | ||
+ | * {{x86|VPSHUFD}} | ||
+ | * {{x86|VPSHUFHW}} | ||
+ | * {{x86|VPSHUFLW}} | ||
+ | * {{x86|VPSIGNB}} | ||
+ | * {{x86|VPSIGND}} | ||
+ | * {{x86|VPSIGNW}} | ||
+ | * {{x86|VPSLLD}} | ||
+ | * {{x86|VPSLLDQ}} | ||
+ | * {{x86|VPSLLQ}} | ||
+ | * {{x86|VPSLLVD}} | ||
+ | * {{x86|VPSLLVQ}} | ||
+ | * {{x86|VPSLLW}} | ||
+ | * {{x86|VPSRAD}} | ||
+ | * {{x86|VPSRAVD}} | ||
+ | * {{x86|VPSRAW}} | ||
+ | * {{x86|VPSRLD}} | ||
+ | * {{x86|VPSRLDQ}} | ||
+ | * {{x86|VPSRLQ}} | ||
+ | * {{x86|VPSRLVD}} | ||
+ | * {{x86|VPSRLVQ}} | ||
+ | * {{x86|VPSRLW}} | ||
+ | * {{x86|VPSUBB}} | ||
+ | * {{x86|VPSUBD}} | ||
+ | * {{x86|VPSUBQ}} | ||
+ | * {{x86|VPSUBSB}} | ||
+ | * {{x86|VPSUBSW}} | ||
+ | * {{x86|VPSUBUSB}} | ||
+ | * {{x86|VPSUBUSW}} | ||
+ | * {{x86|VPSUBW}} | ||
+ | * {{x86|VPUNPCKHBW}} | ||
+ | * {{x86|VPUNPCKHDQ}} | ||
+ | * {{x86|VPUNPCKHQDQ}} | ||
+ | * {{x86|VPUNPCKHWD}} | ||
+ | * {{x86|VPUNPCKLBW}} | ||
+ | * {{x86|VPUNPCKLDQ}} | ||
+ | * {{x86|VPUNPCKLQDQ}} | ||
+ | * {{x86|VPUNPCKLWD}} | ||
+ | * {{x86|VPXOR}} | ||
+ | }} | ||
+ | |||
+ | '''{{x86|BMI1}}''' / '''{{x86|BMI2}}''' - Bit Manipulation Instructions Sets | ||
+ | |||
+ | {{collist | ||
+ | | count = 2 | ||
+ | | width = 150px | ||
+ | | | ||
+ | * {{x86|ANDN}} | ||
+ | * {{x86|BEXTR}} | ||
+ | * {{x86|BLSI}} | ||
+ | * {{x86|BLSMSK}} | ||
+ | * {{x86|BLSR}} | ||
+ | * {{x86|BZHI}} | ||
+ | * {{x86|LZCNT}} | ||
+ | * {{x86|MULX}} | ||
+ | * {{x86|PDEP}} | ||
+ | * {{x86|PEXT}} | ||
+ | * {{x86|POPCNT}} | ||
+ | * {{x86|RORX}} | ||
+ | * {{x86|SARX}} | ||
+ | * {{x86|SHLX}} | ||
+ | * {{x86|SHRX}} | ||
+ | * {{x86|TZCNT}} | ||
+ | }} | ||
+ | |||
+ | '''{{x86|FMA}}''' - Fused Multiply-Add instructions | ||
+ | |||
+ | {{collist | ||
+ | | count = 4 | ||
+ | | width = 650px | ||
+ | | | ||
+ | * {{x86|VFMADD123PD}} | ||
+ | * {{x86|VFMADD123PS}} | ||
+ | * {{x86|VFMADD123SD}} | ||
+ | * {{x86|VFMADD123SS}} | ||
+ | * {{x86|VFMADD132PD}} | ||
+ | * {{x86|VFMADD132PS}} | ||
+ | * {{x86|VFMADD132SD}} | ||
+ | * {{x86|VFMADD132SS}} | ||
+ | * {{x86|VFMADD213PD}} | ||
+ | * {{x86|VFMADD213PS}} | ||
+ | * {{x86|VFMADD213SD}} | ||
+ | * {{x86|VFMADD213SS}} | ||
+ | * {{x86|VFMADD231PD}} | ||
+ | * {{x86|VFMADD231PS}} | ||
+ | * {{x86|VFMADD231SD}} | ||
+ | * {{x86|VFMADD231SS}} | ||
+ | * {{x86|VFMADD312PD}} | ||
+ | * {{x86|VFMADD312PS}} | ||
+ | * {{x86|VFMADD312SD}} | ||
+ | * {{x86|VFMADD312SS}} | ||
+ | * {{x86|VFMADD321PD}} | ||
+ | * {{x86|VFMADD321PS}} | ||
+ | * {{x86|VFMADD321SD}} | ||
+ | * {{x86|VFMADD321SS}} | ||
+ | * {{x86|VFMADDSUB123PD}} | ||
+ | * {{x86|VFMADDSUB123PS}} | ||
+ | * {{x86|VFMADDSUB132PD}} | ||
+ | * {{x86|VFMADDSUB132PS}} | ||
+ | * {{x86|VFMADDSUB213PD}} | ||
+ | * {{x86|VFMADDSUB213PS}} | ||
+ | * {{x86|VFMADDSUB231PD}} | ||
+ | * {{x86|VFMADDSUB231PS}} | ||
+ | * {{x86|VFMADDSUB312PD}} | ||
+ | * {{x86|VFMADDSUB312PS}} | ||
+ | * {{x86|VFMADDSUB321PD}} | ||
+ | * {{x86|VFMADDSUB321PS}} | ||
+ | * {{x86|VFMSUB123PD}} | ||
+ | * {{x86|VFMSUB123PS}} | ||
+ | * {{x86|VFMSUB123SD}} | ||
+ | * {{x86|VFMSUB123SS}} | ||
+ | * {{x86|VFMSUB132PD}} | ||
+ | * {{x86|VFMSUB132PS}} | ||
+ | * {{x86|VFMSUB132SD}} | ||
+ | * {{x86|VFMSUB132SS}} | ||
+ | * {{x86|VFMSUB213PD}} | ||
+ | * {{x86|VFMSUB213PS}} | ||
+ | * {{x86|VFMSUB213SD}} | ||
+ | * {{x86|VFMSUB213SS}} | ||
+ | * {{x86|VFMSUB231PD}} | ||
+ | * {{x86|VFMSUB231PS}} | ||
+ | * {{x86|VFMSUB231SD}} | ||
+ | * {{x86|VFMSUB231SS}} | ||
+ | * {{x86|VFMSUB312PD}} | ||
+ | * {{x86|VFMSUB312PS}} | ||
+ | * {{x86|VFMSUB312SD}} | ||
+ | * {{x86|VFMSUB312SS}} | ||
+ | * {{x86|VFMSUB321PD}} | ||
+ | * {{x86|VFMSUB321PS}} | ||
+ | * {{x86|VFMSUB321SD}} | ||
+ | * {{x86|VFMSUB321SS}} | ||
+ | * {{x86|VFMSUBADD123PD}} | ||
+ | * {{x86|VFMSUBADD123PS}} | ||
+ | * {{x86|VFMSUBADD132PD}} | ||
+ | * {{x86|VFMSUBADD132PS}} | ||
+ | * {{x86|VFMSUBADD213PD}} | ||
+ | * {{x86|VFMSUBADD213PS}} | ||
+ | * {{x86|VFMSUBADD231PD}} | ||
+ | * {{x86|VFMSUBADD231PS}} | ||
+ | * {{x86|VFMSUBADD312PD}} | ||
+ | * {{x86|VFMSUBADD312PS}} | ||
+ | * {{x86|VFMSUBADD321PD}} | ||
+ | * {{x86|VFMSUBADD321PS}} | ||
+ | * {{x86|VFNMADD123PD}} | ||
+ | * {{x86|VFNMADD123PS}} | ||
+ | * {{x86|VFNMADD123SD}} | ||
+ | * {{x86|VFNMADD123SS}} | ||
+ | * {{x86|VFNMADD132PD}} | ||
+ | * {{x86|VFNMADD132PS}} | ||
+ | * {{x86|VFNMADD132SD}} | ||
+ | * {{x86|VFNMADD132SS}} | ||
+ | * {{x86|VFNMADD213PD}} | ||
+ | * {{x86|VFNMADD213PS}} | ||
+ | * {{x86|VFNMADD213SD}} | ||
+ | * {{x86|VFNMADD213SS}} | ||
+ | * {{x86|VFNMADD231PD}} | ||
+ | * {{x86|VFNMADD231PS}} | ||
+ | * {{x86|VFNMADD231SD}} | ||
+ | * {{x86|VFNMADD231SS}} | ||
+ | * {{x86|VFNMADD312PD}} | ||
+ | * {{x86|VFNMADD312PS}} | ||
+ | * {{x86|VFNMADD312SD}} | ||
+ | * {{x86|VFNMADD312SS}} | ||
+ | * {{x86|VFNMADD321PD}} | ||
+ | * {{x86|VFNMADD321PS}} | ||
+ | * {{x86|VFNMADD321SD}} | ||
+ | * {{x86|VFNMADD321SS}} | ||
+ | * {{x86|VFNMSUB123PD}} | ||
+ | * {{x86|VFNMSUB123PS}} | ||
+ | * {{x86|VFNMSUB123SD}} | ||
+ | * {{x86|VFNMSUB123SS}} | ||
+ | * {{x86|VFNMSUB132PD}} | ||
+ | * {{x86|VFNMSUB132PS}} | ||
+ | * {{x86|VFNMSUB132SD}} | ||
+ | * {{x86|VFNMSUB132SS}} | ||
+ | * {{x86|VFNMSUB213PD}} | ||
+ | * {{x86|VFNMSUB213PS}} | ||
+ | * {{x86|VFNMSUB213SD}} | ||
+ | * {{x86|VFNMSUB213SS}} | ||
+ | * {{x86|VFNMSUB231PD}} | ||
+ | * {{x86|VFNMSUB231PS}} | ||
+ | * {{x86|VFNMSUB231SD}} | ||
+ | * {{x86|VFNMSUB231SS}} | ||
+ | * {{x86|VFNMSUB312PD}} | ||
+ | * {{x86|VFNMSUB312PS}} | ||
+ | * {{x86|VFNMSUB312SD}} | ||
+ | * {{x86|VFNMSUB312SS}} | ||
+ | * {{x86|VFNMSUB321PD}} | ||
+ | * {{x86|VFNMSUB321PS}} | ||
+ | * {{x86|VFNMSUB321SD}} | ||
+ | * {{x86|VFNMSUB321SS}} | ||
+ | }} | ||
+ | |||
+ | '''{{x86|MOVBE}}''' - Move Big-Endian instruction | ||
+ | |||
+ | {{collist | ||
+ | | count = 1 | ||
+ | | width = 650px | ||
+ | | | ||
+ | * {{x86|MOVBE}} | ||
+ | }} | ||
+ | |||
+ | '''{{x86|TSX}}''' - Transactional Synchronization Extensions | ||
+ | |||
+ | {{collist | ||
+ | | count = 1 | ||
+ | | width = 150px | ||
+ | | | ||
+ | * {{x86|XABORT}} | ||
+ | * {{x86|XBEGIN}} | ||
+ | * {{x86|XEND}} | ||
+ | * {{x86|XTEST}} | ||
+ | }} | ||
+ | |||
== Cores == | == Cores == | ||
== All Haswell Chips == | == All Haswell Chips == |
Revision as of 16:12, 14 April 2016
Edit Values | |
Haswell µarch | |
General Info |
Haswell (HSW) is Intel's microarchitecture based on the 22 nm process for mobile, desktops, and servers. Haswell, which was introduced in 2013, became the successor to Ivy Bridge. Haswell is named after Haswell, Colorado (Originally Molalla after Molalla, Oregon, it was later renamed due to the difficult pronunciation).
Contents
Codenames
Core | Abbrev | Target |
---|---|---|
Haswell DT | HSW-DT | Desktops |
Haswell MB | HSW-MB | Mobile/Laptops |
Haswell H | HSW-H | All-in-ones |
Haswell ULT | HSW-ULT | UltraBooks (MCPs) |
Haswell ULX | HSW-ULX | Tablets/UltraBooks (SoCs) |
Haswell EP | HSW-EP | Xeon chips |
Haswell EX | HSW-EX | Xeon chips, QP |
Haswell E | HSW-E | High-End Desktops (HEDT) |
Architecture
While sharing a lot of similarities with its predecessor Ivy Bridge, Haswell introduces many new enhancements and features. Haswell is the first desktop-line of x86s by Intel tailored for a system on chip architecture. This is a significant move that will continue to be developed over the next couple of microarchitectures. Overall, Haswell shares the same basic flow as Sandy Bridge and Ivy Bridge but expands on them considerably in the execution engine, with wider execution units and additional scheduler ports.
- Platform Controller Hub (PCH)
- Support for DDR4 (server/enthusiast segments)
- Integrated voltage regulator (IVR)
- New C6 & C7 sleep states
- Cache
- L1D$ has double the bandwidth
- Load: 64B/cycle (up from 32B/cycle)
- Store: 32B/cycle (up from 16B/cycle)
- L2$ bandwidth to L1 is doubled
- 64B/cycle (up from 32B/cycle)
- STLB has been made to support 2MB pages
- Table has been doubled to 1,024 entries 8-Way (up from 512, 4-way)
- L1D$ has double the bandwidth
- Reorder Buffer (ROB) was increased to 192 entries (up from 168)
- Scheduler has been widened, (see #Front-end)
- Increased to 60 entries (up from 54)
- Integer register file up 8 entries to 168
- FP register file up 24 entries to 168
- 2 additional execution ports (see #Execution_Units)
- New memory model for Transactional Synchronization Extensions
GPU changes
- Direct3D 11.1
- OpenGL 4.3
- OpenCL 1.2
- Four versions of GPU options codenamed GT1, GT2, GT3 and GT3e (with GT3e having a dedicated eDRAM L4$)
New instructions
- Main article: See #Added_instructions for the complete list
Haswell introduced a number of new instructions:
-
AVX2
- Advanced Vector Extensions 2; an extension that extends most integer instructions to 256-bit vectors.
- Vector Gather support
- Any-to-Any permutes
- Vector-Vector Shifts
-
BMI1
- Bit Manipulation Instruction Set 1 -
BMI2
- Bit Manipulation Instruction Set 2 -
MOVBE
- Move Big-Endian instruction -
FMA
- Floating Point Multiply Accumulate -
TSX
- Transactional Synchronization Extensions
Block Diagram
Due to the success of the front end in Ivy Bridge, very few changes were done in Haswell.
Memory Hierarchy
The memory hierarchy in Haswell had a number of changes from its predecessor. The cache bandwidth for both load and store has been doubled (64B/cycle for load and 32B/cycle for store; up from 32/16 respectively). Significant enhancements have been made to support the new gather instructions and transactional memory. With Haswell's new port 7, which adds address generation for stores, up to two loads and one store are possible each cycle.
- Cache
- L1I Cache:
- 32 KB 8-way set associative
- 64 B line size
- shared by the two threads, per core
- 32 KB 8-way set associative
- L1D Cache:
- 32 KB 8-way set associative
- 64 B line size
- shared by the two threads, per core
- 4 cycles for fastest load-to-use
- 64 Bytes/cycle load bandwidth
- 32 Bytes/cycle store bandwidth
- 32 KB 8-way set associative
- L2 Cache:
- unified, 256 KB 8-way set associative
- 11 cycles for fastest load-to-use
- 64B/cycle bandwidth to L1$
- L3 Cache:
- 1.5 MB
- Per core
- L4 Cache:
- 128 MB
- Per package
- Only on the Iris Pro GPUs
- TLBs:
- ITLB
- 4KB page translations:
- 128 entries; 4-way set associative
- fixed partition; divided between the two threads
- 2MB/4MB page translations:
- 8 entries; fully associative
- Duplicated for each thread
- 4KB page translations:
- DTLB
- 4KB page translations:
- 64 entries; 4-way set associative
- fixed partition; divided between the two threads
- 2MB/4MB page translations:
- 32 entries; 4-way set associative
- 1G page translations:
- 4 entries; 4-way set associative
- 4KB page translations:
- STLB
- 4KB+2M page translations:
- 1024 entries; 8-way set associative
- shared
- 4KB+2M page translations:
- ITLB
- L1I Cache:
Pipeline
Haswell, like its predecessor Ivy Bridge, also has a dual-threaded and out-of-order pipeline.
Front-end
The front-end is the complicated part of the microarchitecture as it deals with variable length x86 instructions ranging from 1 to 15 bytes. The main goal here is to fetch and decode correctly the next set of instructions. The caches have not changed in Haswell from Ivy Bridge, with the L1i$ still 32KB, 8-way set associative, shared dynamically by the two threads. Instruction fetch from the instruction cache remains 16B/cycle. The TLB is also still 128 entries, 4-way for 4KB pages and 8 entries, fully associative for 2MB page mode. The fetched instructions are then moved on to an instruction queue which has 40 entries, 20 for each thread. Haswell continued to reduce branch misses, although the exact details have not been made public.
Haswell has the same µOps cache as Ivy Bridge - 1,536 entries organized in 32 sets of 8 cache lines with 6 µOps each. Hits can yield up to 4-µOps/cycle. The cache supports microcoded instructions (being pointers to ROM entries). Cache is shared by the two threads.
Following the instruction queue, instructions are decoded via the complex 4-way decoder. The decoder has 3 simple decoders and 1 complex decoder. In total, they are capable of emitting 3 single fused µOps and an additional 1-4 fused µOps. The unit handles both micro- and macro-fusion. With macro-fusion, compatible adjacent instructions may be merged into a single µOp. Push and pop as well as call and return are also handled at this stage. Normally 4 instructions can be decoded each cycle, but with the aid of macro-fusion, up to 5 instructions can be decoded each cycle.
Execution engine
Continuing from the decoder is the register renaming stage. This is crucial for out-of-order execution. In this stage the architectural x86 registers get mapped into one of the many physical registers. The integer physical register file (PRF) has been enlarged by 8 additional registers for a total of 168. Likewise the FP PRF was extended by 24 registers, bringing it to 168 registers as well. The larger increase in the FP PRF is likely to accommodate the new AVX2 extension. The ROB in Haswell has been increased to 192 entries (from 168 in Ivy Bridge) where each entry corresponds to a single µOp. The ROB is split in a fixed partition between the two threads. Additional scheduler resources get allocated as well - this includes stores, loads, and branch buffer entries. Note that due to how dependencies are handled, there may be more or less µOps than what was fed in. For the most part, the renamer is unified and deals with both integers and vectors. Resources, however, are partitioned between the two threads. Finally, as a last step, the µOps are matched with a port depending on their intended execution purpose. Up to 4 fused µOps may be renamed and handled per thread per cycle. Both the load and store in-flight units were increased, to 72 and 42 entries respectively.
Haswell continues to use a unified scheduler for all µOps which holds 60 entries. µOps at this stage sit idle until they are cleared to be executed via their assigned dispatch port. µOps may be held due to resource unavailability.
Following a successful execution, µOps retire at a rate of up to 4 fused µOps/cycle. Retirement is once again in-order and frees up any reserved resource (ROB entries, PRF entries, and various other buffers).
Execution Units
Some of the biggest architectural changes were done in the area of the execution units. Haswell widened the scheduler by two ports - one new integer dispatch port and a new memory port - bringing the total to 8 µOps/cycle. The various ports have also been rebalanced. The new port 6 adds another integer ALU designed to improve integer workloads, freeing up ports 0 and 1 for vector work. It also adds a second branch unit to lower the congestion on port 0. The second port that was added, port 7, adds a new AGU. This is largely due to the improvements for AVX2 that roughly doubled its throughput. Port 0 had its ALU/Mul/shifter extended to 256 bits; the same is true for the vector ALU on port 1 and the ALU/shuffle on port 5. Additionally, 256-bit FMA units were added to both port 0 and port 1. The change makes it possible for FMAs and FMULs to issue on both ports. In theory, Haswell can peak at over double the performance of Sandy Bridge, with 16 double / 32 single precision FLOP/cycle + Integer ALU option + Vector operation.
The scheduler dispatches up to 8 ready µOps/cycle in FIFO order through the dispatch ports. µOps involving computational operations are sent to ports 0, 1, 5, and 6 to the appropriate unit. Likewise ports 2, 3, 4 and 7 are used for load/store and address calculations.
Die
Quad-core Haswell die:
- 1,400,000,000 transistors
- 177 mm²
- 4 CPU cores
- 1 GPU core
- 2x10xEU (80 ALUs)
- 22 nm process
Added instructions
AVX2 - Integer data types were extended to 256-bit SIMD.
- VBROADCASTI128
- VBROADCASTSD
- VBROADCASTSS
- VEXTRACTI128
- VGATHERDPD
- VGATHERDPS
- VGATHERQPD
- VGATHERQPS
- VINSERTI128
- VMOVNTDQA
- VMPSADBW
- VPABSB
- VPABSD
- VPABSW
- VPACKSSDW
- VPACKSSWB
- VPACKUSDW
- VPACKUSWB
- VPADDB
- VPADDD
- VPADDQ
- VPADDSB
- VPADDSW
- VPADDUSB
- VPADDUSW
- VPADDW
- VPALIGNR
- VPAND
- VPANDN
- VPAVGB
- VPAVGW
- VPBLENDD
- VPBLENDVB
- VPBLENDW
- VPBROADCASTB
- VPBROADCASTD
- VPBROADCASTQ
- VPBROADCASTW
- VPCMPEQB
- VPCMPEQD
- VPCMPEQQ
- VPCMPEQW
- VPCMPGTB
- VPCMPGTD
- VPCMPGTQ
- VPCMPGTW
- VPERM2I128
- VPERMD
- VPERMPD
- VPERMPS
- VPERMQ
- VPGATHERDD
- VPGATHERDQ
- VPGATHERQD
- VPGATHERQQ
- VPHADDD
- VPHADDSW
- VPHADDW
- VPHSUBD
- VPHSUBSW
- VPHSUBW
- VPMADDUBSW
- VPMADDWD
- VPMASKMOVD
- VPMASKMOVQ
- VPMAXSB
- VPMAXSD
- VPMAXSW
- VPMAXUB
- VPMAXUD
- VPMAXUW
- VPMINSB
- VPMINSD
- VPMINSW
- VPMINUB
- VPMINUD
- VPMINUW
- VPMOVMSKB
- VPMOVSXBD
- VPMOVSXBQ
- VPMOVSXBW
- VPMOVSXDQ
- VPMOVSXWD
- VPMOVSXWQ
- VPMOVZXBD
- VPMOVZXBQ
- VPMOVZXBW
- VPMOVZXDQ
- VPMOVZXWD
- VPMOVZXWQ
- VPMULDQ
- VPMULHRSW
- VPMULHUW
- VPMULHW
- VPMULLD
- VPMULLW
- VPMULUDQ
- VPOR
- VPSADBW
- VPSHUFB
- VPSHUFD
- VPSHUFHW
- VPSHUFLW
- VPSIGNB
- VPSIGND
- VPSIGNW
- VPSLLD
- VPSLLDQ
- VPSLLQ
- VPSLLVD
- VPSLLVQ
- VPSLLW
- VPSRAD
- VPSRAVD
- VPSRAW
- VPSRLD
- VPSRLDQ
- VPSRLQ
- VPSRLVD
- VPSRLVQ
- VPSRLW
- VPSUBB
- VPSUBD
- VPSUBQ
- VPSUBSB
- VPSUBSW
- VPSUBUSB
- VPSUBUSW
- VPSUBW
- VPUNPCKHBW
- VPUNPCKHDQ
- VPUNPCKHQDQ
- VPUNPCKHWD
- VPUNPCKLBW
- VPUNPCKLDQ
- VPUNPCKLQDQ
- VPUNPCKLWD
- VPXOR
BMI1 / BMI2 - Bit Manipulation Instruction Sets
FMA - Fused Multiply-Add instructions
- VFMADD123PD
- VFMADD123PS
- VFMADD123SD
- VFMADD123SS
- VFMADD132PD
- VFMADD132PS
- VFMADD132SD
- VFMADD132SS
- VFMADD213PD
- VFMADD213PS
- VFMADD213SD
- VFMADD213SS
- VFMADD231PD
- VFMADD231PS
- VFMADD231SD
- VFMADD231SS
- VFMADD312PD
- VFMADD312PS
- VFMADD312SD
- VFMADD312SS
- VFMADD321PD
- VFMADD321PS
- VFMADD321SD
- VFMADD321SS
- VFMADDSUB123PD
- VFMADDSUB123PS
- VFMADDSUB132PD
- VFMADDSUB132PS
- VFMADDSUB213PD
- VFMADDSUB213PS
- VFMADDSUB231PD
- VFMADDSUB231PS
- VFMADDSUB312PD
- VFMADDSUB312PS
- VFMADDSUB321PD
- VFMADDSUB321PS
- VFMSUB123PD
- VFMSUB123PS
- VFMSUB123SD
- VFMSUB123SS
- VFMSUB132PD
- VFMSUB132PS
- VFMSUB132SD
- VFMSUB132SS
- VFMSUB213PD
- VFMSUB213PS
- VFMSUB213SD
- VFMSUB213SS
- VFMSUB231PD
- VFMSUB231PS
- VFMSUB231SD
- VFMSUB231SS
- VFMSUB312PD
- VFMSUB312PS
- VFMSUB312SD
- VFMSUB312SS
- VFMSUB321PD
- VFMSUB321PS
- VFMSUB321SD
- VFMSUB321SS
- VFMSUBADD123PD
- VFMSUBADD123PS
- VFMSUBADD132PD
- VFMSUBADD132PS
- VFMSUBADD213PD
- VFMSUBADD213PS
- VFMSUBADD231PD
- VFMSUBADD231PS
- VFMSUBADD312PD
- VFMSUBADD312PS
- VFMSUBADD321PD
- VFMSUBADD321PS
- VFNMADD123PD
- VFNMADD123PS
- VFNMADD123SD
- VFNMADD123SS
- VFNMADD132PD
- VFNMADD132PS
- VFNMADD132SD
- VFNMADD132SS
- VFNMADD213PD
- VFNMADD213PS
- VFNMADD213SD
- VFNMADD213SS
- VFNMADD231PD
- VFNMADD231PS
- VFNMADD231SD
- VFNMADD231SS
- VFNMADD312PD
- VFNMADD312PS
- VFNMADD312SD
- VFNMADD312SS
- VFNMADD321PD
- VFNMADD321PS
- VFNMADD321SD
- VFNMADD321SS
- VFNMSUB123PD
- VFNMSUB123PS
- VFNMSUB123SD
- VFNMSUB123SS
- VFNMSUB132PD
- VFNMSUB132PS
- VFNMSUB132SD
- VFNMSUB132SS
- VFNMSUB213PD
- VFNMSUB213PS
- VFNMSUB213SD
- VFNMSUB213SS
- VFNMSUB231PD
- VFNMSUB231PS
- VFNMSUB231SD
- VFNMSUB231SS
- VFNMSUB312PD
- VFNMSUB312PS
- VFNMSUB312SD
- VFNMSUB312SS
- VFNMSUB321PD
- VFNMSUB321PS
- VFNMSUB321SD
- VFNMSUB321SS
MOVBE - Move Big-Endian instruction
TSX - Transactional Synchronization Extensions
Cores
All Haswell Chips
Haswell Chips | |||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
Main processor | IGP | ||||||||||
Model | µarch | Platform | Core | Launched | SDP | TDP | Freq | Max Mem | Name | Freq | Max Freq |
i5-4570R | Haswell | 4 June 2013 | 65 W 65,000 mW 0.0872 hp 0.065 kW | 2,700 MHz 2.7 GHz 2,700,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,150 MHz 1.15 GHz 1,150,000 KHz | |||
i5-4670R | Haswell | 4 June 2013 | 65 W 65,000 mW 0.0872 hp 0.065 kW | 3,000 MHz 3 GHz 3,000,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,300 MHz 1.3 GHz 1,300,000 KHz | |||
i7-4750HQ | Haswell | Shark Bay | 2 June 2013 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,000 MHz 2 GHz 2,000,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4760HQ | Haswell | Shark Bay | 14 April 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,100 MHz 2.1 GHz 2,100,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4770HQ | Haswell | Shark Bay | 20 July 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,200 MHz 2.2 GHz 2,200,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4770R | Haswell | Shark Bay | 4 June 2013 | 47 W 47,000 mW 0.063 hp 0.047 kW | 3,200 MHz 3.2 GHz 3,200,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,300 MHz 1.3 GHz 1,300,000 KHz | ||
i7-4850EQ | Haswell | Shark Bay | 20 February 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 1,600 MHz 1.6 GHz 1,600,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 650 MHz 0.65 GHz 650,000 KHz | ||
i7-4850HQ | Haswell | Shark Bay | 4 June 2013 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,300 MHz 2.3 GHz 2,300,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4860EQ | Haswell | Shark Bay | 20 February 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 1,800 MHz 1.8 GHz 1,800,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 750 MHz 0.75 GHz 750,000 KHz | ||
i7-4860HQ | Haswell | Shark Bay | 19 January 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,300 MHz 2.3 GHz 2,300,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4870HQ | Haswell | Shark Bay | 20 July 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,500 MHz 2.5 GHz 2,500,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,200 MHz 1.2 GHz 1,200,000 KHz | ||
i7-4930MX | Haswell | Shark Bay | Haswell | 2 June 2013 | 57 W 57,000 mW 0.0764 hp 0.057 kW | 3,000 MHz 3 GHz 3,000,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel HD Graphics 4600 | 400 MHz 0.4 GHz 400,000 KHz | 1,350 MHz 1.35 GHz 1,350,000 KHz | |
i7-4940MX | Haswell | Shark Bay | Haswell | 19 January 2014 | 57 W 57,000 mW 0.0764 hp 0.057 kW | 3,100 MHz 3.1 GHz 3,100,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel HD Graphics 4600 | 400 MHz 0.4 GHz 400,000 KHz | 1,350 MHz 1.35 GHz 1,350,000 KHz | |
i7-4950HQ | Haswell | Shark Bay | 4 June 2013 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,400 MHz 2.4 GHz 2,400,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,300 MHz 1.3 GHz 1,300,000 KHz | ||
i7-4960HQ | Haswell | Shark Bay | 1 September 2013 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,600 MHz 2.6 GHz 2,600,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,300 MHz 1.3 GHz 1,300,000 KHz | ||
i7-4980HQ | Haswell | Shark Bay | 1 September 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 2,800 MHz 2.8 GHz 2,800,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 1,300 MHz 1.3 GHz 1,300,000 KHz | ||
i7-5960X | Haswell | X99 | Haswell E | 29 August 2014 | 140 W 140,000 mW 0.188 hp 0.14 kW | 3,000 MHz 3 GHz 3,000,000 kHz | 65,536 MiB 67,108,864 KiB 68,719,476,736 B 64 GiB 0.0625 TiB | ||||
E3-1284L v3 | Haswell | 1 October 2014 | 47 W 47,000 mW 0.063 hp 0.047 kW | 1,800 MHz 1.8 GHz 1,800,000 kHz | 32,768 MiB 33,554,432 KiB 34,359,738,368 B 32 GiB 0.0313 TiB | Intel Iris Pro Graphics 5200 | 200 MHz 0.2 GHz 200,000 KHz | 750 MHz 0.75 GHz 750,000 KHz | |||
E5-2670 v3 | Haswell | Grantley EP 2S | Haswell EP | 8 September 2014 | 120 W 120,000 mW 0.161 hp 0.12 kW | 2,300 MHz 2.3 GHz 2,300,000 kHz | 804,864 MiB 824,180,736 KiB 843,961,073,664 B 786 GiB 0.768 TiB |