ArcSim - HiPEAC`10 Poster

Transcription

ArcSim - HiPEAC`10 Poster
& Collaborative Systems
Institute for Computing
Systems Architecture
Centre for Intelligent Systems
& their Applications
Processor Automated Synthesis by iTerative Analysis Project
Laboratory for Foundations
of Computer Science
http://groups.inf.ed.ac.uk/pasta/
Institute of Perception,
Action and Behaviour
Ultra-Fast Functional Instruction Set Simulation
Institute for Adaptive
& Neural Computation
e-Science Institute
JIT Dynamic Binary Translation
HOTSPOT Detection
Large Translation Units
PC address
Basic Block Mode
SCC Mode
A
BB
A
SCC
A
B
BB
B
SCC
B
Simulation time is partitioned into epochs
Epoch is interval between two translations
Block
Translated
Yes
Call Translated Block
Function
BB
C
D
Frequently executed blocks are HOTSPOTS
No
HOTSPOTS are translated into native code
New Block
Yes
BB
C
BB
E
F
BB
F
G
BB
G
PAGE Mode
CFG Mode
CFG
C
D
E
Informatics Life-Sciences Institute
PAGE CFG
Flexible Faster Than Silicon
Instruction Set Simulator
D
E
F
LINEAR
Add Block to Worklist
G
No
JIT Dynamic Binary Translation
Interpretive Block
Simulation
JIT DBT Engine
1
Increment Block
Simulation Counter
Compute instruction side effects
ISS Interpretive Mode - Functional
ISS JIT DBT Mode - Functional
+"""'
Map ARCompact code onto native code
Determine operand dependencies
Speed Optimised ASIP
[MIPS]
Full Virtual Simulation Platform
Faster than Silicon*
*""'
#""'
Instruction Selection
1
,""'
Yes
Emit native target instructions
No
Emit performance model updates
1
Dependency Analysis
End of Epoch
2
)"&$%*'
)""'
Emit Target Instruction
Translate Blocks in
Worklist
Large Translation Units
%""'
Simulation Modes
&""'
Instruction Set Extension API
(""'
2
Emit Performance Model
Update
!"#$%&'
!""'
Verified Cycle-Accurate Simulation
Larger blocks result in improved locality
More scope for JIT compiler optimisations
Features and APIs
Update Translated
Block Map
High-Speed JIT DBT Simulation
+""'
Compile and Load
("$)%'
* ENCORE
'
@M
<0
I'
89
40
/
-4
F'
-A
04
-I
0'
Fast Cycle-Approximate Simulation
8M<
0I
9;
<6
'
<F
GC
9H
'
<>
34
8?
"+
<7
'
H
/
9@
"+
4I
'
:8
/
J"
+'
4I
:?
<I
"+
'
4I
:J
1K
"+
'
49
330
49
"+
73
'
0C
99
F7
<'
4;
<0
0@
"+
'
3:
C9
9F
"+
'
30
E3
"+
'
L;
<4
F"
+'
A1
30
4:
""
'
23
""
'
1@
83
4>
"+
'
114
D3
"+
'
/
-3
41E
"+
'
-!
./
0"
+'
-1
23
4"
+'
-1
54
6"
+'
-1
12
3"
+'
-7
38
94
""
'
:;0
6<
"+
'
:0
=10
4"
+
:1
3/ '
><
"+
8'
8?
0:
"+
'
8>4
@4
"+
89
'
>A
0>
""
'
@1
3?
04
"+
'
B
13C"
"'
"'
Powerful plugin API for implementing instruction set extensions. ISE plugins are
as powerful as baseline instructions and integrate into cycle-accurate and high-speed JIT DBT simulation modes.
Memory Mapped IO API
Simple yet very powerful API for implementing user defined memory mapped IO
devices such as sound, screen, ethernet...
Co-Simulation for HW Verification
@ 350MHz using 90nm
System Call Emulation
Emulation implementations for Linux system calls as well as support for
redirection of stdin, stdout, and stderr are provided and enable the standalone
simulation of binaries without an operating system.
Detailed Profiling Simulation Mode
Ultra-Fast Cycle Accurate Instruction Set Simulation
FETCH
DECODE
MEMORY
EXECUTE
Next Fetch
PC
PC
....
0x00000848:
[0x00000848] ext     r2,r9
[0x0000084c] xor     r3,r12,r2
[0x00000850] and     r3,r3,0xf
[0x00000854] asl     r3,r3,0x3
[0x00000858] and     r2,r2,0x7
[0x0000085c] or      r3,r3,r2
[0x00000860] asl     r4,r3,0x8
[0x00000864] brcc.d  r10,r13,0x2c
[0x00000868] or      r4,r4,r3
....
extern CpuState cpu;
// global processor state
1
void BLK_0x00000848(void) {
cpu.r[2] = (uint16_t)(cpu.r[9]);
pipeline(0,cpu.avail[9],&(cpu.avail[2]),0x00000848,1,0);
2
cpu.r[3] = cpu.r[12] ^ cpu.r[2];
pipeline(cpu.avail[12],cpu.avail[2],&(cpu.avail[3]),0x0000084c,1,0);
cpu.r[3] = cpu.r[3] & (uint32_t)15;
pipeline(cpu.avail[3],0,&(cpu.avail[3]),0x00000850,1,0);
cpu.r[3] = cpu.r[3] << ((sint8_t)3 & 0x1f);
pipeline(cpu.avail[3],0,&(cpu.avail[3]),0x00000854,1,0);
cpu.r[2] = cpu.r[2] & (uint32_t)7;
pipeline(cpu.avail[2],0,&(cpu.avail[2]),0x00000858,1,0);
cpu.r[3] = cpu.r[3] | cpu.r[2];
pipeline(cpu.avail[3],cpu.avail[2],&(cpu.avail[3]),0x0000085c,1,0); 3
cpu.r[4] = cpu.r[3] << ((sint8_t)8 & 0x1f);
pipeline(cpu.avail[3],0,&(cpu.avail[4]),0x00000860,1,0);
// compare and branch instruction with delay slot
pipeline(cpu.avail[10],cpu.avail[13],&(ignore),0x00000864,1,0);
if (cpu.r[10] >= cpu.r[13]) {
cpu.pl[FE] = cpu.pl[ME] - 1; // branch penalty
4
fetch(0x0000086c);
// speculative fetch due to branch pred.
cpu.auxr[BTA] = 0x00000890; // set BTA register
cpu.D = 1;
// set delay slot bit
} else {
5
cpu.pc = 0x0000086c;
}
cpu.r[4] = cpu.r[4] | cpu.r[3];// delay slot instruction
pipeline(cpu.avail[4],cpu.avail[3],&(cpu.avail[4]),0x00000868,1,0);
if (cpu.D) {
// branch was taken
cpu.D = 0;
// clear delay slot bit
cpu.pc = cpu.auxr[BTA];
// set PC
}
cpu.cycles = cpu.pl[WB];
// set total cycle count at end of block
return;
}
Data Structures
ZOL
Logic
Instruction Cache
Tags
// Pipeline stages, listed in the order fetch -> write back.
// STAGES is not a stage itself: it is the number of stages.
typedef enum {
    FE = 0,     // fetch
    DE = 1,     // decode
    EX = 2,     // execute
    ME = 3,     // memory
    WB = 4,     // write back
    STAGES = 5  // 5 stages
} Stage;
// processor state
typedef struct {
uint32_t pc;
6
// general purpose registers
uint32_t r[REGS];
// auxiliary registers
uint32_t auxr[AUXREGS];
// status flags (H...halt bit)
char
L,Z,N,C,V,U,D,H;
// per stage cycle count
7
uint64_t pl[STAGES];
// per register cycle count
uint64_t avail[REGS];
// total cycle count
uint64_t cycles;
// used when insn. does not
// produce result
uint64_t ignore;
} CpuState;
Interactive debugging interface supporting single step tracing of instructions,
debugging of processor state, setting of breakpoints...
PC
Inst
Data
Q
PC
Inst
Decode
Logic
r0
Align
Logic
B
PC
Register
File
Hit &
Select
Logic
WRITEBACK
BRcc/BBIT Target
Logic
Bcc/Jcc Target
Logic
JIT Translated Block with Performance Model
Debugging and Tracing
ENCORE 5-Stage Pipeline Hardware Model
JIT Performance Model Translation
Block of ARCompact™ Instructions
Profiling simulation mode is orthogonal to the above modes and provides
detailed statistics about various simulation aspects such as dynamic instruction
frequencies, per-instruction latency distributions, detailed cache-statistics,
branch predictor statistics, detailed register usage statistics...
ALU
Bypass
Logic
Limm
PC
ABS
MIN
MAX
SELECT
Select
Result
Bypass
Logic
Data
Cache
Input
Select
&
Control
Logic
Exception
&
Replay
Logic
Data Cache
Tags
Data
Instruction Set Simulator
Hit &
Select
Logic
void pipeline(uint64_t opd1, uint64_t opd2, uint64_t* dst1, uint64_t* dst2, uint32_t faddr, uint32_t xc, uint32_t mc) {
cpu.pl[FE] += fetch(faddr);
// FETCH
- account for instruction fetch latency
if (cpu.pl[FE] < cpu.pl[DE]) cpu.pl[FE] = cpu.pl[DE]; // INVARIANT - resolves structural hazard
cpu.pl[DE] = max3((cpu.pl[FE] + 1), opd1, opd2);
// DECODE
- determine operand availability time
if (cpu.pl[DE] < cpu.pl[EX]) cpu.pl[DE] = cpu.pl[EX]; //
cpu.pl[EX] = *dst1 = cpu.pl[DE] + xc;
// EXECUTE
- account for execution latency and destination
if (cpu.pl[EX] < cpu.pl[ME]) cpu.pl[EX] = cpu.pl[ME]; //
availability time
cpu.pl[ME] = *dst2 = cpu.pl[EX] + mc;
// MEMORY
- account for memory latency and destination
if (cpu.pl[ME] < cpu.pl[WB]) cpu.pl[ME] = cpu.pl[WB]; //
availability time
cpu.pl[WB] = cpu.pl[ME] + 1;
// WRITEBACK
}
Speed Optimised FPGA
[MIPS]
Load
Align
Screen
DATA MEMORY PIPELINE
ENCORE 5-Stage Pipeline JIT Generated Software Model
ISS Interpretive Mode - Cycle Accurate
Virtual Devices
1
2
3
4
5
Novel Software
Pipeline Model
Structural
Dependencies
2
Operand
Dependency Timing
3
Side Effect
Availability Timing
4
5
UART
ISS JIT DBT Mode - Cycle Accurate
(+&
(*&
Faster than FPGA*
Sound
++&
+*&
)+&
)*&
High-Speed Performance Modeling
Key idea is to retain a high-speed instruction-by-instruction execution model and reconstruct microarchitectural
''#()&
'+&
!"#$%&
'*&
!+&
pipeline state at instruction commit boundaries. In order to increase simulation speed we introduce an innovative
pipeline software model and perform JIT dynamic binary translation. JIT generated code is augmented with
!*&
%+&
$#""&
%*&
+&
8:
;5
&
;E
FB
8G
&
;=
237
>*
;6
%&
G
.
8?
*%
3H
&
97
.
I*
%&
3H
9>
;H
*%
&
3H
9I
0J
*%
&
38
2,
2/
38
*%
62
&
/B
88
E6
;&
3:
;/
/?
*%
&
29
B8
8E
*%
&
2/
D2
*%
&
K:
;3
E*
%&
@0
2/
39
**
&
7L;
/H
&
?L
;/
H&
78
3/
.
,3
E&
,@
/3
,H
/&
/*
%&
,0
12
3*
%&
,0
43
5*
%&
,0
01
2*
%&
,6
27
83
**
&
9,
:/
5;
*%
&
9/
<0/
3*
%
90
2. &
=;
*%
&
7,
7>
/9
*%
&
7,
=3
?3
*%
78
&
=@
/=
**
&
?0
2>
/3
*%
&
A
02,
B*
*&
12
**
&
0?
72
3=
*%
&
003
C2
*%
&
.
,2
30D
*%
&
*&
,!
-.
highly optimised JIT generated code for performance model updates.
* Speed-Opt.
FPGA @ 50MHz
Instruction Set
Extensions

Similar documents