cmp %r0, %r1 This says to compare the values of the registers
r0 and r1, and set the control status register
(CSR) to reflect the outcome. The CSR
will store whether (r0==r1), (r0 < r1) or (r0 > r1).
b l1 This says go (branch) directly to label l1. This sets
the pc to l1 rather than (pc+4). Note that you
can't "return" from a branch like you can from a
"jsr" statement.
beq l1 This says that if the CSR denotes that the two compared
values are equal, go (set the pc) to label l1.
If the two compared values are not equal, the
next statement (pc+4) is executed.
ble l1
blt l1
bge l1
bgt l1
bne l1     These should be obvious (<=, <, >=, >, !=, respectively).
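For instance, suppose i is in register r0 and j is in register r1, and some statement S1 should execute only when (i <= j). You compare the registers and then branch on the negation of the test (bgt) around S1. Here is a minimal sketch, with an arbitrary label name l1:

cmp %r0, %r1 / Set the CSR by comparing r0 and r1
bgt l1 / Negation of (i <= j) -- skip S1 when i > j
S1 / Executed only when (i <= j)
l1:
S2 / Always executed

This "branch on the negation of the test" idea is exactly what the translation templates below use.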
if (cond) {
S1
} else {
S2
}
S3
translates to:

set up conditional branch on the negation of the conditional to l1
S1
b l2
l1: S2
l2: S3
For example, here is a program that does a simple if-then-else:
cond1.c
int a(int i, int j)
{
int k;
if (i < j) {
k = i;
} else {
k = j;
}
return k;
}
int main()
{
return a(3, 4);
}
cond1.jas
a:
push #4 / Allocate k
ld [fp+12] -> %r0 / Compare i & j
ld [fp+16] -> %r1 / Branch on negation of less-than
cmp %r0, %r1
bge l1
ld [fp+12] -> %r0 / k = i
st %r0 -> [fp]
b l2
l1:
ld [fp+16] -> %r0 / k = j
st %r0 -> [fp]
l2:
ld [fp] -> %r0 / return k
ret
main:
mov #4 -> %r0
st %r0 -> [sp]--
mov #3 -> %r0
st %r0 -> [sp]--
jsr a
ret
Trace through it with jassem.tcl to see that r0 will be three when main() returns.
while (cond) {
S1
}
S2
translates to:

l1: set up conditional branch on the negation of the conditional to l2
S1
b l1
l2: S2
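The notes don't include a separate example program for the while loop, so here is a sketch of how the template plays out. Assume a hypothetical function a whose only argument k is at [fp+12] and whose one local i is at [fp]; it sets i to zero, does while (i < k) i = i + 1, and returns i:

a:
push #4 / Allocate i
st %g0 -> [fp] / i = 0
l1:
ld [fp] -> %r0 / Branch on the negation of (i < k)
ld [fp+12] -> %r1
cmp %r0, %r1
bge l2
ld [fp] -> %r0 / i = i + 1 (S1)
add %r0, %g1 -> %r0
st %r0 -> [fp]
b l1
l2:
ld [fp] -> %r0 / return i (S2)
ret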
for (S1; cond; S2) {
S3
}
S4
translates to:

S1
b l2
l1: S2
l2: set up conditional branch on the negation of the conditional to l3
S3
b l1
l3: S4
cond2.c
int a(int k)
{
int i, j;
j = 0;
for (i = 1; i <= k; i++) j += i;
return j;
}
int main()
{
int i;
i = a(4);
}
cond2.jas
a:
push #8 / Allocate i and j on the stack
st %g0 -> [fp-4] / Set j to zero
st %g1 -> [fp] / Initialize the for loop (S1)
b l2
l1:
ld [fp] -> %r0 / Do i++ (S2)
add %r0, %g1 -> %r0
st %r0 -> [fp]
l2:
ld [fp] -> %r0 / Perform the test, and
ld [fp+12] -> %r1 / branch on the negation
cmp %r0, %r1
bgt l3
ld [fp-4] -> %r0 / Do j += i (S3)
ld [fp] -> %r1
add %r0, %r1 -> %r0
st %r0 -> [fp-4]
b l1
l3:
ld [fp-4] -> %r0 / return j (S4)
ret
main:
push #4
mov #4 -> %r0
st %r0 -> [sp]--
jsr a
pop #4
st %r0 -> [fp]
ret
By now, recursive procedures shouldn't seem mysterious. Below, I have a recursive implementation of factorial:
fact4.c
int fact(int i)
{
if (i == 0) return 1;
return fact(i-1)*i;
}
int main()
{
int i;
i = fact(4);
}
fact4.jas
fact:
ld [fp+12] -> %r0 / do the if statement
cmp %r0, %g0
bne l1
mov %g1 -> %r0
ret
l1:
ld [fp+12] -> %r0 / push i-1 on the stack
add %r0, %gm1 -> %r0
st %r0 -> [sp]--
jsr fact / recursive call to fact
pop #4 / pop the argument off the stack
ld [fp+12] -> %r1 / multiply fact(i-1)*i
mul %r0, %r1 -> %r0
ret
main:
push #4
mov #4 -> %r0
st %r0 -> [sp]--
jsr fact
pop #4
st %r0 -> [fp]
ret
Each recursive call pushes a new stack frame. You can use jassem.tcl on the program fact4.c, compiled into fact4.jas, to trace through fact(4).
Additionally, I have two drawings that you can use for studying: fact-unlabeled.png (and fact-unlabeled.pdf) show a snapshot of the stack at a certain point in the execution of fact4.c. One potential test or homework question would be to label every byte on the stack and say where we are in the program. The answer is in fact-labeled.png (and fact-labeled.pdf).
void bsort(int *a, int size)
{
int i, j, tmp;
for (i = size-1; i > 0; i--) {
for (j = 0; j < i; j++) {
if (a[j] > a[j+1]) {
tmp = a[j];
a[j] = a[j+1];
a[j+1] = tmp;
}
}
}
}
main()
{
int array[4];
array[0] = 6;
array[1] = 1;
array[2] = 4;
array[3] = 2;
bsort(array, 4);
}
There are a lot of array operations here, so the assembly
code is lengthy. It is in bsort.jas,
and below:
bsort:
push #12 / i=fp-8, j=fp-4, tmp=fp
st %r2 -> [sp]-- / Spill r2
/ For loop #1: labels f11, f12, f13
ld [fp+16] -> %r0 / i = size-1
add %r0, %gm1 -> %r0
st %r0 -> [fp-8]
b f12
f11:
ld [fp-8] -> %r0 / i--
add %r0, %gm1 -> %r0
st %r0 -> [fp-8]
f12:
ld [fp-8] -> %r0 / i > 0
cmp %r0, %g0
ble f13
/ For loop #2: labels f21, f22, f23
st %g0 -> [fp-4] / j = 0
b f22
f21:
ld [fp-4] -> %r0 / j++
add %r0, %g1 -> %r0
st %r0 -> [fp-4]
f22:
ld [fp-4] -> %r0
ld [fp-8] -> %r1
cmp %r0, %r1
bge f23
/ If (a[j] > a[j+1])
ld [fp-4] -> %r0 / First put a[j] into register r0
mov #4 -> %r1
mul %r0, %r1 -> %r0
ld [fp+12] -> %r1
add %r0, %r1 -> %r0
ld [r0] -> %r0
ld [fp-4] -> %r1 / Now put a[j+1] into register r1
add %r1, %g1 -> %r1 / without touching r0
mov #4 -> %r2
mul %r1, %r2 -> %r1
ld [fp+12] -> %r2
add %r1, %r2 -> %r1
ld [r1] -> %r1
cmp %r0, %r1
ble i1
ld [fp-4] -> %r0 / tmp = a[j]
mov #4 -> %r1
mul %r0, %r1 -> %r0
ld [fp+12] -> %r1
add %r0, %r1 -> %r0
ld [r0] -> %r0
st %r0 -> [fp]
ld [fp-4] -> %r0 / a[j] = a[j+1]
add %r0, %g1 -> %r0 / Load a[j+1] into r0
mov #4 -> %r1
mul %r0, %r1 -> %r0
ld [fp+12] -> %r1
add %r0, %r1 -> %r0
ld [r0] -> %r0
ld [fp-4] -> %r1 / Load &(a[j]) into r1
mov #4 -> %r2
mul %r1, %r2 -> %r1
ld [fp+12] -> %r2
add %r1, %r2 -> %r1
st %r0 -> [r1] / Store r0 into a[j]
ld [fp] -> %r0 / a[j+1] = tmp
ld [fp-4] -> %r1
add %r1, %g1 -> %r1
mov #4 -> %r2
mul %r1, %r2 -> %r1
ld [fp+12] -> %r2
add %r1, %r2 -> %r1
st %r0 -> [r1]
i1: / End of if statement
b f21 / End of for loop #2
f23:
b f11 / End of for loop #1
f13:
ld ++[sp] -> %r2
ret
main:
push #16
mov #-1 -> %r2 / This is just to show spilling
mov #6 -> %r0
st %r0 -> [fp-12]
mov #1 -> %r0
st %r0 -> [fp-8]
mov #4 -> %r0
st %r0 -> [fp-4]
mov #2 -> %r0
st %r0 -> [fp]
mov #4 -> %r0
st %r0 -> [sp]--
mov #12 -> %r0
sub %fp, %r0 -> %r0
st %r0 -> [sp]--
jsr bsort
pop #8
ret
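One more thing worth noticing in bsort.jas is the spill of register r2. The body of bsort needs three scratch registers, so at entry it saves the caller's value of r2 on the stack and restores it just before returning (main sets r2 to -1 before the call just to show this off). The pattern, pulled out of the listing above:

st %r2 -> [sp]-- / On entry: save (spill) the caller's r2
... / The body can now use r2 freely
ld ++[sp] -> %r2 / Just before returning: restore r2
ret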
The execution of this with jassem is a bit cumbersome -- it goes blazingly fast on my Linux box, but not on my Windows box -- this is not the most efficient Tcl/Tk code in the world. Oh well. As always, make sure you understand both the translation to assembly code and the workings of the assembler. Yes, this code is grossly inefficient and can be made worlds faster with the judicious use of some registers.
I will only go over this in class if there is time. If not, only read this if you are interested. Since you are not supposed to optimize code, you are not supposed to implement things like this on homeworks or exams.
Suppose you write a recursive procedure with the property that the value of the recursive call is returned directly. For example, suppose you decided to write the recursive factorial function as in fact5.c:
int fact(int i, int val_so_far)
{
if (i == 0) return val_so_far;
return fact(i-1, val_so_far * i);
}
int main()
{
int i;
i = fact(5, 1);
}
Since the recursive call is simply returned, there is a tricky compiler optimization called "tail recursion." What it does is the following:
fact:
ld [fp+12] -> %r0 / This is the base case.
cmp %r0, %g0 / If i == 0, return val_so_far
bne l1
ld [fp+16] -> %r0
ret
l1:
ld [fp+16] -> %r0 / Replace val_so_far
ld [fp+12] -> %r1 / with (val_so_far * i)
mul %r0, %r1 -> %r0
st %r0 -> [fp+16]
ld [fp+12] -> %r0 / Replace i with i-1
add %r0, %gm1 -> %r0
st %r0 -> [fp+12]
b fact / Branch back instead of doing recursion.
main:
push #4
st %g1 -> [sp]--
mov #5 -> %r0
st %r0 -> [sp]--
jsr fact
pop #8
st %r0 -> [fp]
ret
That's a neat optimization, isn't it? It's one that functional programmers love, because it allows them to write their tricky, modification-free functional programs and claim that they can run as fast as normal programs.
I will only go over this in class if there is time. If not, only read this if you are interested.
This is just for your own knowledge: in all assembler assignments in class, on homeworks and on tests, assume that there is no delay slot.
Reading assembler from a random machine can be difficult, but usually you can figure out how its assembler maps onto the one defined in this class. One point of confusion, which arose when our department had Sun Sparc processors, is the delay slot. There is a technique for speeding up processors called "pipelining," which means that the CPU doesn't finish executing the current instruction before it starts executing the next one. Usually this does not cause much confusion. However, on jsr, ret and b instructions there is a problem: these instructions change the pc, which means that the next sequential instruction should not be executed. But on a pipelined processor, by the time the branching instruction is done, the next instruction has already been partially executed.
The solution on Sparcs is that the instruction after the jsr, ret and b is executed, and then control goes to the changed value of the pc. This instruction -- the one after the jsr, ret or b -- is known as the delay slot. Note that the semantics of jsr must change too -- it must push pc+8 onto the stack so that when ret is called, it returns to the instruction after the delay instruction.
It is up to the compiler writers to ensure that this slot is used correctly. Without compiler optimization, most compilers simply insert a noop after the jsr, ret or b. For example:
a(int i)
{
return b(i+1)+1;
}
compiles to:
a:
ld [fp+12] -> %r0 / Push i+1 onto the stack
add %r0, %g1 -> %r0
st %r0 -> [sp]--
jsr b / Call procedure b
noop / Delay slot
pop #4
add %r0, %g1 -> %r0 / Put b(i+1)+1 into r0
ret / return
noop / delay slot

An optimized compiler, however, will use the delay slot, which makes code harder to read, since you have to remember that the instruction after the jsr, ret or b gets executed. Moreover, subroutines return to the instruction after the instruction after the jsr call. Here's an example of the above procedure compiled in such a way that the delay slots following the jsr and ret statements are used:
a:
ld [fp+12] -> %r0 / Push i+1 onto the stack
add %r0, %g1 -> %r0
jsr b / Call procedure b
st %r0 -> [sp]--
pop #4
ret / return
add %r0, %g1 -> %r0 / Put b(i+1)+1 into r0 -- this gets executed before the return actually occurs.