Pardon my English
In this article I give the example of assembler optimization of the C code.
Code is HashTable. The algorithm is:
- Read items(words) from dict
- For each item compute hash
- Add item in the hash table
- If item already exist in the hash table, bound it with the last added item in this cell
The 4 point can be visualized with this picture
I used Callgrind profile with KCacheGrind to analyse the slowest functions in my code. In the Kcashgrind sorted functions by self
, to display only current function's time. After that I rewrote this function in clear asm
The profile output before optimization:
As we can see, the majority of the time takes ListValid()
function. It is bad! Because, Valid functions won't be in production. Let's delete these functions and run profile. Now our output:
Now the majority of time take memory control functions. But, I can't optimzie malloc and free functions. Let's multiply the HashTable code FROM:
RunTest(Hash_First, list_of_lines, number_lines, name_of_result_files[0]);
RunTest(Hash_Sum, list_of_lines, number_lines, name_of_result_files[1]);
RunTest(Hash_Len, list_of_lines, number_lines, name_of_result_files[2]);
RunTest(Hash_Rotate, list_of_lines, number_lines, name_of_result_files[3]);
RunTest(Hash_GNU, list_of_lines, number_lines, name_of_result_files[4]);
RunTest(Hash_Rot13, list_of_lines, number_lines, name_of_result_files[5]);
RunTest(Hash_FAQ6, list_of_lines, number_lines, name_of_result_files[6]);
RunTest(Hash_Ly, list_of_lines, number_lines, name_of_result_files[7]);
RunTest(Hash_Rs, list_of_lines, number_lines, name_of_result_files[8]);
TO:
for (int i = 0; i < 100; ++i) {
RunTest(Hash_First, list_of_lines, number_lines, name_of_result_files[0]);
RunTest(Hash_Sum, list_of_lines, number_lines, name_of_result_files[1]);
RunTest(Hash_Len, list_of_lines, number_lines, name_of_result_files[2]);
RunTest(Hash_Rotate, list_of_lines, number_lines, name_of_result_files[3]);
RunTest(Hash_GNU, list_of_lines, number_lines, name_of_result_files[4]);
RunTest(Hash_Rot13, list_of_lines, number_lines, name_of_result_files[5]);
RunTest(Hash_FAQ6, list_of_lines, number_lines, name_of_result_files[6]);
RunTest(Hash_Ly, list_of_lines, number_lines, name_of_result_files[7]);
RunTest(Hash_Rs, list_of_lines, number_lines, name_of_result_files[8]);
}
And from calloc memory:
ListNode* new_node = (ListNode*)calloc(1, sizeof(ListNode));
To fixed memory:
ListNode memory[3330927] = {}; // 3330927 - constant == number of ListPush calls
// was calculated by static variable
...
static int number_calls = 0;
ListNode* new_node = memory + number_calls;
number_calls++;
I inserted these lines in the specific my_calloc function. Don't forget delete all free's! So, the final attempt for profile:
YES!!!! ListPush and HashTablePush!
ListPush pretty easy:
//////////////////////////////// LIST //////////////////////////////
struct ListNode {
Elem_t data;
ListNode* next;
};
struct List {
ListNode* first_elem;
size_t number_nodes;
ListNode* last_elem;
};
////////////////////////////////////////////////////////////////////
int ListPush(List* list, Elem_t value) {
static int number_calls = 0;
ListNode* new_node = memory + number_calls++;
//init node
new_node->data = value;
new_node->next = NULL;
if (list->number_nodes == 0) {
list->first_elem = list->last_elem = new_node;
}
else {
list->last_elem->next = new_node;
list->last_elem = new_node;
}
list->number_nodes += 1;
return 0;
}
Let's see, assembler version of that function with g++ -s list_pointers.cpp
Don't try to understand all this stuff, just be scared of this code. Pretty ugly, yeah?
_Z8ListPushP4ListPc:
.LFB17:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movl _ZZ8ListPushP4ListPcE12number_calls(%rip), %eax
leal 1(%rax), %edx
movl %edx, _ZZ8ListPushP4ListPcE12number_calls(%rip)
cltq
salq $4, %rax
movq %rax, %rdx
leaq memory(%rip), %rax
addq %rdx, %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq -32(%rbp), %rdx
movq %rdx, (%rax)
movq -8(%rbp), %rax
movq $0, 8(%rax)
movq -24(%rbp), %rax
movq 8(%rax), %rax
testq %rax, %rax
jne .L11
movq -24(%rbp), %rax
movq -8(%rbp), %rdx
movq %rdx, 16(%rax)
movq -24(%rbp), %rax
movq 16(%rax), %rdx
movq -24(%rbp), %rax
movq %rdx, (%rax)
jmp .L12
.L11:
movq -24(%rbp), %rax
movq 16(%rax), %rax
movq -8(%rbp), %rdx
movq %rdx, 8(%rax)
movq -24(%rbp), %rax
movq -8(%rbp), %rdx
movq %rdx, 16(%rax)
.L12:
movq -24(%rbp), %rax
movq 8(%rax), %rax
leaq 1(%rax), %rdx
movq -24(%rbp), %rax
movq %rdx, 8(%rax)
movl $0, %eax
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
Extremely slow, because of the stack manipulation, useful operations...
But, with g++ -s list_pointers.cpp -O2
I got better code, but still with stack and some light brakers.
So, what you should do to rewrite code in assembler. My solution:
- Make assembler listing for desired file wtih
g++ -s list_pointers.cpp -O0
- Just copy desired function in new file, list_push.s in my example. It was assembled with mangling. It is okey, because we will write
extern
, which, by default, meansextern "C++"
, which means C++ names. - Don't forget to write
.globl <function_name>
in assembler file - In function declaration in the .cpp file, where desired function lived, wrtie:
extern ListPush(...);
. And delete function definition - Compile:
g++ list_pointers.cpp list_push.s -O2
- Now, if all works, with http://godbolt.org optimize you .s file
After a hard coding I wrote this function in clear assembler. It has documentation, so you can read it carefully
.globl _Z8ListPushP4ListPc
.type _Z8ListPushP4ListPc, @function
_Z8ListPushP4ListPc:
# new_node = memory + number_calls++;
movq _ZZ8ListPushP4ListPcE12number_calls(%rip), %rax
leal 1(%rax), %edx
movl %edx, _ZZ8ListPushP4ListPcE12number_calls(%rip)
salq $4, %rax
leaq memory(%rip), %rdx
addq %rax, %rdx
# new_node->data = value;
# new_node->next = NULL;
movq %rsi, (%rdx)
movq $0, 8(%rdx)
# if (list->number_nodes == 0) {
movq 8(%rdi), %rax
testq %rax, %rax
jne .L11
# list->number_nodes += 1;
addq $1, %rax
movq %rax, 8(%rdi)
# list->first_elem = list->last_elem = new_node;
movq %rdx, 16(%rdi)
movq %rdx, (%rdi)
xorl %eax, %eax
ret
.L11:
# list->number_nodes += 1;
addq $1, %rax
movq %rax, 8(%rdi)
# list->last_elem->next = new_node;
# list->last_elem = new_node;
movq 16(%rdi), %rax
movq %rdx, 8(%rax)
movq %rdx, 16(%rdi)
xorl %eax, %eax
ret
- Deleted all stack manipulation
- Deleted some strange expressions:
movq -8(%rbp), %rax
- Less jmp's
- And some small one's:
movl $0, %eax -> xorl %eax, %eax
Fabulous. I also rewrote the HashTablePush function Let't check the speed of this function!
As you can see, ListPush and HashTablePush reduced their time. Comparing with old version of the program So we get