First of all

Pardon my English

Abstract

In this article I give the example of assembler optimization of the C code.

Code is HashTable. The algorithm is:

Read items(words) from dict
For each item compute hash
Add item in the hash table
If item already exist in the hash table, bound it with the last added item in this cell

The 4 point can be visualized with this picture

I used Callgrind profile with KCacheGrind to analyse the slowest functions in my code. In the Kcashgrind sorted functions by self, to display only current function's time. After that I rewrote this function in clear asm

Analyzing

The profile output before optimization:

As we can see, the majority of the time takes ListValid() function. It is bad! Because, Valid functions won't be in production. Let's delete these functions and run profile. Now our output:

Now the majority of time take memory control functions. But, I can't optimzie malloc and free functions. Let's multiply the HashTable code FROM:

  RunTest(Hash_First,  list_of_lines, number_lines, name_of_result_files[0]);
  RunTest(Hash_Sum,    list_of_lines, number_lines, name_of_result_files[1]);
  RunTest(Hash_Len,    list_of_lines, number_lines, name_of_result_files[2]);
  RunTest(Hash_Rotate, list_of_lines, number_lines, name_of_result_files[3]);
  RunTest(Hash_GNU,    list_of_lines, number_lines, name_of_result_files[4]);
  RunTest(Hash_Rot13,  list_of_lines, number_lines, name_of_result_files[5]);
  RunTest(Hash_FAQ6,   list_of_lines, number_lines, name_of_result_files[6]);
  RunTest(Hash_Ly,     list_of_lines, number_lines, name_of_result_files[7]);
  RunTest(Hash_Rs,     list_of_lines, number_lines, name_of_result_files[8]);

TO:

for (int i = 0; i < 100; ++i) {
  RunTest(Hash_First,  list_of_lines, number_lines, name_of_result_files[0]);
  RunTest(Hash_Sum,    list_of_lines, number_lines, name_of_result_files[1]);
  RunTest(Hash_Len,    list_of_lines, number_lines, name_of_result_files[2]);
  RunTest(Hash_Rotate, list_of_lines, number_lines, name_of_result_files[3]);
  RunTest(Hash_GNU,    list_of_lines, number_lines, name_of_result_files[4]);
  RunTest(Hash_Rot13,  list_of_lines, number_lines, name_of_result_files[5]);
  RunTest(Hash_FAQ6,   list_of_lines, number_lines, name_of_result_files[6]);
  RunTest(Hash_Ly,     list_of_lines, number_lines, name_of_result_files[7]);
  RunTest(Hash_Rs,     list_of_lines, number_lines, name_of_result_files[8]);
}

And from calloc memory:

ListNode* new_node = (ListNode*)calloc(1, sizeof(ListNode));

To fixed memory:

ListNode memory[3330927] = {}; // 3330927 - constant == number of ListPush calls
			       //           was calculated by static variable
...

static int number_calls = 0;

ListNode* new_node = memory + number_calls;
number_calls++;

I inserted these lines in the specific my_calloc function. Don't forget delete all free's! So, the final attempt for profile:

YES!!!! ListPush and HashTablePush!

ListPush pretty easy:

//////////////////////////////// LIST //////////////////////////////
struct ListNode {
  
  Elem_t data;
  ListNode* next;
};

struct List {
  
  ListNode* first_elem;
  size_t number_nodes;
  ListNode* last_elem;
};
////////////////////////////////////////////////////////////////////

int ListPush(List* list, Elem_t value) {
  static int number_calls = 0;
 
  ListNode* new_node = memory + number_calls++;
 
  //init node
  new_node->data = value;
  new_node->next = NULL;
  
  if (list->number_nodes == 0) {
    list->first_elem = list->last_elem = new_node;
  }
  else {
    list->last_elem->next = new_node;
    list->last_elem = new_node;
  }
  list->number_nodes += 1;
  
  return 0;
}

Let's see, assembler version of that function with g++ -s list_pointers.cpp

Don't try to understand all this stuff, just be scared of this code. Pretty ugly, yeah?

_Z8ListPushP4ListPc:
.LFB17:
	.cfi_startproc
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	movq	%rdi, -24(%rbp)
	movq	%rsi, -32(%rbp)
	movl	_ZZ8ListPushP4ListPcE12number_calls(%rip), %eax
	leal	1(%rax), %edx
	movl	%edx, _ZZ8ListPushP4ListPcE12number_calls(%rip)
	cltq
	salq	$4, %rax
	movq	%rax, %rdx
	leaq	memory(%rip), %rax
	addq	%rdx, %rax
	movq	%rax, -8(%rbp)
	movq	-8(%rbp), %rax
	movq	-32(%rbp), %rdx
	movq	%rdx, (%rax)
	movq	-8(%rbp), %rax
	movq	$0, 8(%rax)
	movq	-24(%rbp), %rax
	movq	8(%rax), %rax
	testq	%rax, %rax
	jne	.L11
	movq	-24(%rbp), %rax
	movq	-8(%rbp), %rdx
	movq	%rdx, 16(%rax)
	movq	-24(%rbp), %rax
	movq	16(%rax), %rdx
	movq	-24(%rbp), %rax
	movq	%rdx, (%rax)
	jmp	.L12
.L11:
	movq	-24(%rbp), %rax
	movq	16(%rax), %rax
	movq	-8(%rbp), %rdx
	movq	%rdx, 8(%rax)
	movq	-24(%rbp), %rax
	movq	-8(%rbp), %rdx
	movq	%rdx, 16(%rax)
.L12:
	movq	-24(%rbp), %rax
	movq	8(%rax), %rax
	leaq	1(%rax), %rdx
	movq	-24(%rbp), %rax
	movq	%rdx, 8(%rax)
	movl	$0, %eax
	popq	%rbp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc

Extremely slow, because of the stack manipulation, useful operations... But, with g++ -s list_pointers.cpp -O2 I got better code, but still with stack and some light brakers.

New code!

So, what you should do to rewrite code in assembler. My solution:

Make assembler listing for desired file wtih g++ -s list_pointers.cpp -O0
Just copy desired function in new file, list_push.s in my example. It was assembled with mangling. It is okey, because we will write extern, which, by default, means extern "C++", which means C++ names.
Don't forget to write .globl <function_name> in assembler file
In function declaration in the .cpp file, where desired function lived, wrtie: extern ListPush(...);. And delete function definition
Compile: g++ list_pointers.cpp list_push.s -O2
Now, if all works, with http://godbolt.org optimize you .s file

After a hard coding I wrote this function in clear assembler. It has documentation, so you can read it carefully

.globl _Z8ListPushP4ListPc
.type _Z8ListPushP4ListPc, @function
_Z8ListPushP4ListPc:
    # new_node = memory + number_calls++;	
   	movq	_ZZ8ListPushP4ListPcE12number_calls(%rip), %rax
	leal	1(%rax), %edx
	movl	%edx, _ZZ8ListPushP4ListPcE12number_calls(%rip)
	salq	$4, %rax
	leaq	memory(%rip), %rdx
	addq	%rax, %rdx

    # new_node->data = value;
    # new_node->next = NULL;
	movq	%rsi, (%rdx)
	movq	$0, 8(%rdx)    
        
    # if (list->number_nodes == 0) {
	movq	8(%rdi), %rax
	testq	%rax, %rax	
    	jne	.L11
    
    # list->number_nodes += 1;	
	addq    $1, %rax
	movq	%rax, 8(%rdi)

    # list->first_elem = list->last_elem = new_node;
	movq	%rdx, 16(%rdi)
	movq	%rdx, (%rdi)
	
    	xorl	%eax, %eax
	ret
.L11:
    # list->number_nodes += 1;	
	addq    $1, %rax
	movq	%rax, 8(%rdi)

    # list->last_elem->next = new_node;
    # list->last_elem = new_node;    
    	movq	16(%rdi), %rax
	movq	%rdx, 8(%rax)
	movq	%rdx, 16(%rdi)

    	xorl	%eax, %eax
	ret

Deleted all stack manipulation
Deleted some strange expressions:

	movq	-8(%rbp), %rax

Less jmp's
And some small one's: movl $0, %eax -> xorl %eax, %eax

Fabulous. I also rewrote the HashTablePush function Let't check the speed of this function!

As you can see, ListPush and HashTablePush reduced their time. Comparing with old version of the program So we get

Name		Name	Last commit message	Last commit date
Latest commit History 30 Commits
.idea		.idea
HashTable		HashTable
OptimizationPictures		OptimizationPictures
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

First of all

Abstract

Analyzing

New code!

12% optimization for all code

About

Releases

Packages

Languages

petialetia/OptimizationChallenge

Folders and files

Latest commit

History

Repository files navigation

First of all

Abstract

Analyzing

New code!

12% optimization for all code

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages