Project 1: String Library
Implement optimized assembly versions of common string functions (strlen, strcpy, strcmp, memcpy, memset). The strlen and memcpy implementations are shown below.
;-----------------------------------------------------------------------
; size_t asm_strlen(const char *s)
; ABI:   SysV AMD64
; In:    RDI = pointer to a null-terminated string
; Out:   RAX = number of bytes before the terminator
; Clobb: RCX, RDX, flags (RDI is restored before returning)
; Note:  REPNE SCASB scans upward, relying on the direction flag being
;        clear at function entry as the ABI requires.
;-----------------------------------------------------------------------
global asm_strlen
asm_strlen:
        mov     rdx, rdi                ; remember the start so RDI can be restored
        mov     rcx, -1                 ; scan budget: effectively unlimited
        xor     eax, eax                ; AL = 0, the byte we are looking for
        repne scasb                     ; RCX drops by (len + 1): one per byte incl. the null
        not     rcx                     ; ~(-1 - (len + 1)) = len + 1
        lea     rax, [rcx - 1]          ; drop the terminator from the count
        mov     rdi, rdx
        ret
;-----------------------------------------------------------------------
; void *asm_memcpy(void *dest, const void *src, size_t n)
; ABI:   SysV AMD64
; In:    RDI = dest, RSI = src, RDX = byte count
; Out:   RAX = dest (same contract as C memcpy, so the routine can be
;        declared with the standard prototype)
; Clobb: RCX, RSI, RDI, flags
; Note:  Copies forward, so overlapping regions with dest > src are not
;        supported (memcpy semantics, not memmove). Relies on the
;        direction flag being clear, as the ABI guarantees at entry.
;-----------------------------------------------------------------------
global asm_memcpy
asm_memcpy:
        mov     rax, rdi                ; return value: original dest (REP MOVS advances RDI)
        mov     rcx, rdx
        shr     rcx, 3                  ; number of whole 8-byte chunks
        rep movsq                       ; bulk copy, 8 bytes per step
        mov     ecx, edx
        and     ecx, 7                  ; 0..7 trailing bytes (32-bit write zero-extends RCX)
        rep movsb                       ; copy the tail
        ret
Save & Compile: string_lib.asm
Linux nasm -f elf64 string_lib.asm -o string_lib.o
macOS nasm -f macho64 string_lib.asm -o string_lib.o
Windows nasm -f win64 string_lib.asm -o string_lib.obj
Link with C: gcc main.c string_lib.o -o program
Project 2: Memory Allocator
A bump allocator is the simplest dynamic memory allocator—it just keeps incrementing a pointer. Think of it like a notepad: you keep writing at the next available line, never erasing.
Bump Allocator Concept:
Heap Start Current break
│ │
▼ ▼
┌─────────┬─────────┬─────────┬───────────┐
│ Alloc 1 │ Alloc 2 │ Alloc 3 │ FREE ... │
└─────────┴─────────┴─────────┴───────────┘
↑
next_ptr
Syscall 12 (brk): Set/get program break (end of heap)
section .data
; Allocator state. Both start at 0 and are filled in by alloc_init.
heap_start:     dq      0               ; program break recorded by alloc_init
heap_ptr:       dq      0               ; address the next allocation will start at
section .text
%define SYS_BRK 12                      ; Linux x86-64 syscall number for brk

;-----------------------------------------------------------------------
; alloc_init
; Records the current program break as both the heap start and the
; next-allocation pointer. Call once before the first bump_alloc.
; In:    nothing
; Out:   RAX = value returned by brk(0), i.e. the current break
; Clobb: RDI, RCX, R11 (the last two are clobbered by SYSCALL)
;-----------------------------------------------------------------------
global alloc_init
alloc_init:
        xor     edi, edi                ; brk(0) cannot succeed, so it just reports the break
        mov     eax, SYS_BRK
        syscall
        mov     [rel heap_ptr], rax     ; next allocation starts at the break
        mov     [rel heap_start], rax   ; kept so alloc_reset can rewind
        ret
%define SYS_BRK 12                      ; Linux x86-64 syscall number for brk

;-----------------------------------------------------------------------
; void *bump_alloc(size_t size)
; ABI:   SysV AMD64, Linux brk syscall
; In:    RDI = requested size in bytes
; Out:   RAX = start of the new block, or 0 (NULL) on failure
; Clobb: RCX, RSI, RDI, R11, flags
;
; The size is rounded up to a multiple of 16. The call fails when:
;   - the rounded size is 0 (a zero-byte request yields NULL),
;   - rounding the size or adding it to heap_ptr wraps around, or
;   - the kernel does not move the break to the requested address.
; heap_ptr is only updated on success.
;-----------------------------------------------------------------------
global bump_alloc
bump_alloc:
        ; Round the size up to a multiple of 16, rejecting wrap-around.
        add     rdi, 15
        jc      .failure                ; size + 15 overflowed 64 bits
        and     rdi, ~15
        jz      .failure                ; nothing to allocate

        mov     rsi, [rel heap_ptr]     ; RSI = block start (the return value)
        add     rdi, rsi                ; RDI = requested new break
        jc      .failure                ; address computation wrapped

        mov     eax, SYS_BRK
        syscall                         ; preserves RDI/RSI; clobbers RCX, R11

        ; The raw syscall returns the new break on success and the
        ; unchanged current break on failure. Compare against the address
        ; we asked for rather than the old heap_ptr: if the real break is
        ; above heap_ptr, a failed call would otherwise look like success.
        cmp     rax, rdi
        jne     .failure

        mov     [rel heap_ptr], rdi     ; commit the bump
        mov     rax, rsi
        ret

.failure:
        xor     eax, eax                ; NULL
        ret
%define SYS_BRK 12                      ; Linux x86-64 syscall number for brk

;-----------------------------------------------------------------------
; void alloc_reset(void)
; Rewinds the allocator: heap_ptr is set back to heap_start and the
; program break is moved back to that address, releasing every block
; handed out so far.
; Out:   RAX = value returned by the brk syscall
; Clobb: RDI, RCX, R11
;-----------------------------------------------------------------------
global alloc_reset
alloc_reset:
        mov     rdi, [rel heap_start]   ; target break = where the heap began
        mov     [rel heap_ptr], rdi     ; next allocation starts from the beginning again
        mov     eax, SYS_BRK
        syscall                         ; shrink the heap back
        ret
Save & Compile: allocator.asm
Linux (uses brk syscall — Linux-specific)
nasm -f elf64 allocator.asm -o allocator.o
gcc main.c allocator.o -o allocator_demo
macOS (replace brk with mmap syscall 0x20000C5)
nasm -f macho64 allocator.asm -o allocator.o
Windows (use VirtualAlloc API instead of brk)
nasm -f win64 allocator.asm -o allocator.obj
Limitation: Bump allocators can't free individual allocations. Use alloc_reset() to free everything at once—perfect for arena/pool-style memory management.
Project 3: XOR Encryption
;-----------------------------------------------------------------------
; void xor_cipher(uint8_t *buffer, size_t length, uint8_t key)
; ABI:   SysV AMD64
; In:    RDI = buffer, RSI = length in bytes, DL = key byte
; Out:   buffer is XORed in place with the key; applying the same key
;        twice restores the original bytes
; Clobb: RSI, RDI, flags
;-----------------------------------------------------------------------
global xor_cipher
xor_cipher:
        test    rsi, rsi
        jz      .done                   ; empty buffer: nothing to do
        add     rsi, rdi                ; RSI = one past the last byte
.next_byte:
        xor     byte [rdi], dl          ; flip the bits selected by the key
        add     rdi, 1
        cmp     rdi, rsi
        jne     .next_byte
.done:
        ret
Save & Compile: xor_cipher.asm
Linux nasm -f elf64 xor_cipher.asm -o xor_cipher.o
macOS nasm -f macho64 xor_cipher.asm -o xor_cipher.o
Windows nasm -f win64 xor_cipher.asm -o xor_cipher.obj
Link with C: gcc main.c xor_cipher.o -o cipher_demo
Project 4: Image Grayscale
Convert RGB images to grayscale using SIMD. The standard formula is: Gray = 0.299*R + 0.587*G + 0.114*B
RGB Pixel Format (24-bit):
┌─────┬─────┬─────┐
│ R │ G │ B │ x 4 pixels = 12 bytes
└─────┴─────┴─────┘
RGBA Pixel Format (32-bit) - easier for SIMD:
┌─────┬─────┬─────┬─────┐
│ R │ G │ B │ A │ x 4 pixels = 16 bytes = 1 XMM
└─────┴─────┴─────┴─────┘
section .rodata
align 16
; Grayscale weights in 8.8 fixed point (weight * 256). They are stored as
; 16-bit words because PMULLW multiplies 16-bit lanes; byte-sized entries
; would be read in pairs and give 77*257 etc. instead of 77.
; 77 + 150 + 29 = 256, so the weighted sum of three bytes fits in 16 bits.
coeff_r: times 8 dw 77                  ; 0.299 * 256 ~= 77
coeff_g: times 8 dw 150                 ; 0.587 * 256 ~= 150
coeff_b: times 8 dw 29                  ; 0.114 * 256 ~= 29

section .text
;-----------------------------------------------------------------------
; void grayscale_sse(uint8_t *pixels, int count)
; ABI:   SysV AMD64
; In:    RDI = RGBA pixel array (4 bytes per pixel, any alignment)
;        ESI = pixel count (only the low 32 bits are defined for an int
;              argument; values <= 0 are treated as "nothing to do")
; Out:   pixels modified in place: R = G = B = gray, A unchanged
;        gray = (77*R + 150*G + 29*B) >> 8
; Clobb: RAX, RCX, RDX, RSI, RDI, XMM0-XMM6, flags
;
; Four pixels are converted per SSE2 iteration; the remaining
; count % 4 pixels are handled by a scalar loop using the same formula.
;-----------------------------------------------------------------------
global grayscale_sse
grayscale_sse:
        test    esi, esi
        jle     .done                   ; zero or negative count
        mov     ecx, esi
        and     ecx, 3                  ; ECX = leftover pixels for the scalar tail
        shr     esi, 2                  ; ESI = number of 4-pixel groups
        jz      .tail

        movdqa  xmm4, [rel coeff_r]
        movdqa  xmm5, [rel coeff_g]
        movdqa  xmm6, [rel coeff_b]

.loop:
        ; Load 4 RGBA pixels; each 32-bit lane is A:B:G:R from high to low byte.
        movdqu  xmm0, [rdi]

        ; Isolate each channel and bring it down to the low byte of its lane.
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm0
        pand    xmm1, [rel mask_r]      ; R already sits in the low byte
        pand    xmm2, [rel mask_g]
        pand    xmm3, [rel mask_b]
        psrld   xmm2, 8                 ; G -> low byte
        psrld   xmm3, 16                ; B -> low byte

        ; Weighted sum in the low 16-bit word of every lane (high word stays 0).
        pmullw  xmm1, xmm4              ; R * 77
        pmullw  xmm2, xmm5              ; G * 150
        pmullw  xmm3, xmm6              ; B * 29
        paddw   xmm1, xmm2
        paddw   xmm1, xmm3
        psrlw   xmm1, 8                 ; gray = sum / 256, now 0..255 per lane

        ; Replicate gray into the R, G and B byte positions.
        movdqa  xmm2, xmm1              ; R position
        pslld   xmm1, 8
        por     xmm2, xmm1              ; + G position
        pslld   xmm1, 8
        por     xmm2, xmm1              ; + B position

        pand    xmm0, [rel mask_a]      ; keep the original alpha
        por     xmm2, xmm0
        movdqu  [rdi], xmm2

        add     rdi, 16
        dec     esi
        jnz     .loop

.tail:
        test    ecx, ecx
        jz      .done
.tail_loop:
        ; Scalar version of the same fixed-point formula, one pixel at a time.
        movzx   eax, byte [rdi]         ; R
        imul    eax, eax, 77
        movzx   edx, byte [rdi + 1]     ; G
        imul    edx, edx, 150
        add     eax, edx
        movzx   edx, byte [rdi + 2]     ; B
        imul    edx, edx, 29
        add     eax, edx
        shr     eax, 8                  ; gray
        mov     [rdi], al
        mov     [rdi + 1], al
        mov     [rdi + 2], al           ; byte 3 (alpha) is left untouched
        add     rdi, 4
        dec     ecx
        jnz     .tail_loop

.done:
        ret
section .rodata
align 16
; One byte-select mask per channel, repeated for each of the four 32-bit
; pixel lanes in an XMM register (R is the lowest byte of a lane, A the highest).
mask_r:         times 4 dd 0x000000FF   ; red
mask_g:         times 4 dd 0x0000FF00   ; green
mask_b:         times 4 dd 0x00FF0000   ; blue
mask_a:         times 4 dd 0xFF000000   ; alpha
Save & Compile: grayscale_sse.asm
Linux nasm -f elf64 grayscale_sse.asm -o grayscale_sse.o
macOS nasm -f macho64 grayscale_sse.asm -o grayscale_sse.o
Windows nasm -f win64 grayscale_sse.asm -o grayscale_sse.obj
Link with C: gcc main.c grayscale_sse.o -o grayscale_demo
Project 5: Math Functions
Fast Integer Square Root
Newton-Raphson method for integer square root—useful in graphics and game dev where floating point is too slow.
;-----------------------------------------------------------------------
; uint32_t isqrt(uint32_t n)
; ABI:   SysV AMD64
; In:    EDI = n
; Out:   EAX = floor(sqrt(n))
; Clobb: ECX, EDX, ESI, flags
;
; Integer Newton iteration x' = (x + n/x) / 2.
; The starting guess is 2^(floor(b/2) + 1), where b is the index of the
; highest set bit of n. Since n < 2^(b+1), that guess is strictly greater
; than sqrt(n). From such a start the sequence decreases monotonically
; until it reaches floor(sqrt(n)); the first step that fails to decrease
; marks the answer.
;-----------------------------------------------------------------------
global isqrt
isqrt:
        test    edi, edi
        jz      .zero                   ; sqrt(0) = 0; also avoids BSR on zero

        bsr     ecx, edi                ; ECX = index of the highest set bit
        shr     ecx, 1
        inc     ecx                     ; exponent of the initial guess
        mov     eax, 1
        shl     eax, cl                 ; EAX = x0 > sqrt(n), at most 2^16

.newton_loop:
        mov     esi, eax                ; ESI = current estimate x
        mov     eax, edi
        xor     edx, edx                ; DIV divides EDX:EAX, so clear the high half
        div     esi                     ; EAX = n / x (x is never 0 here)
        add     eax, esi                ; x + n/x; cannot overflow 32 bits since x >= floor(sqrt(n))
        shr     eax, 1                  ; candidate next estimate
        cmp     eax, esi
        jb      .newton_loop            ; still decreasing: keep refining

        mov     eax, esi                ; estimate stopped decreasing: ESI is the floor root
        ret

.zero:
        xor     eax, eax
        ret
;-----------------------------------------------------------------------
; isqrt_bit - integer square root, one result bit per iteration
; In:    EDI = n (consumed: it is reduced as the root is built)
; Out:   EAX = floor(sqrt(n))
; Clobb: ECX, EDX, EDI, flags
;
; ECX walks down the powers of four starting at 2^30. For each one, the
; trial value (result + bit) is subtracted from n if it fits, and the
; corresponding bit is folded into the result.
;-----------------------------------------------------------------------
global isqrt_bit
isqrt_bit:
        xor     eax, eax                ; result so far
        mov     ecx, 1 << 30            ; largest power of four that fits in 32 bits
.bit_loop:
        lea     edx, [rax + rcx]        ; trial = result + bit
        shr     eax, 1                  ; result is halved on both paths
        cmp     edi, edx
        jb      .next                   ; trial too large: this bit stays clear
        sub     edi, edx                ; n -= trial
        add     eax, ecx                ; result = result/2 + bit
.next:
        shr     ecx, 2                  ; move on to the next lower power of four
        jnz     .bit_loop
        ret
Save & Compile: isqrt.asm
Linux nasm -f elf64 isqrt.asm -o isqrt.o
macOS nasm -f macho64 isqrt.asm -o isqrt.o
Windows nasm -f win64 isqrt.asm -o isqrt.obj
Link with C: gcc main.c isqrt.o -o isqrt_demo
Factorial
;-----------------------------------------------------------------------
; uint64_t factorial(uint32_t n)
; ABI:   SysV AMD64
; In:    EDI = n
; Out:   RAX = n! for n <= 20, or 0xFFFFFFFFFFFFFFFF when n > 20
;        (21! does not fit in 64 bits)
; Clobb: RCX, flags
;-----------------------------------------------------------------------
global factorial
factorial:
        cmp     edi, 20
        ja      .overflow               ; unsigned compare: n is uint32_t

        ; Only the low 32 bits of RDI are defined for a 32-bit argument,
        ; so take a zero-extended copy before using it as a 64-bit factor.
        mov     ecx, edi
        mov     eax, 1                  ; result = 1 (also the answer for 0!)
        test    ecx, ecx
        jz      .done
.loop:
        imul    rax, rcx                ; result *= n
        dec     ecx                     ; 32-bit write keeps RCX zero-extended
        jnz     .loop
.done:
        ret

.overflow:
        mov     rax, -1                 ; all-ones sentinel for "does not fit"
        ret
; Lookup table version (fastest for small n)
section .rodata
align 8
; n! for n = 0..20. The table must cover every index admitted by the
; bound check in factorial_lookup.
factorial_table:
        dq 1                            ; 0!
        dq 1                            ; 1!
        dq 2                            ; 2!
        dq 6                            ; 3!
        dq 24                           ; 4!
        dq 120                          ; 5!
        dq 720                          ; 6!
        dq 5040                         ; 7!
        dq 40320                        ; 8!
        dq 362880                       ; 9!
        dq 3628800                      ; 10!
        dq 39916800                     ; 11!
        dq 479001600                    ; 12!
        dq 6227020800                   ; 13!
        dq 87178291200                  ; 14!
        dq 1307674368000                ; 15!
        dq 20922789888000               ; 16!
        dq 355687428096000              ; 17!
        dq 6402373705728000             ; 18!
        dq 121645100408832000           ; 19!
        dq 2432902008176640000          ; 20!

section .text
;-----------------------------------------------------------------------
; uint64_t factorial_lookup(uint32_t n)
; ABI:   SysV AMD64
; In:    EDI = n
; Out:   RAX = n! for n <= 20, or 0xFFFFFFFFFFFFFFFF when n > 20
; Clobb: RDX, flags
;-----------------------------------------------------------------------
global factorial_lookup
factorial_lookup:
        cmp     edi, 20
        ja      .overflow
        mov     eax, edi                ; zero-extend n: upper half of RDI is undefined
        ; RIP-relative addressing cannot carry an index register, so
        ; materialise the table base first and index from that.
        lea     rdx, [rel factorial_table]
        mov     rax, [rdx + rax*8]
        ret
.overflow:
        mov     rax, -1                 ; all-ones sentinel for "does not fit"
        ret
Save & Compile: factorial.asm
Linux nasm -f elf64 factorial.asm -o factorial.o
macOS nasm -f macho64 factorial.asm -o factorial.o
Windows nasm -f win64 factorial.asm -o factorial.obj
Link with C: gcc main.c factorial.o -o factorial_demo
Project 6: File I/O
Direct file operations using Linux syscalls—no libc required. Essential for standalone programs and understanding how file I/O really works.
Syscall Reference
| Syscall | RAX | RDI | RSI | RDX | Returns |
| open | 2 | filename | flags | mode | fd or -errno |
| read | 0 | fd | buffer | count | bytes read |
| write | 1 | fd | buffer | count | bytes written |
| close | 3 | fd | - | - | 0 or -errno |
| lseek | 8 | fd | offset | whence | new position |
section .data
filename:       db      "test.txt", 0                   ; null-terminated path
write_msg:      db      "Hello from assembly!", 10      ; message ending in a newline
write_len       equ     $ - write_msg                   ; byte length of write_msg

section .bss
buffer:         resb    4096                            ; 4 KB scratch buffer for reads
fd:             resq    1                               ; slot for a file descriptor

section .text
; Flags for the open syscall, written in octal as in fcntl.h.
O_RDONLY        equ     0
O_WRONLY        equ     1
O_RDWR          equ     2
O_CREAT         equ     0o100
O_TRUNC         equ     0o1000
O_APPEND        equ     0o2000
%define SYS_OPEN 2                      ; Linux x86-64 syscall number for open

;-----------------------------------------------------------------------
; int open_file(const char *path, int flags, int mode)
; Thin wrapper around the open syscall. The C arguments arrive in
; RDI/RSI/RDX, which are exactly the registers the syscall reads, so
; they are passed through untouched.
; Out:   RAX = file descriptor, or a negative value on error
; Clobb: RCX, R11 (clobbered by SYSCALL)
;-----------------------------------------------------------------------
global open_file
open_file:
        mov     eax, SYS_OPEN
        syscall
        ret
;-----------------------------------------------------------------------
; ssize_t read_file(int fd, void *buf, size_t count)
; Thin wrapper around the read syscall (number 0); RDI/RSI/RDX are
; passed through unchanged.
; Out:   RAX = value returned by the syscall
; Clobb: RCX, R11 (clobbered by SYSCALL), flags
;-----------------------------------------------------------------------
global read_file
read_file:
        xor     eax, eax                ; syscall number 0 = read
        syscall
        ret
%define SYS_WRITE 1                     ; Linux x86-64 syscall number for write

;-----------------------------------------------------------------------
; ssize_t write_file(int fd, const void *buf, size_t count)
; Thin wrapper around the write syscall; RDI/RSI/RDX are passed
; through unchanged.
; Out:   RAX = value returned by the syscall
; Clobb: RCX, R11 (clobbered by SYSCALL)
;-----------------------------------------------------------------------
global write_file
write_file:
        mov     eax, SYS_WRITE
        syscall
        ret
%define SYS_CLOSE 3                     ; Linux x86-64 syscall number for close

;-----------------------------------------------------------------------
; int close_file(int fd)
; Thin wrapper around the close syscall; RDI is passed through unchanged.
; Out:   RAX = value returned by the syscall
; Clobb: RCX, R11 (clobbered by SYSCALL)
;-----------------------------------------------------------------------
global close_file
close_file:
        mov     eax, SYS_CLOSE
        syscall
        ret
;-----------------------------------------------------------------------
; cat_file - copy a file's contents to stdout
; ABI:   SysV AMD64
; In:    RDI = pointer to a null-terminated path
; Out:   EAX = 0 on success, -1 on any open/read/write failure
; Uses:  open_file, read_file, write_file, close_file and the 4096-byte
;        `buffer` defined in this file
;
; Stack frame (RBP-relative):
;   [rbp-8]   file descriptor
;   [rbp-16]  bytes of the current chunk still to be written
;   [rbp-24]  address of the next byte to write
;
; The descriptor is closed on every path after a successful open, and
; short writes are retried until the whole chunk has been written.
;-----------------------------------------------------------------------
global cat_file
cat_file:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; locals; keeps RSP 16-byte aligned at each call

        ; Open the file read-only.
        mov     esi, O_RDONLY
        xor     edx, edx                ; mode is unused without O_CREAT
        call    open_file
        test    rax, rax
        js      .open_failed            ; negative = error, nothing to close
        mov     [rbp-8], rax

.read_loop:
        mov     rdi, [rbp-8]
        lea     rsi, [rel buffer]
        mov     edx, 4096               ; size of `buffer`
        call    read_file
        test    rax, rax
        jz      .eof
        js      .io_error

        ; Write the whole chunk; write may accept fewer bytes than asked.
        mov     [rbp-16], rax           ; bytes remaining
        lea     rcx, [rel buffer]
        mov     [rbp-24], rcx           ; write cursor
.write_loop:
        mov     edi, 1                  ; stdout
        mov     rsi, [rbp-24]
        mov     rdx, [rbp-16]
        call    write_file
        test    rax, rax
        jle     .io_error               ; error, or no progress (avoid spinning forever)
        add     [rbp-24], rax
        sub     [rbp-16], rax
        jnz     .write_loop
        jmp     .read_loop

.eof:
        mov     rdi, [rbp-8]
        call    close_file
        xor     eax, eax                ; success
        jmp     .exit

.io_error:
        mov     rdi, [rbp-8]
        call    close_file              ; do not leak the descriptor on failure
.open_failed:
        mov     eax, -1
.exit:
        leave
        ret
Save & Compile: file_io.asm
Linux (uses Linux syscalls for open/read/write/close)
nasm -f elf64 file_io.asm -o file_io.o
gcc main.c file_io.o -o file_io_demo
macOS (uses different BSD syscall numbers — e.g. read=3, write=4, open=5, close=6 — each with the 0x2000000 class prefix added)
nasm -f macho64 file_io.asm -o file_io.o
Windows (use CreateFile/ReadFile/WriteFile API)
nasm -f win64 file_io.asm -o file_io.obj
Exercise
Build a File Copy Utility
Combine these functions to create cp_asm src dest:
- Open source file for reading
- Create/truncate destination file (O_WRONLY | O_CREAT | O_TRUNC, mode 0644)
- Loop: read chunk, write chunk until EOF
- Close both files
Continue the Series
Part 22: Performance & Optimization
Techniques to make your projects faster.
Read Article
Part 24: Capstone Project
Comprehensive capstone combining all skills.
Read Article
Part 17: C & Assembly Interop
Integrate these projects with C applications.
Read Article