Back to Technology

x86 Assembly Series Part 23: Practical Projects

February 6, 2026 Wasil Zafar 45 min read

Apply your assembly skills: build a memory allocator, string library functions, XOR encryption, image grayscale conversion, file utilities, and fast mathematical routines.

Table of Contents

  1. String Library
  2. Memory Allocator
  3. Encryption Routines
  4. Image Processing
  5. Math Functions
  6. File I/O Utilities

Project 1: String Library

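Implement optimized assembly versions of common string functions such as strlen, strcpy, strcmp, memcpy, and memset. The listing below covers strlen and memcpy; the remaining functions can be built in the same style.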
; size_t asm_strlen(const char *s)
; Length of a NUL-terminated string, found with REPNE SCASB.
; ABI:    System V AMD64 (relies on DF = 0, as the ABI guarantees on entry)
; Input:  RDI = pointer to null-terminated string
; Output: RAX = string length (terminator not counted)
; Clobbers: RCX, RDX, flags (RDI is restored before returning)
global asm_strlen
asm_strlen:
    mov rdx, rdi            ; Remember where the string starts
    or rcx, -1              ; RCX = all ones: effectively unlimited scan count
    xor eax, eax            ; AL = 0, the byte SCASB compares against
    repne scasb             ; Advance RDI until the NUL has been consumed
    lea rax, [rdi - 1]      ; RDI is one past the NUL; step back onto it
    sub rax, rdx            ; Length = address of NUL - start
    mov rdi, rdx            ; Hand the caller's pointer back untouched
    ret

; void *asm_memcpy(void *dest, const void *src, size_t n)
; Copies n bytes from src to dest: whole qwords first, then the 0-7 byte tail.
; The regions must not overlap (forward copy only).
; ABI:    System V AMD64 (relies on DF = 0, as the ABI guarantees on entry)
; Input:  RDI = dest, RSI = src, RDX = byte count
; Output: RAX = dest (same contract as libc memcpy)
; Clobbers: RCX, RSI, RDI, flags
global asm_memcpy
asm_memcpy:
    mov rax, rdi            ; Return value: REP MOVS advances RDI, so capture dest first
    mov rcx, rdx
    shr rcx, 3              ; Count of 8-byte chunks
    rep movsq               ; Copy 8 bytes at a time
    mov ecx, edx
    and ecx, 7              ; Remaining 0-7 bytes
    rep movsb               ; Copy remaining
    ret
Save & Compile: string_lib.asm

Linux nasm -f elf64 string_lib.asm -o string_lib.o

macOS nasm -f macho64 string_lib.asm -o string_lib.o

Windows nasm -f win64 string_lib.asm -o string_lib.obj

Link with C: gcc main.c string_lib.o -o program

Project 2: Memory Allocator

A bump allocator is the simplest dynamic memory allocator—it just keeps incrementing a pointer. Think of it like a notepad: you keep writing at the next available line, never erasing.

Bump Allocator Concept:

Heap Start                                    Current break
     │                                              │
     ▼                                              ▼
     ┌─────────┬─────────┬─────────┬───────────┐
     │ Alloc 1 │ Alloc 2 │ Alloc 3 │ FREE ...  │
     └─────────┴─────────┴─────────┴───────────┘
                                   ↑
                               next_ptr

Syscall 12 (brk): Set/get program break (end of heap)
; Allocator state. Both values start at 0 and are filled in by alloc_init,
; which stores the program break returned by brk(0) into each of them.
section .data
    heap_start  dq 0        ; Program break recorded by alloc_init; alloc_reset rewinds to it
    heap_ptr    dq 0        ; Address the next bump_alloc call will hand out

section .text

; void alloc_init(void)
; Records the current program break as both the heap base and the next
; allocation address. Call once at program start, before bump_alloc.
; Output: RAX = current program break
; Clobbers: RDI, RCX, R11 (RCX/R11 are overwritten by the syscall instruction)
global alloc_init
alloc_init:
    xor edi, edi            ; brk(0): an invalid target makes the kernel report the current break
    mov eax, 12             ; syscall: brk
    syscall
    mov [rel heap_ptr], rax     ; Next allocation starts at the current break
    mov [rel heap_start], rax   ; Remember the base so alloc_reset can rewind
    ret

; void* bump_alloc(size_t size)
; Carves `size` bytes (rounded up to a multiple of 16) off the end of the heap
; by moving the program break with the brk syscall.
; ABI:    System V AMD64, Linux syscalls
; Input:  RDI = size in bytes
; Output: RAX = pointer to allocated memory, or 0 on failure
;         Failure cases: size == 0, size arithmetic overflows, alloc_init was
;         never called (heap_ptr still 0), or the kernel refuses to grow the heap.
; Clobbers: RCX, RDX, RSI, RDI, R11, flags
; NOTE: assumes this allocator is the only code moving the program break;
;       sharing brk with another allocator is not handled here.
global bump_alloc
bump_alloc:
    ; Round size up to a multiple of 16, rejecting wrap-around
    add rdi, 15
    jc .failure             ; size + 15 overflowed 64 bits
    and rdi, ~15
    jz .failure             ; Zero-byte request: nothing to allocate

    ; Calculate the requested new break
    mov rsi, [rel heap_ptr] ; RSI = start of the new block (return value)
    test rsi, rsi
    jz .failure             ; alloc_init has not run yet
    add rdi, rsi            ; RDI = requested break = heap_ptr + aligned size
    jc .failure             ; Address computation wrapped
    mov rdx, rdi            ; Keep the request; syscall only clobbers RCX and R11

    ; Expand heap with brk syscall
    mov eax, 12             ; syscall: brk
    syscall

    ; On failure the kernel returns the unchanged break, which is below the
    ; request. Addresses are unsigned, so use an unsigned comparison.
    cmp rax, rdx
    jb .failure

    ; Commit the new allocation pointer and return the block start
    mov [rel heap_ptr], rdx
    mov rax, rsi
    ret

.failure:
    xor eax, eax            ; Return NULL
    ret

; void alloc_reset(void)
; Frees every allocation at once: rewinds the allocation pointer to the base
; recorded by alloc_init and asks the kernel to move the break back there.
; Output: RAX = value returned by the brk syscall
; Clobbers: RDI, RCX, R11 (RCX/R11 are overwritten by the syscall instruction)
global alloc_reset
alloc_reset:
    mov rdi, [rel heap_start]   ; brk argument: original heap base
    mov [rel heap_ptr], rdi     ; Next allocation starts from the base again
    mov eax, 12                 ; syscall: brk
    syscall                     ; Shrink heap back
    ret
Save & Compile: allocator.asm

Linux (uses brk syscall — Linux-specific)

nasm -f elf64 allocator.asm -o allocator.o
gcc main.c allocator.o -o allocator_demo

macOS (replace brk with mmap syscall 0x20000C5)

nasm -f macho64 allocator.asm -o allocator.o

Windows (use VirtualAlloc API instead of brk)

nasm -f win64 allocator.asm -o allocator.obj
Limitation: Bump allocators can't free individual allocations. Use alloc_reset() to free everything at once—perfect for arena/pool-style memory management.

Project 3: XOR Encryption

; void xor_cipher(uint8_t *buffer, size_t length, uint8_t key)
; XORs every byte of the buffer with a one-byte key, in place.
; Applying it twice with the same key restores the original data.
; ABI:    System V AMD64
; Input:  RDI = buffer, RSI = length, RDX = key (single byte, taken from DL)
; Clobbers: RSI, RDI, flags
global xor_cipher
xor_cipher:
    lea rdi, [rdi + rsi]    ; RDI = one past the last byte
    neg rsi                 ; RSI = -length; counts up towards zero
    jz .finished            ; Empty buffer: nothing to do
.next_byte:
    xor [rdi + rsi], dl     ; XOR byte with key
    inc rsi
    jnz .next_byte          ; Index reaches 0 exactly at the end of the buffer
.finished:
    ret
Save & Compile: xor_cipher.asm

Linux nasm -f elf64 xor_cipher.asm -o xor_cipher.o

macOS nasm -f macho64 xor_cipher.asm -o xor_cipher.o

Windows nasm -f win64 xor_cipher.asm -o xor_cipher.obj

Link with C: gcc main.c xor_cipher.o -o cipher_demo

Project 4: Image Grayscale

Convert RGB images to grayscale using SIMD. The standard formula is: Gray = 0.299*R + 0.587*G + 0.114*B

RGB Pixel Format (24-bit):
┌─────┬─────┬─────┐
│  R  │  G  │  B  │  x 4 pixels = 12 bytes
└─────┴─────┴─────┘

RGBA Pixel Format (32-bit) - easier for SIMD:
┌─────┬─────┬─────┬─────┐
│  R  │  G  │  B  │  A  │  x 4 pixels = 16 bytes = 1 XMM
└─────┴─────┴─────┴─────┘
; Fixed-point luma weights (value * 256). They sum to exactly 256, so a pure
; white pixel stays 255 and the 16-bit intermediate sum never overflows
; (255 * 256 = 65280).
GRAY_WEIGHT_R equ 77        ; 0.299 * 256 ≈ 77
GRAY_WEIGHT_G equ 150       ; 0.587 * 256 ≈ 150
GRAY_WEIGHT_B equ 29        ; 0.114 * 256 ≈ 29

section .rodata
align 16
    ; One 32-bit lane per pixel. After masking/shifting, the channel value
    ; sits in the low 16-bit word of its lane, so the weight must be a 16-bit
    ; word in that same position for PMULLW (upper word stays 0).
    coeff_r: times 4 dd GRAY_WEIGHT_R
    coeff_g: times 4 dd GRAY_WEIGHT_G
    coeff_b: times 4 dd GRAY_WEIGHT_B

    ; Per-channel byte masks for little-endian RGBA dwords (R is the low byte)
    mask_r: times 4 dd 0x000000FF
    mask_g: times 4 dd 0x0000FF00
    mask_b: times 4 dd 0x00FF0000
    mask_a: times 4 dd 0xFF000000

section .text

; void grayscale_sse(uint8_t* pixels, int count)
; Converts RGBA pixels to grayscale in place using
;   gray = (R*77 + G*150 + B*29) >> 8
; Four pixels are processed per SSE2 iteration; the last count % 4 pixels are
; handled by a scalar loop that uses the same formula.
; ABI:    System V AMD64
; Input:  RDI = RGBA pixel array (no alignment required)
;         ESI = pixel count (C int; values <= 0 do nothing)
; Output: Modifies pixels in-place (R=G=B=gray, A unchanged)
; Clobbers: RAX, RCX, RDX, RSI, RDI, XMM0-XMM6, flags
global grayscale_sse
grayscale_sse:
    movsxd rsi, esi         ; count is an int: upper half of RSI is undefined
    test rsi, rsi
    jle .done               ; Nothing to do for zero or negative counts

    mov edx, esi
    and edx, 3              ; EDX = leftover pixels for the scalar tail (0-3)
    shr rsi, 2              ; RSI = number of 4-pixel groups
    jz .tail

    movdqa xmm4, [rel coeff_r]
    movdqa xmm5, [rel coeff_g]
    movdqa xmm6, [rel coeff_b]

.loop:
    ; Load 4 RGBA pixels (16 bytes)
    movdqu xmm0, [rdi]      ; [R0 G0 B0 A0 | R1 G1 B1 A1 | ...]

    ; Isolate each channel in its own register
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    movdqa xmm3, xmm0
    pand xmm1, [rel mask_r] ; Keep R bytes
    pand xmm2, [rel mask_g] ; Keep G bytes
    pand xmm3, [rel mask_b] ; Keep B bytes

    ; Move G and B down to the low byte of each lane
    psrld xmm2, 8
    psrld xmm3, 16

    ; Weight each channel (16-bit multiply; products fit in 16 bits)
    pmullw xmm1, xmm4       ; R * 77
    pmullw xmm2, xmm5       ; G * 150
    pmullw xmm3, xmm6       ; B * 29

    ; Sum and divide by 256 (shift right 8)
    paddw xmm1, xmm2
    paddw xmm1, xmm3
    psrlw xmm1, 8           ; gray = (R*77 + G*150 + B*29) / 256

    ; Replicate gray into the R, G and B byte of each lane
    movdqa xmm2, xmm1
    pslld xmm1, 8           ; gray << 8 (to G position)
    por xmm2, xmm1
    pslld xmm1, 8           ; gray << 16 (to B position)
    por xmm2, xmm1
    pand xmm0, [rel mask_a] ; Keep original alpha
    por xmm2, xmm0          ; Combine gray RGB + original A

    movdqu [rdi], xmm2      ; Store result

    add rdi, 16
    dec rsi
    jnz .loop

.tail:
    test edx, edx
    jz .done

.tail_loop:
    ; Scalar version of the same fixed-point formula, one pixel at a time
    movzx eax, byte [rdi]           ; R
    imul eax, eax, GRAY_WEIGHT_R
    movzx ecx, byte [rdi + 1]       ; G
    imul ecx, ecx, GRAY_WEIGHT_G
    add eax, ecx
    movzx ecx, byte [rdi + 2]       ; B
    imul ecx, ecx, GRAY_WEIGHT_B
    add eax, ecx
    shr eax, 8                      ; gray in AL
    mov [rdi], al
    mov [rdi + 1], al
    mov [rdi + 2], al               ; Alpha byte at [rdi + 3] is left alone
    add rdi, 4
    dec edx
    jnz .tail_loop

.done:
    ret
Save & Compile: grayscale_sse.asm

Linux nasm -f elf64 grayscale_sse.asm -o grayscale_sse.o

macOS nasm -f macho64 grayscale_sse.asm -o grayscale_sse.o

Windows nasm -f win64 grayscale_sse.asm -o grayscale_sse.obj

Link with C: gcc main.c grayscale_sse.o -o grayscale_demo

Project 5: Math Functions

Fast Integer Square Root

Newton-Raphson method for integer square root—useful in graphics and game dev where floating point is too slow.

; uint32_t isqrt(uint32_t n)
; Integer square root by Newton-Raphson iteration.
; ABI:    System V AMD64
; Input:  EDI = n
; Output: EAX = floor(sqrt(n))
; Clobbers: ECX, EDX, ESI, flags
;
; Method: start from a power of two that is >= sqrt(n) and iterate
;   y = (x + n/x) / 2
; The sequence decreases until it reaches floor(sqrt(n)); the first time
; y >= x, x is the answer.
global isqrt
isqrt:
    test edi, edi
    jz .zero                ; sqrt(0) = 0 (also avoids BSR on zero)

    ; Initial guess: 2^(floor(b/2) + 1), where b = index of the highest set bit.
    ; n < 2^(b+1), so this guess is never below sqrt(n).
    bsr ecx, edi            ; ECX = b (0..31)
    shr ecx, 1
    inc ecx                 ; ECX = floor(b/2) + 1 (1..16)
    mov esi, 1
    shl esi, cl             ; ESI = x (current estimate)

.newton_loop:
    ; y = (x + n/x) / 2
    mov eax, edi
    xor edx, edx            ; DIV divides EDX:EAX, so the high half must be 0
    div esi                 ; EAX = n / x
    add eax, esi            ; x + n/x (cannot overflow: x stays >= floor(sqrt(n)))
    shr eax, 1              ; EAX = y

    ; Converged once the estimate stops decreasing
    cmp eax, esi
    jae .converged
    mov esi, eax            ; x = y
    jmp .newton_loop

.converged:
    mov eax, esi
    ret

.zero:
    xor eax, eax
    ret

; uint32_t isqrt_bit(uint32_t n)
; Integer square root, digit-by-digit (binary) method: no division needed.
; Input:  EDI = n
; Output: EAX = floor(sqrt(n))
; Clobbers: ECX, EDX, EDI, flags
global isqrt_bit
isqrt_bit:
    xor eax, eax            ; root = 0
    mov ecx, 0x40000000     ; bit = 2^30, the largest power of 4 that fits in 32 bits

.next_bit:
    lea edx, [rax + rcx]    ; trial = root + bit
    shr eax, 1              ; root /= 2 happens on both paths
    cmp edi, edx
    jb .advance             ; n < trial: this result bit stays 0

    sub edi, edx            ; n -= trial
    add eax, ecx            ; root = root/2 + bit

.advance:
    shr ecx, 2              ; bit /= 4
    jnz .next_bit           ; Stop once bit has been shifted out

    ret
Save & Compile: isqrt.asm

Linux nasm -f elf64 isqrt.asm -o isqrt.o

macOS nasm -f macho64 isqrt.asm -o isqrt.o

Windows nasm -f win64 isqrt.asm -o isqrt.obj

Link with C: gcc main.c isqrt.o -o isqrt_demo

Factorial

; uint64_t factorial(uint32_t n)
; Iterative factorial.
; ABI:    System V AMD64
; Input:  EDI = n (max 20 for 64-bit result)
; Output: RAX = n!, or all ones (UINT64_MAX) if n > 20
; Clobbers: RCX, flags
global factorial
factorial:
    cmp edi, 20
    ja .overflow            ; 21! > 2^64

    mov ecx, edi            ; Zero-extend n: only EDI is defined for a uint32_t argument
    mov eax, 1              ; result = 1
    test ecx, ecx
    jz .done                ; 0! = 1

.loop:
    imul rax, rcx           ; result *= n
    dec ecx                 ; 32-bit write keeps the upper half of RCX zero
    jnz .loop

.done:
    ret

.overflow:
    mov rax, -1             ; Return max value on overflow
    ret

; Lookup table version (fastest for small n)
; The table must hold every index the bounds check below admits (0..20).
section .rodata
align 8
factorial_table:
    dq 1                    ; 0!
    dq 1                    ; 1!
    dq 2                    ; 2!
    dq 6                    ; 3!
    dq 24                   ; 4!
    dq 120                  ; 5!
    dq 720                  ; 6!
    dq 5040                 ; 7!
    dq 40320                ; 8!
    dq 362880               ; 9!
    dq 3628800              ; 10!
    dq 39916800             ; 11!
    dq 479001600            ; 12!
    dq 6227020800           ; 13!
    dq 87178291200          ; 14!
    dq 1307674368000        ; 15!
    dq 20922789888000       ; 16!
    dq 355687428096000      ; 17!
    dq 6402373705728000     ; 18!
    dq 121645100408832000   ; 19!
    dq 2432902008176640000  ; 20!

section .text

; uint64_t factorial_lookup(uint32_t n)
; ABI:    System V AMD64
; Input:  EDI = n
; Output: RAX = n!, or all ones (UINT64_MAX) if n > 20
; Clobbers: RDI, flags
global factorial_lookup
factorial_lookup:
    cmp edi, 20
    ja .overflow
    mov edi, edi            ; Zero-extend the index: only EDI is defined for a uint32_t
    ; RIP-relative addressing cannot take an index register, so load the
    ; table base first and index from it.
    lea rax, [rel factorial_table]
    mov rax, [rax + rdi*8]
    ret
.overflow:
    mov rax, -1
    ret
Save & Compile: factorial.asm

Linux nasm -f elf64 factorial.asm -o factorial.o

macOS nasm -f macho64 factorial.asm -o factorial.o

Windows nasm -f win64 factorial.asm -o factorial.obj

Link with C: gcc main.c factorial.o -o factorial_demo

Project 6: File I/O

Direct file operations using Linux syscalls—no libc required. Essential for standalone programs and understanding how file I/O really works.

Syscall Reference

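| Syscall | RAX | RDI      | RSI    | RDX    | Returns       |
|---------|-----|----------|--------|--------|---------------|
| open    | 2   | filename | flags  | mode   | fd or -errno  |
| read    | 0   | fd       | buffer | count  | bytes read    |
| write   | 1   | fd       | buffer | count  | bytes written |
| close   | 3   | fd       | -      | -      | 0 or -errno   |
| lseek   | 8   | fd       | offset | whence | new position  |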
; Sample data for the file I/O routines. filename, write_msg/write_len and fd
; are not referenced by the routines shown in this listing; buffer is the
; scratch area cat_file reads into.
section .data
    ; NUL-terminated path, in the form the open syscall expects
    filename db "test.txt", 0
    ; Message ends with a newline (10) and has no NUL terminator
    write_msg db "Hello from assembly!", 10
    ; Assemble-time length: current position minus the start of write_msg
    write_len equ $ - write_msg

section .bss
    buffer resb 4096        ; 4KB read buffer
    fd resq 1               ; File descriptor

section .text

; Open flags (from fcntl.h)
; Passed in RSI to open_file. One access mode (O_RDONLY / O_WRONLY / O_RDWR)
; is combined with the option bits using bitwise OR,
; e.g. O_WRONLY | O_CREAT | O_TRUNC.
O_RDONLY    equ 0
O_WRONLY    equ 1
O_RDWR      equ 2
O_CREAT     equ 0x40
O_TRUNC     equ 0x200
O_APPEND    equ 0x400

; int open_file(const char* path, int flags, int mode)
; Thin wrapper over the Linux open syscall. The C arguments already sit in
; the registers the syscall reads (RDI = path, RSI = flags, RDX = mode),
; so only the syscall number has to be loaded.
; Returns: fd on success, negative on error
; Clobbers: RCX, R11 (overwritten by the syscall instruction)
global open_file
open_file:
    mov eax, 2              ; syscall: open
    syscall
    ret

; ssize_t read_file(int fd, void* buf, size_t count)
; Thin wrapper over the Linux read syscall; RDI/RSI/RDX pass straight through.
; Returns: RAX = value returned by the syscall
; Clobbers: RCX, R11 (overwritten by the syscall instruction)
global read_file
read_file:
    xor eax, eax            ; syscall: read (number 0)
    syscall
    ret

; ssize_t write_file(int fd, const void* buf, size_t count)
; Thin wrapper over the Linux write syscall; RDI/RSI/RDX pass straight through.
; Returns: RAX = value returned by the syscall
; Clobbers: RCX, R11 (overwritten by the syscall instruction)
global write_file
write_file:
    mov eax, 1              ; syscall: write
    syscall
    ret

; int close_file(int fd)
; Thin wrapper over the Linux close syscall; RDI passes straight through.
; Returns: RAX = value returned by the syscall
; Clobbers: RCX, R11 (overwritten by the syscall instruction)
global close_file
close_file:
    mov eax, 3              ; syscall: close
    syscall
    ret

; Complete example: Copy file to stdout
; int cat_file(const char* path)
; Opens the file read-only, streams it to stdout through the 4096-byte
; `buffer`, and closes it again.
; ABI:    System V AMD64
; Input:  RDI = filename (NUL-terminated)
; Output: EAX = 0 on success, -1 if open, read or write fails
; Stack frame (RSP stays 16-byte aligned at every call):
;   [rbp-8]  = file descriptor
;   [rbp-16] = bytes of the current chunk still to be written
;   [rbp-24] = address of the next byte to write
; The descriptor is closed on every path once the open has succeeded.
global cat_file
cat_file:
    push rbp
    mov rbp, rsp
    sub rsp, 32

    ; Open file for reading
    mov esi, O_RDONLY
    xor edx, edx            ; mode unused for reading
    call open_file
    test rax, rax
    js .open_failed         ; Negative = -errno; nothing to close yet
    mov [rbp-8], rax        ; Save fd

.read_loop:
    ; Read chunk
    mov rdi, [rbp-8]        ; fd
    lea rsi, [rel buffer]   ; buffer
    mov edx, 4096           ; count = size of buffer
    call read_file

    test rax, rax
    jz .eof                 ; 0 bytes = end of file
    js .io_error            ; Negative = read error

    mov [rbp-16], rax       ; Bytes read = bytes still to write
    lea rcx, [rel buffer]
    mov [rbp-24], rcx       ; Writing starts at the beginning of the buffer

.write_loop:
    ; write may accept fewer bytes than requested, so keep going until the
    ; whole chunk is out
    mov edi, 1              ; fd = stdout
    mov rsi, [rbp-24]
    mov rdx, [rbp-16]
    call write_file

    test rax, rax
    jle .io_error           ; Negative = error; 0 = no progress, avoid spinning
    add [rbp-24], rax       ; Skip past what was written
    sub [rbp-16], rax
    jnz .write_loop

    jmp .read_loop

.eof:
    ; Close file
    mov rdi, [rbp-8]
    call close_file
    xor eax, eax            ; Return 0 (success)
    jmp .exit

.io_error:
    ; Release the descriptor before reporting the failure
    mov rdi, [rbp-8]
    call close_file

.open_failed:
    mov eax, -1             ; Return -1 (error)

.exit:
    leave
    ret
Save & Compile: file_io.asm

Linux (uses Linux syscalls for open/read/write/close)

nasm -f elf64 file_io.asm -o file_io.o
gcc main.c file_io.o -o file_io_demo

macOS (BSD syscall numbers differ from Linux — e.g. read=3, write=4, open=5, close=6 — and each must be combined with the 0x2000000 class prefix)

nasm -f macho64 file_io.asm -o file_io.o

Windows (use CreateFile/ReadFile/WriteFile API)

nasm -f win64 file_io.asm -o file_io.obj
Exercise

Build a File Copy Utility

Combine these functions to create cp_asm src dest:

  1. Open source file for reading
  2. Create/truncate destination file (O_WRONLY | O_CREAT | O_TRUNC, mode 0644)
  3. Loop: read chunk, write chunk until EOF
  4. Close both files