; NASM code for image processing - see ip2.cpp to compile/use
; Written by Matt Mahoney, Dec. 5, 2004
;
; Syntax: NASM (Intel operand order), 32-bit code.
; CPU:    MMX plus pshufw (SSE / extended-MMX integer instruction).
; Calls:  arguments are read from the stack and every routine ends in a
;         plain `ret`, so the caller removes its own arguments.
;         Each routine that takes arguments starts with pusha, which puts
;         the first argument at [esp+ARGS].

global _rdtsc, _quantize, _grayscale, _rotate, _adjust, _horiz, _vert
extern _printf

ARGS    equ     36                      ; 32 bytes of pusha + 4 bytes of return address
MM_SAVE equ     64                      ; bytes needed to spill mm0..mm7

section .text use32 class=CODE

;-----------------------------------------------------------------------
; _rdtsc()
; Out:   eax = low 32 bits of the CPU time-stamp counter
; Clobb: eax only (edx is saved and restored around rdtsc)
;-----------------------------------------------------------------------
_rdtsc:
        push    edx                     ; rdtsc writes edx:eax; keep caller's edx
        rdtsc
        pop     edx
        ret

;-----------------------------------------------------------------------
; _quantize(char *image, int imagesize, int n)
; Keep the top n bits of every byte of image and clear the rest.
; Works on whole 8-byte groups only: the last imagesize % 8 bytes are
; left untouched, and nothing is done when imagesize < 8.
; NOTE(review): the mask construction assumes 0 <= n <= 8 - confirm
; against the caller.
; Clobb: none visible to the caller (pusha/popa, emms on exit)
;-----------------------------------------------------------------------
_quantize:      ;(char *image, int imagesize, int n)
        pusha
        mov     ecx, [esp+ARGS+8]       ; n
        mov     eax, 0ffffff00h         ; create mask of n 1s and 8-n 0s in al
        shr     eax, cl
        mov     ah, al                  ; 2 copies in ah, al
        movd    mm0, eax
        pshufw  mm0, mm0, 0             ; 8 copies in mm0
        mov     esi, [esp+ARGS]         ; image
        mov     ecx, [esp+ARGS+4]       ; imagesize
        shr     ecx, 3                  ; ecx = number of whole 8-byte groups
        jz      .done                   ; no whole group: do not touch memory
        sub     esi, 8                  ; bias: group k (1-based) is at [esi+k*8]
        align   16
.loop:
        movq    mm1, [esi+ecx*8]
        pand    mm1, mm0
        movq    [esi+ecx*8], mm1
        sub     ecx, 1
        jnz     .loop
.done:
        emms
        popa
        ret

; ----------------------------------------------------------------------
; NOTE(review): the text of this file is damaged at this point.
; Everything from the middle of the comment below up to the loop body of
; _rotate is missing (it looks as if the span between a '<' and a '>' was
; stripped). The lost span held all of _grayscale and the label and
; set-up code of _rotate; both names are still declared global above.
; The surviving text is kept verbatim and excluded from assembly with
; %if 0: without its own label the `jb .loop` below would bind to the
; .loop of _quantize. Restore both routines from the original file
; before building.
; ----------------------------------------------------------------------

; Convert color to gray scale:
;   for (i=0; i ...                     (rest of comment and routine lost)

%if 0
        ; ... [edi] until they meet in the middle
        ; presumably esi starts at the first qword of the image and edi at
        ; the last one - TODO confirm against the original file
        movq    mm0, [esi]
        movq    mm1, [edi]
        pshufw  mm2, mm0, 00011011b     ; reverse words
        pshufw  mm3, mm1, 00011011b
        movq    mm0, mm2                ; swap bytes within words
        movq    mm1, mm3
        psllw   mm2, 8
        psrlw   mm0, 8
        psllw   mm3, 8
        psrlw   mm1, 8
        por     mm2, mm0
        por     mm3, mm1
        movq    [esi], mm3              ; swap qwords and store
        movq    [edi], mm2
        add     esi, 8                  ; move to middle
        sub     edi, 8
        cmp     esi, edi
        jb      .loop
        emms
        popa
        ret
%endif

;-----------------------------------------------------------------------
; _adjust(char *image, int n, int C, int B)
; Adjust contrast (C) and brightness (B) of an image:
;   image[i] = image[i] * C/256 + B, i=0..n-1, saturated to 0..255
; Only the low 16 bits of C and B are used (as signed words), and 2*C is
; formed with signed saturation.
; Works on whole 8-byte groups only: the last n % 8 bytes are left
; untouched, and nothing is done when n < 8.
; Clobb: none visible to the caller (pusha/popa, emms on exit)
;-----------------------------------------------------------------------
_adjust:        ;(char *image, int n, int C, int B)
        pusha
        mov     eax, [esp+ARGS+8]       ; C
        movd    mm2, eax
        pshufw  mm2, mm2, 0             ; mm2 = 4 copies of C as signed words
        paddsw  mm2, mm2                ; C*2
        mov     eax, [esp+ARGS+12]      ; B
        movd    mm1, eax
        pshufw  mm1, mm1, 0             ; mm1 = 4 copies of B
        mov     esi, [esp+ARGS]         ; image
        mov     ecx, [esp+ARGS+4]       ; n
        sub     esi, 16                 ; bias: group k (1-based) is at [esi+k*8+8]
        shr     ecx, 3                  ; ecx = number of whole 8-byte groups
        jz      .done                   ; no whole group: do not touch memory
        pxor    mm0, mm0                ; mm0 = 0
        test    ecx, 1
        jz      .loop                   ; even count: straight to the 2x loop

        ; Odd count: handle the topmost group on its own so the unrolled
        ; loop below always has a complete pair and never reaches below
        ; the start of the image.
        movq    mm3, [esi+ecx*8+8]
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        psllw   mm3, 7
        psllw   mm4, 7
        pmulhw  mm3, mm2
        pmulhw  mm4, mm2
        paddsw  mm3, mm1
        paddsw  mm4, mm1
        packuswb mm3, mm4
        movq    [esi+ecx*8+8], mm3
        sub     ecx, 1
        jz      .done                   ; that was the only group

        align   16
.loop:                                  ; invariant: ecx is even and > 0
        movq    mm3, [esi+ecx*8+8]      ; unpack 8 pixels * 128 to words in mm3, mm4
        movq    mm4, mm3
        movq    mm5, [esi+ecx*8]        ; unpack 8 more to mm5, mm6 (2x unroll)
        movq    mm6, mm5
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        punpcklbw mm5, mm0
        punpckhbw mm6, mm0
        psllw   mm3, 7
        psllw   mm4, 7
        psllw   mm5, 7
        psllw   mm6, 7
        pmulhw  mm3, mm2                ; (pixel*128 * C*2) >> 16 = pixel*C/256
        pmulhw  mm4, mm2
        pmulhw  mm5, mm2
        pmulhw  mm6, mm2
        paddsw  mm3, mm1                ; add B
        paddsw  mm4, mm1
        paddsw  mm5, mm1
        paddsw  mm6, mm1
        packuswb mm3, mm4               ; pack with unsigned saturation and store
        packuswb mm5, mm6
        movq    [esi+ecx*8+8], mm3
        movq    [esi+ecx*8], mm5
        sub     ecx, 2
        jnz     .loop
.done:
        emms
        popa
        ret

;-----------------------------------------------------------------------
; _horiz(char *image, int n, int L, int R)
; Horizontal filter: image[i] = (L*image[i]+R*image[i+1])/256, i=0..n-1
; Only the low 16 bits of L and R are used (as signed words).
; Pixels are processed 8 at a time in ascending order, so n is rounded
; up to a multiple of 8 and each step also reads the byte after its
; group: the buffer must be readable up to one byte past the last
; group. Nothing is done when n == 0.
; Clobb: none visible to the caller (pusha/popa, emms on exit)
;-----------------------------------------------------------------------
_horiz:         ;(char* image, int n, int L, int R)
        pusha
        mov     eax, [esp+ARGS+8]       ; mm1 = 4 copies of L*2
        movd    mm1, eax
        pshufw  mm1, mm1, 0
        paddsw  mm1, mm1
        mov     eax, [esp+ARGS+12]      ; mm2 = 4 copies of R*2
        movd    mm2, eax
        pshufw  mm2, mm2, 0
        paddsw  mm2, mm2
        mov     edi, [esp+ARGS]         ; image
        mov     edx, [esp+ARGS+4]       ; n
        test    edx, edx
        jz      .done                   ; empty image: do not touch memory
        xor     ecx, ecx                ; i
        pxor    mm0, mm0                ; mm0 = 0
        align   16
.loop:
        movq    mm3, [edi+ecx]          ; unpack 8 pixels * 128 to mm3, mm4
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        psllw   mm3, 7
        psllw   mm4, 7
        pmulhw  mm3, mm1                ; multiply by L*2
        pmulhw  mm4, mm1
        movq    mm5, [edi+ecx+1]        ; unpack 8 pixels to the right to mm5, mm6 (unaligned)
        movq    mm6, mm5
        punpcklbw mm5, mm0
        punpckhbw mm6, mm0
        psllw   mm5, 7
        psllw   mm6, 7
        pmulhw  mm5, mm2                ; multiply by R*2
        pmulhw  mm6, mm2
        paddsw  mm3, mm5                ; add
        paddsw  mm4, mm6
        packuswb mm3, mm4               ; pack and store
        movq    [edi+ecx], mm3
        add     ecx, 8
        cmp     ecx, edx
        jb      .loop
.done:
        emms
        popa
        ret

;-----------------------------------------------------------------------
; _vert(char *image, int n, int width, int T, int B)
; Vertical filter: image[i] = (T*image[i]+B*image[i-width])/256, i=0..n-1
; Only the low 16 bits of T and B are used (as signed words).
; Pixels are processed 8 at a time from image+n-8 downward, and only
; groups that lie wholly inside [image, image+n) are touched: when n is
; not a multiple of 8 the first n % 8 bytes are left unchanged, and
; nothing is done when n < 8.
; The routine reads image[i-width], so memory from image-width must be
; readable.
; Clobb: none visible to the caller (pusha/popa, emms on exit)
;-----------------------------------------------------------------------
_vert:          ;(char* image, int n, int width, int T, int B)
        pusha
        mov     eax, [esp+ARGS+12]      ; mm1 = 4 copies of T*2
        movd    mm1, eax
        pshufw  mm1, mm1, 0
        paddsw  mm1, mm1
        mov     eax, [esp+ARGS+16]      ; mm2 = 4 copies of B*2
        movd    mm2, eax
        pshufw  mm2, mm2, 0
        paddsw  mm2, mm2
        mov     edi, [esp+ARGS]         ; image
        mov     esi, edi
        sub     esi, [esp+ARGS+8]       ; image-width (1 row below)
        mov     ecx, [esp+ARGS+4]       ; n
        sub     ecx, 8                  ; ecx = offset of the topmost 8-byte group
        jb      .done                   ; n < 8: no whole group
        pxor    mm0, mm0                ; mm0 = 0
        align   16
.loop:                                  ; invariant: 0 <= ecx <= n-8
        movq    mm3, [edi+ecx]          ; unpack 8 pixels * 128 to mm3, mm4
        movq    mm4, mm3
        punpcklbw mm3, mm0
        punpckhbw mm4, mm0
        psllw   mm3, 7
        psllw   mm4, 7
        pmulhw  mm3, mm1                ; multiply by T*2
        pmulhw  mm4, mm1
        movq    mm5, [esi+ecx]          ; unpack 8 pixels below to mm5, mm6
        movq    mm6, mm5
        punpcklbw mm5, mm0
        punpckhbw mm6, mm0
        psllw   mm5, 7
        psllw   mm6, 7
        pmulhw  mm5, mm2                ; multiply by B*2
        pmulhw  mm6, mm2
        paddsw  mm3, mm5                ; add
        paddsw  mm4, mm6
        packuswb mm3, mm4               ; pack and store
        movq    [edi+ecx], mm3
        sub     ecx, 8
        jae     .loop                   ; no borrow: another whole group remains
.done:
        emms
        popa
        ret

;-----------------------------------------------------------------------
; preg - print registers for debugging in this format:
;   mm0-lo mm0-hi mm1-lo mm1-hi mm2-lo mm2-hi mm3-lo mm3-hi
;   mm4-lo mm4-hi mm5-lo mm5-hi mm6-lo mm6-hi mm7-lo mm7-hi
;   edi    esi    ebp    esp    ebx    edx    ecx    eax
; The last row is the pusha frame read from its lowest address upward;
; the esp shown is the value esp had before the pusha.
; Takes no arguments. General registers are restored by popa; the MMX
; registers are not reloaded after the call to _printf.
;-----------------------------------------------------------------------
preg_fmt db "%08X %08X %08X %08X %08X %08X %08X %08X",10
         db "%08X %08X %08X %08X %08X %08X %08X %08X",10
         db "%08X %08X %08X %08X %08X %08X %08X %08X",10,0

preg:
        pusha                           ; becomes printf arguments 17..24
        sub     esp, MM_SAVE            ; room for printf arguments 1..16
        movq    [esp+56], mm7
        movq    [esp+48], mm6
        movq    [esp+40], mm5
        movq    [esp+32], mm4
        movq    [esp+24], mm3
        movq    [esp+16], mm2
        movq    [esp+8], mm1
        movq    [esp], mm0
        push    preg_fmt
        call    _printf
        add     esp, MM_SAVE+4          ; drop format pointer and MMX spill area
        popa
        ret