Re: [RFC] div64_64 support
From: Sami Farin <hidden>
Date: 2007-03-07 18:33:05
Also in:
lkml
On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
Sami Farin wrote:quoted
On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote: ...quoted
And I found a bug in gcc-4.1.2: it gave 0 for the ncubic results when doing the 1000 loops test... gcc-4.0.3 works.

Found it.

--- cbrt-test.c~ 2007-03-07 00:20:54.735248105 +0200
+++ cbrt-test.c 2007-03-07 00:21:03.964864343 +0200
@@ -209,7 +209,7 @@
 __asm__("bsrl %1,%0\n\t"
 "cmovzl %2,%0"
- : "=&r" (r) : "rm" (x), "rm" (-1));
+ : "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
 return r+1;
 }

Now Linux 2.6 does not have "memory" in fls; maybe it causes some of the gcc funnies some people are seeing.

Can you post the difference in the generated code with that change?
Fun... it looks like when "memory" is not used, gcc does not even bother calling ncubic() 666 times. So it gets better timings ( 42/666=0 ) =)
--- cbrt-test-no_memory.s 2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s 2007-03-07 20:22:38.237013197 +0200... main: leal 4(%esp), %ecx andl $-16, %esp pushl -4(%ecx) pushl %ebp pushl %edi pushl %esi pushl %ebx pushl %ecx - subl $136, %esp + subl $152, %esp movl $.LC0, (%esp) call puts xorl %edx, %edx movl $27, %eax call ncubic cmpl $3, %eax - je .L83 + je .L87 movl $.LC1, (%esp) call puts -.L83: - xorl %eax, %eax - xorl %edi, %edi - movl %eax, 88(%esp) +.L87: xorl %eax, %eax - xorl %esi, %esi + xorl %ebp, %ebp movl %eax, 92(%esp) xorl %eax, %eax - xorl %ebp, %ebp + xorl %edi, %edi movl %eax, 96(%esp) xorl %eax, %eax + xorl %esi, %esi movl %eax, 100(%esp) xorl %eax, %eax movl %eax, 104(%esp) xorl %eax, %eax movl %eax, 108(%esp) - movl %edi, 112(%esp) - movl %esi, 116(%esp) - .p2align 4,,15 -.L84: + xorl %eax, %eax + movl %eax, 112(%esp) + movl %ebp, 116(%esp) + movl %edi, 120(%esp) + movl %esi, 124(%esp) +.L88: #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, 56(%esp) movl %edx, 60(%esp) #APP movl $0, %eax cpuid rdtsc #NO_APP movl %eax, %esi movl %edx, %edi subl 56(%esp), %esi sbbl 60(%esp), %edi cmpl $0, %edi ja .L66 cmpl $999, %esi - jbe .L84 + jbe .L88 .L66: + movl 92(%esp), %edx + leal (%edx,%edx,2), %eax + movl cases+4(,%eax,4), %edi + movl cases(,%eax,4), %esi + movl %edi, %edx + movl %esi, %eax + call ncubic #APP movl $0, %eax cpuid rdtsc #NO_APP - movl %eax, %esi - movl %edx, %edi + movl $666, %ebx + movl %eax, 128(%esp) + movl %edx, 132(%esp) + .p2align 4,,15 +.L67: + movl %esi, %eax + movl %edi, %edx + call ncubic + decl %ebx + movl %eax, %ebp + jne .L67 #APP movl $0, %eax cpuid rdtsc #NO_APP - subl %esi, %eax + subl 128(%esp), %eax movl $666, %ebx - sbbl %edi, %edx - xorl %ecx, %ecx movl %ebx, 8(%esp) + sbbl 132(%esp), %edx + xorl %ecx, %ecx movl %ecx, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) call __udivdi3 - addl %eax, 104(%esp) + addl %eax, 112(%esp) movl %edx, %ecx movl %eax, %ebx movl %edx, %esi - adcl %edx, 108(%esp) + adcl %edx, 116(%esp) imull %eax, %ecx mull %ebx addl %ecx, %ecx 
movl %eax, 56(%esp) addl %ecx, %edx movl 56(%esp), %eax - addl %eax, 112(%esp) + addl %eax, 120(%esp) movl %edx, 60(%esp) movl 60(%esp), %edx - adcl %edx, 116(%esp) - cmpl %esi, 92(%esp) - ja .L67 - jb .L68 - cmpl %ebx, 88(%esp) - jae .L67 -.L68: - movl %ebx, 88(%esp) - movl %esi, 92(%esp) -.L67: - leal (%ebp,%ebp,2), %ebx - sall $2, %ebx - movl cases+4(%ebx), %edx - movl cases(%ebx), %eax - call ncubic - movl cases+8(%ebx), %edx - subl %eax, %edx - movl %edx, %eax - sarl $31, %eax - xorl %eax, %edx - subl %eax, %edx - movl %edx, %ecx - sarl $31, %ecx - addl %edx, 96(%esp) - adcl %ecx, 100(%esp) - incl %ebp - cmpl $183, %ebp - jbe .L84 - movl 108(%esp), %eax - fildll 104(%esp) - testl %eax, %eax - js .L85 + adcl %edx, 124(%esp) + cmpl %esi, 100(%esp) + ja .L69 + jb .L70 + cmpl %ebx, 96(%esp) + jae .L69 .L70: - fstpl 120(%esp) + movl %ebx, 96(%esp) + movl %esi, 100(%esp) +.L69: + movl 92(%esp), %edx + leal (%edx,%edx,2), %eax + movl cases+8(,%eax,4), %eax + subl %ebp, %eax + movl %eax, %ecx + sarl $31, %ecx + xorl %ecx, %eax + subl %ecx, %eax + cltd + addl %eax, 104(%esp) + adcl %edx, 108(%esp) + incl 92(%esp) + cmpl $183, 92(%esp) + jbe .L88 movl 116(%esp), %eax - fldl 120(%esp) + fildll 112(%esp) + testl %eax, %eax + js .L89 +.L72: + fstpl 136(%esp) + movl 124(%esp), %eax + fldl 136(%esp) fdivl .LC7 testl %eax, %eax flds .LC4 fdivr %st, %st(1) - fildll 112(%esp) - js .L86 -.L71: - fstpl 120(%esp) - fldl 120(%esp) + fildll 120(%esp) + js .L90 +.L73: + fstpl 136(%esp) + fldl 136(%esp) fdivl .LC7 fdivp %st, %st(1) fld %st(1) fmul %st(2), %st fsubrp %st, %st(1) fld %st(0) fsqrt fucomi %st(0), %st - jp .L88 - je .L89 -.L88: + jp .L92 + je .L93 +.L92: fstp %st(0) fstpl (%esp) fstpl 64(%esp) call sqrt fldl 64(%esp) fxch %st(1) -.L72: - movl 96(%esp), %eax - movl 100(%esp), %edx - fildll 88(%esp) +.L74: + movl 104(%esp), %eax + movl 108(%esp), %edx + fildll 96(%esp) movl %eax, 40(%esp) - movl 92(%esp), %eax + movl 100(%esp), %eax movl %edx, 44(%esp) testl %eax, %eax - js 
.L87 -.L73: - fstpl 120(%esp) - movl 104(%esp), %eax + js .L91 +.L75: + fstpl 136(%esp) + movl 112(%esp), %eax movl $184, %ebp - fldl 120(%esp) + fldl 136(%esp) xorl %edi, %edi movl $.LC5, %esi fdivl .LC7 - movl 108(%esp), %edx + movl 116(%esp), %edx movl %ebp, 8(%esp) movl %edi, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) fstpl 32(%esp) fstpl 24(%esp) fstpl 16(%esp) call __udivdi3 movl %esi, 4(%esp) movl $.LC6, (%esp) movl %eax, 8(%esp) movl %edx, 12(%esp) call printf - addl $136, %esp + addl $152, %esp xorl %eax, %eax popl %ecx popl %ebx popl %esi popl %edi popl %ebp leal -4(%ecx), %esp ret -.L89: +.L93: fstp %st(1) + jmp .L74 +.L89: + fadds .LC2 jmp .L72 -.L85: +.L91: fadds .LC2 - jmp .L70 -.L87: + jmp .L75 +.L90: fadds .LC2 jmp .L73 -.L86: - fadds .LC2 - jmp .L71 .size main, .-main .section .rodata .align 32 .type cases, @object .size cases, 2208 cases: ... --
Attachments
- cbrt-test.c [text/plain] 11463 bytes · preview