Thread: 55 messages, 12 authors, 2007-03-22

Re: [RFC] div64_64 support

From: Sami Farin <hidden>
Date: 2007-03-07 18:33:05
Also in: lkml

On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
Sami Farin wrote:
quoted
On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
...
quoted
And I found a bug in gcc-4.1.2: it gave 0 for the ncubic results
when doing the 1000-loop test... gcc-4.0.3 works.
Found it.
--- cbrt-test.c~	2007-03-07 00:20:54.735248105 +0200
+++ cbrt-test.c	2007-03-07 00:21:03.964864343 +0200
@@ -209,7 +209,7 @@
 
 	__asm__("bsrl %1,%0\n\t"
 		"cmovzl %2,%0"
-		: "=&r" (r) : "rm" (x), "rm" (-1));
+		: "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
 	return r+1;
 }
 
Now, Linux 2.6 does not have the "memory" clobber in fls(); maybe that
causes some of the gcc funnies some people are seeing.
Can you post the difference in the generated code with that change?
Fun... it looks like when not using "memory", gcc does not even bother
calling ncubic() 666 times.  So it gets better timings ( 42/666=0 ) =)
--- cbrt-test-no_memory.s	2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s	2007-03-07 20:22:38.237013197 +0200
...
 main:
 	leal	4(%esp), %ecx
 	andl	$-16, %esp
 	pushl	-4(%ecx)
 	pushl	%ebp
 	pushl	%edi
 	pushl	%esi
 	pushl	%ebx
 	pushl	%ecx
-	subl	$136, %esp
+	subl	$152, %esp
 	movl	$.LC0, (%esp)
 	call	puts
 	xorl	%edx, %edx
 	movl	$27, %eax
 	call	ncubic
 	cmpl	$3, %eax
-	je	.L83
+	je	.L87
 	movl	$.LC1, (%esp)
 	call	puts
-.L83:
-	xorl	%eax, %eax
-	xorl	%edi, %edi
-	movl	%eax, 88(%esp)
+.L87:
 	xorl	%eax, %eax
-	xorl	%esi, %esi
+	xorl	%ebp, %ebp
 	movl	%eax, 92(%esp)
 	xorl	%eax, %eax
-	xorl	%ebp, %ebp
+	xorl	%edi, %edi
 	movl	%eax, 96(%esp)
 	xorl	%eax, %eax
+	xorl	%esi, %esi
 	movl	%eax, 100(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 104(%esp)
 	xorl	%eax, %eax
 	movl	%eax, 108(%esp)
-	movl	%edi, 112(%esp)
-	movl	%esi, 116(%esp)
-	.p2align 4,,15
-.L84:
+	xorl	%eax, %eax
+	movl	%eax, 112(%esp)
+	movl	%ebp, 116(%esp)
+	movl	%edi, 120(%esp)
+	movl	%esi, 124(%esp)
+.L88:
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, 56(%esp)
 	movl	%edx, 60(%esp)
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
 	movl	%eax, %esi
 	movl	%edx, %edi
 	subl	56(%esp), %esi
 	sbbl	60(%esp), %edi
 	cmpl	$0, %edi
 	ja	.L66
 	cmpl	$999, %esi
-	jbe	.L84
+	jbe	.L88
 .L66:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+4(,%eax,4), %edi
+	movl	cases(,%eax,4), %esi
+	movl	%edi, %edx
+	movl	%esi, %eax
+	call	ncubic
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	movl	%eax, %esi
-	movl	%edx, %edi
+	movl	$666, %ebx
+	movl	%eax, 128(%esp)
+	movl	%edx, 132(%esp)
+	.p2align 4,,15
+.L67:
+	movl	%esi, %eax
+	movl	%edi, %edx
+	call	ncubic
+	decl	%ebx
+	movl	%eax, %ebp
+	jne	.L67
 #APP
 	movl $0, %eax
 	cpuid
 	rdtsc
 
 #NO_APP
-	subl	%esi, %eax
+	subl	128(%esp), %eax
 	movl	$666, %ebx
-	sbbl	%edi, %edx
-	xorl	%ecx, %ecx
 	movl	%ebx, 8(%esp)
+	sbbl	132(%esp), %edx
+	xorl	%ecx, %ecx
 	movl	%ecx, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	call	__udivdi3
-	addl	%eax, 104(%esp)
+	addl	%eax, 112(%esp)
 	movl	%edx, %ecx
 	movl	%eax, %ebx
 	movl	%edx, %esi
-	adcl	%edx, 108(%esp)
+	adcl	%edx, 116(%esp)
 	imull	%eax, %ecx
 	mull	%ebx
 	addl	%ecx, %ecx
 	movl	%eax, 56(%esp)
 	addl	%ecx, %edx
 	movl	56(%esp), %eax
-	addl	%eax, 112(%esp)
+	addl	%eax, 120(%esp)
 	movl	%edx, 60(%esp)
 	movl	60(%esp), %edx
-	adcl	%edx, 116(%esp)
-	cmpl	%esi, 92(%esp)
-	ja	.L67
-	jb	.L68
-	cmpl	%ebx, 88(%esp)
-	jae	.L67
-.L68:
-	movl	%ebx, 88(%esp)
-	movl	%esi, 92(%esp)
-.L67:
-	leal	(%ebp,%ebp,2), %ebx
-	sall	$2, %ebx
-	movl	cases+4(%ebx), %edx
-	movl	cases(%ebx), %eax
-	call	ncubic
-	movl	cases+8(%ebx), %edx
-	subl	%eax, %edx
-	movl	%edx, %eax
-	sarl	$31, %eax
-	xorl	%eax, %edx
-	subl	%eax, %edx
-	movl	%edx, %ecx
-	sarl	$31, %ecx
-	addl	%edx, 96(%esp)
-	adcl	%ecx, 100(%esp)
-	incl	%ebp
-	cmpl	$183, %ebp
-	jbe	.L84
-	movl	108(%esp), %eax
-	fildll	104(%esp)
-	testl	%eax, %eax
-	js	.L85
+	adcl	%edx, 124(%esp)
+	cmpl	%esi, 100(%esp)
+	ja	.L69
+	jb	.L70
+	cmpl	%ebx, 96(%esp)
+	jae	.L69
 .L70:
-	fstpl	120(%esp)
+	movl	%ebx, 96(%esp)
+	movl	%esi, 100(%esp)
+.L69:
+	movl	92(%esp), %edx
+	leal	(%edx,%edx,2), %eax
+	movl	cases+8(,%eax,4), %eax
+	subl	%ebp, %eax
+	movl	%eax, %ecx
+	sarl	$31, %ecx
+	xorl	%ecx, %eax
+	subl	%ecx, %eax
+	cltd
+	addl	%eax, 104(%esp)
+	adcl	%edx, 108(%esp)
+	incl	92(%esp)
+	cmpl	$183, 92(%esp)
+	jbe	.L88
 	movl	116(%esp), %eax
-	fldl	120(%esp)
+	fildll	112(%esp)
+	testl	%eax, %eax
+	js	.L89
+.L72:
+	fstpl	136(%esp)
+	movl	124(%esp), %eax
+	fldl	136(%esp)
 	fdivl	.LC7
 	testl	%eax, %eax
 	flds	.LC4
 	fdivr	%st, %st(1)
-	fildll	112(%esp)
-	js	.L86
-.L71:
-	fstpl	120(%esp)
-	fldl	120(%esp)
+	fildll	120(%esp)
+	js	.L90
+.L73:
+	fstpl	136(%esp)
+	fldl	136(%esp)
 	fdivl	.LC7
 	fdivp	%st, %st(1)
 	fld	%st(1)
 	fmul	%st(2), %st
 	fsubrp	%st, %st(1)
 	fld	%st(0)
 	fsqrt
 	fucomi	%st(0), %st
-	jp	.L88
-	je	.L89
-.L88:
+	jp	.L92
+	je	.L93
+.L92:
 	fstp	%st(0)
 	fstpl	(%esp)
 	fstpl	64(%esp)
 	call	sqrt
 	fldl	64(%esp)
 	fxch	%st(1)
-.L72:
-	movl	96(%esp), %eax
-	movl	100(%esp), %edx
-	fildll	88(%esp)
+.L74:
+	movl	104(%esp), %eax
+	movl	108(%esp), %edx
+	fildll	96(%esp)
 	movl	%eax, 40(%esp)
-	movl	92(%esp), %eax
+	movl	100(%esp), %eax
 	movl	%edx, 44(%esp)
 	testl	%eax, %eax
-	js	.L87
-.L73:
-	fstpl	120(%esp)
-	movl	104(%esp), %eax
+	js	.L91
+.L75:
+	fstpl	136(%esp)
+	movl	112(%esp), %eax
 	movl	$184, %ebp
-	fldl	120(%esp)
+	fldl	136(%esp)
 	xorl	%edi, %edi
 	movl	$.LC5, %esi
 	fdivl	.LC7
-	movl	108(%esp), %edx
+	movl	116(%esp), %edx
 	movl	%ebp, 8(%esp)
 	movl	%edi, 12(%esp)
 	movl	%eax, (%esp)
 	movl	%edx, 4(%esp)
 	fstpl	32(%esp)
 	fstpl	24(%esp)
 	fstpl	16(%esp)
 	call	__udivdi3
 	movl	%esi, 4(%esp)
 	movl	$.LC6, (%esp)
 	movl	%eax, 8(%esp)
 	movl	%edx, 12(%esp)
 	call	printf
-	addl	$136, %esp
+	addl	$152, %esp
 	xorl	%eax, %eax
 	popl	%ecx
 	popl	%ebx
 	popl	%esi
 	popl	%edi
 	popl	%ebp
 	leal	-4(%ecx), %esp
 	ret
-.L89:
+.L93:
 	fstp	%st(1)
+	jmp	.L74
+.L89:
+	fadds	.LC2
 	jmp	.L72
-.L85:
+.L91:
 	fadds	.LC2
-	jmp	.L70
-.L87:
+	jmp	.L75
+.L90:
 	fadds	.LC2
 	jmp	.L73
-.L86:
-	fadds	.LC2
-	jmp	.L71
 	.size	main, .-main
 	.section	.rodata
 	.align 32
 	.type	cases, @object
 	.size	cases, 2208
 cases:
...


-- 

Attachments

Keyboard shortcuts
h — back out one level
j — next message in thread
k — previous message in thread
l — drill in
Esc — close help / fold thread tree
? — toggle this help