Thread (7 messages) 7 messages, 2 authors, 2018-05-24

Re: [PATCH v5 2/4] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision

From: Michael Ellerman <mpe@ellerman.id.au>
Date: 2018-05-24 07:44:34

Hi Simon,

wei.guo.simon@gmail.com writes:
From: Simon Guo <redacted>

This patch add VMX primitives to do memcmp() in case the compare size
exceeds 4K bytes. KSM feature can benefit from this.
You say "exceeds 4K" here.
quoted hunk ↗ jump to hunk
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES 4096
THRES == 4096

BTW, can we call it VMX_THRESH ?
quoted hunk ↗ jump to hunk
+#define ENTER_VMX_OPS	\
+	mflr    r0;	\
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      enter_vmx_ops; \
+	cmpwi   cr1,r3,0; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+#define EXIT_VMX_OPS \
+	mflr    r0; \
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std     r0,16(r1); \
+	stdu    r1,-STACKFRAMESIZE(r1); \
+	bl      exit_vmx_ops; \
+	ld      r0,STACKFRAMESIZE+16(r1); \
+	ld      r3,STK_REG(R31)(r1); \
+	ld      r4,STK_REG(R30)(r1); \
+	ld      r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr    r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                 ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+        lvx     _v2nd_qw,_vaddr,off16; \
+        VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -174,6 +235,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,VMX_OPS_THRES
+	bge	cr6,.Lsameoffset_vmx_cmp
Here we compare the length to 4K and if it's greater *or equal* then we
go to the VMX case. Or am I reading it backward?

So we should say "if the size is 4K or more we do VMX" shouldn't we?

cheers
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help