Thread (40 messages) 40 messages, 3 authors, 2015-07-13
STALE3995d

[PATCH v3 09/11] KVM: arm: implement lazy world switch for debug registers

From: Christoffer Dall <hidden>
Date: 2015-07-03 21:05:53
Also in: kvm, kvmarm

On Fri, Jul 03, 2015 at 06:06:48PM +0800, Zhichao Huang wrote:

On June 30, 2015 9:15:22 PM GMT+08:00, Christoffer Dall [off-list ref] wrote:
quoted
On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
quoted
Implement switching of the debug registers. While the number
of registers is massive, CPUs usually don't implement them all
(A15 has 6 breakpoints and 4 watchpoints, which gives us a total
of 22 registers "only").

Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).

We have to do the save/restore dance in this case, because the host
and the guest might use their respective debug registers at any moment.
this sounds expensive, and I suggested an alternative approach in the
previsou patch.  In any case, measuring the impact on this on hardware
would be a great idea...
quoted
If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
the debug registers as dirty, we only save/resotre DBGDSCR.
restore
quoted
Signed-off-by: Zhichao Huang <redacted>
---
 arch/arm/kvm/interrupts.S      |  16 +++
 arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 263 insertions(+), 2 deletions(-)
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 79caf79..d626275 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
 	read_cp15_state store_to_vcpu = 0
 	write_cp15_state read_from_vcpu = 1
 
+	@ Store hardware CP14 state and load guest state
+	compute_debug_state 1f
+	bl __save_host_debug_regs
+	bl __restore_guest_debug_regs
+
+1:
 	@ If the host kernel has not been configured with VFPv3 support,
 	@ then it is safer if we deny guests from using it as well.
 #ifdef CONFIG_VFPv3
@@ -201,6 +207,16 @@ after_vfp_restore:
 	mrc	p15, 0, r2, c0, c0, 5
 	mcr	p15, 4, r2, c0, c0, 5
 
+	@ Store guest CP14 state and restore host state
+	skip_debug_state 1f
+	bl __save_guest_debug_regs
+	bl __restore_host_debug_regs
+	/* Clear the dirty flag for the next run, as all the state has
+	 * already been saved. Note that we nuke the whole 32bit word.
+	 * If we ever add more flags, we'll have to be more careful...
+	 */
+	clear_debug_dirty_bit
+1:
 	@ Store guest CP15 state and restore host state
 	read_cp15_state store_to_vcpu = 1
 	write_cp15_state read_from_vcpu = 0
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 5662c39..ed406be 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -7,6 +7,7 @@
 #define VCPU_USR_SP		(VCPU_USR_REG(13))
 #define VCPU_USR_LR		(VCPU_USR_REG(14))
 #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
+#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) *
4))
quoted
 
 /*
  * Many of these macros need to access the VCPU structure, which is
always
quoted
@@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
  * Clobbers *all* registers.
  */
 .macro restore_guest_regs
-	/* reset DBGDSCR to disable debug mode */
-	mov	r2, #0
+	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
 	mcr	p14, 0, r2, c0, c2, 2
 
 	restore_guest_regs_mode svc, #VCPU_SVC_REGS
@@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 	save_guest_regs_mode abt, #VCPU_ABT_REGS
 	save_guest_regs_mode und, #VCPU_UND_REGS
 	save_guest_regs_mode irq, #VCPU_IRQ_REGS
+
+	/* DBGDSCR reg */
+	mrc	p14, 0, r2, c0, c1, 0
+	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
 .endm
 
 /* Reads cp15 registers from hardware and stores them in memory
@@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
 .endm
 
+/* Assume r11/r12 in used, clobbers r2-r10 */
+.macro cp14_read_and_push Op2 skip_num
+	cmp	\skip_num, #8
+	// if (skip_num >= 8) then skip c8-c15 directly
+	bge	1f
+	adr	r2, 9998f
+	add	r2, r2, \skip_num, lsl #2
+	bx	r2
+1:
+	adr	r2, 9999f
+	sub	r3, \skip_num, #8
+	add	r2, r2, r3, lsl #2
+	bx	r2
+9998:
+	mrc	p14, 0, r10, c0, c15, \Op2
+	mrc	p14, 0, r9, c0, c14, \Op2
+	mrc	p14, 0, r8, c0, c13, \Op2
+	mrc	p14, 0, r7, c0, c12, \Op2
+	mrc	p14, 0, r6, c0, c11, \Op2
+	mrc	p14, 0, r5, c0, c10, \Op2
+	mrc	p14, 0, r4, c0, c9, \Op2
+	mrc	p14, 0, r3, c0, c8, \Op2
+	push	{r3-r10}
you probably don't want to do more stores to memory than required
Yeah, there is no need to push some registers, but I can't find a better 
way to optimize it, is there any precedents that I can refer to?
Can you not simply do what you do below where you read the coproc
register and then do the store of that and so on?

If you unify the two approaches you should be in the clear on this one
anyway...
Imaging that there are only 2 hwbrpts available, BCR0/BCR1, and we
can code like this:

Save:
		 jump to 1
    save BCR2 to r5
1:
    save BCR1 to r4
    save BCR0 to r3
    push {r3-r5}

Restore:
    pop {r3-r5}
    jump to 1
    restore r5 to BCR2
1:
    restore r4 to BCR1
    restore r3 to BCR0

Buf if we want to only push the registers we acutally need, we have to
code like this:

Save:
    jump to 1
    save BCR2 to r5
    push r5
1:
    save BCR1 to r4
    push r4
    save BCR0 to r3
    push r3

Resotre:
    jump to 1
    pop r5
    restore r5 to BCR2
1:
    pop r4
    restore r4 to BCR1
    pop r3
    restore r3 to BCR0

Then we might entercounter a mistake on restoring, as we want to pop
r4, but actually we pop r3.
quoted
quoted
+9999:
+	mrc	p14, 0, r10, c0, c7, \Op2
+	mrc	p14, 0, r9, c0, c6, \Op2
+	mrc	p14, 0, r8, c0, c5, \Op2
+	mrc	p14, 0, r7, c0, c4, \Op2
+	mrc	p14, 0, r6, c0, c3, \Op2
+	mrc	p14, 0, r5, c0, c2, \Op2
+	mrc	p14, 0, r4, c0, c1, \Op2
+	mrc	p14, 0, r3, c0, c0, \Op2
+	push	{r3-r10}
same
quoted
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r10 */
+.macro cp14_pop_and_write Op2 skip_num
+	cmp	\skip_num, #8
+	// if (skip_num >= 8) then skip c8-c15 directly
+	bge	1f
+	adr	r2, 9998f
+	add	r2, r2, \skip_num, lsl #2
+	pop	{r3-r10}
you probably don't want to do more loads from memory than required
quoted
+	bx	r2
+1:
+	adr	r2, 9999f
+	sub	r3, \skip_num, #8
+	add	r2, r2, r3, lsl #2
+	pop	{r3-r10}
same
quoted
+	bx	r2
+
+9998:
+	mcr	p14, 0, r10, c0, c15, \Op2
+	mcr	p14, 0, r9, c0, c14, \Op2
+	mcr	p14, 0, r8, c0, c13, \Op2
+	mcr	p14, 0, r7, c0, c12, \Op2
+	mcr	p14, 0, r6, c0, c11, \Op2
+	mcr	p14, 0, r5, c0, c10, \Op2
+	mcr	p14, 0, r4, c0, c9, \Op2
+	mcr	p14, 0, r3, c0, c8, \Op2
+
+	pop	{r3-r10}
+9999:
+	mcr	p14, 0, r10, c0, c7, \Op2
+	mcr	p14, 0, r9, c0, c6, \Op2
+	mcr	p14, 0, r8, c0, c5, \Op2
+	mcr	p14, 0, r7, c0, c4, \Op2
+	mcr	p14, 0, r6, c0, c3, \Op2
+	mcr	p14, 0, r5, c0, c2, \Op2
+	mcr	p14, 0, r4, c0, c1, \Op2
+	mcr	p14, 0, r3, c0, c0, \Op2
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r3 */
+.macro cp14_read_and_str Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	mrc	p14, 0, r2, c0, c15, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
+	mrc	p14, 0, r2, c0, c14, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
+	mrc	p14, 0, r2, c0, c13, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
+	mrc	p14, 0, r2, c0, c12, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
+	mrc	p14, 0, r2, c0, c11, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
+	mrc	p14, 0, r2, c0, c10, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
+	mrc	p14, 0, r2, c0, c9, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
+	mrc	p14, 0, r2, c0, c8, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
+	mrc	p14, 0, r2, c0, c7, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
+	mrc	p14, 0, r2, c0, c6, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
+	mrc	p14, 0, r2, c0, c5, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
+	mrc	p14, 0, r2, c0, c4, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
+	mrc	p14, 0, r2, c0, c3, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
+	mrc	p14, 0, r2, c0, c2, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
+	mrc	p14, 0, r2, c0, c1, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
+	mrc	p14, 0, r2, c0, c0, \Op2
+	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
+.endm
+
+/* Assume r11/r12 in used, clobbers r2-r3 */
+.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
+	adr	r3, 1f
+	add	r3, r3, \skip_num, lsl #3
+	bx	r3
+1:
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
+	mcr	p14, 0, r2, c0, c15, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
+	mcr	p14, 0, r2, c0, c14, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
+	mcr	p14, 0, r2, c0, c13, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
+	mcr	p14, 0, r2, c0, c12, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
+	mcr	p14, 0, r2, c0, c11, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
+	mcr	p14, 0, r2, c0, c10, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
+	mcr	p14, 0, r2, c0, c9, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
+	mcr	p14, 0, r2, c0, c8, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
+	mcr	p14, 0, r2, c0, c7, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
+	mcr	p14, 0, r2, c0, c6, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
+	mcr	p14, 0, r2, c0, c5, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
+	mcr	p14, 0, r2, c0, c4, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
+	mcr	p14, 0, r2, c0, c3, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
+	mcr	p14, 0, r2, c0, c2, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
+	mcr	p14, 0, r2, c0, c1, \Op2
+	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
+	mcr	p14, 0, r2, c0, c0, \Op2
+.endm
can you not find some way of unifying cp14_pop_and_write with
cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?

Probably having two separate structs for the VFP state on the vcpu
struct
for both the guest and the host state is one possible way of doing so.
OK, I will do it.
Would you like me to rename the struct vfp_hard_struct, and add 
host_cp14_state in there, or add a new struct host_cp14_state in the
kvm_vcpu_arch?
Not sure I understand the question exactly.  I would probably define
kvm_cpu_context_t as a new struct that includes the VFP state instead of
it being a typedef, but I haven't looked at it too carefully.

quoted
quoted
+
+/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
+.macro read_hw_dbg_num
+	mrc	p14, 0, r2, c0, c0, 0
+	ubfx	r11, r2, #24, #4
+	add	r11, r11, #1		// Extract BRPs
+	ubfx	r12, r2, #28, #4
+	add	r12, r12, #1		// Extract WRPs
+	mov	r2, #16
+	sub	r11, r2, r11		// How many BPs to skip
+	sub	r12, r2, r12		// How many WPs to skip
+.endm
+
+/* Reads cp14 registers from hardware.
You have a lot of multi-line comments in these patches which don't
start
with a separate '/*' line, as dictated by the Linux kernel coding
style.
So far, I've ignored this, but please fix all these throughout the
series when you respin.
quoted
+ * Writes cp14 registers in-order to the stack.
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro save_host_debug_regs
+	read_hw_dbg_num
+	cp14_read_and_push #4, r11	@ DBGBVR
+	cp14_read_and_push #5, r11	@ DBGBCR
+	cp14_read_and_push #6, r12	@ DBGWVR
+	cp14_read_and_push #7, r12	@ DBGWCR
+.endm
+
+/* Reads cp14 registers from hardware.
+ * Writes cp14 registers in-order to the VCPU struct pointed to by
vcpup.
quoted
+ *
+ * Assumes vcpu pointer in vcpu reg
+ *
+ * Clobbers r2-r12
+ */
+.macro save_guest_debug_regs
+	read_hw_dbg_num
+	cp14_read_and_str #4, cp14_DBGBVR0, r11
why do you need the has before the op2 field?
Sorry, I can't quite understand.
heh, I meant hash, why is is '#4' instead of '4' ?

Thanks,
-Christoffer
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help