[Spice-commits] 2 commits - exec-all.h qemu-img.c qemu-io.c tcg/ppc

Thu Nov 8 10:59:06 PST 2012

exec-all.h           |    2 +-
 qemu-img.c           |    3 +--
 qemu-io.c            |    3 +--
 tcg/ppc/tcg-target.c |   32 ++++++++++++++++++++++++--------
 4 files changed, 27 insertions(+), 13 deletions(-)

New commits:
commit 2592c59a66d456fe98fe96cb5787b356c40ee66f
Author: Paolo Bonzini <pbonzini at redhat.com>
Date:   Sat Nov 3 18:10:17 2012 +0100

    tools: initialize main loop before block layer
    
    Tools were broken because they initialized the block layer while
    qemu_aio_context was still NULL.
    
    Reported-by: malc <av1474 at comtv.ru>
    Signed-off-by: Paolo Bonzini <pbonzini at redhat.com>
    Signed-off-by: malc <av1474 at comtv.ru>

diff --git a/qemu-img.c b/qemu-img.c
index b17bddd..e29e01b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2001,14 +2001,13 @@ int main(int argc, char **argv)
 
     error_set_progname(argv[0]);
 
+    qemu_init_main_loop();
     bdrv_init();
     if (argc < 2)
         help();
     cmdname = argv[1];
     argc--; argv++;
 
-    qemu_init_main_loop();
-
     /* find the command */
     for(cmd = img_cmds; cmd->name != NULL; cmd++) {
         if (!strcmp(cmdname, cmd->name)) {
diff --git a/qemu-io.c b/qemu-io.c
index d0f4fb7..1ad7d3a 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1892,9 +1892,8 @@ int main(int argc, char **argv)
         exit(1);
     }
 
-    bdrv_init();
-
     qemu_init_main_loop();
+    bdrv_init();
 
     /* initialize commands */
     quit_init();
commit c878da3b27ceeed953c9f9a1eb002d59e9dcb4c6
Author: malc <av1474 at comtv.ru>
Date:   Mon Nov 5 21:47:04 2012 +0400

    tcg/ppc32: Use trampolines to trim the code size for mmu slow path accessors
    
    mmu access looks something like:
    
    <check tlb>
    if miss goto slow_path
    <fast path>
    done:
    ...
    
    ; end of the TB
    slow_path:
     <pre process>
     mr r3, r27         ; move areg0 to r3
                        ; (r3 holds the first argument for all the PPC32 ABIs)
     <call mmu_helper>
     b $+8
     .long done
     <post process>
     b done
    
    On ppc32 <call mmu_helper> is:
    
    (SysV and Darwin)
    
    mmu_helper is most likely not within direct branching distance from
    the call site, necessitating
    
    a. moving 32 bit offset of mmu_helper into a GPR ; 8 bytes
    b. moving GPR to CTR/LR                          ; 4 bytes
    c. (finally) branching to CTR/LR                 ; 4 bytes
    
    r3 setting              - 4 bytes
    call                    - 16 bytes
    dummy jump over retaddr - 4 bytes
    embedded retaddr        - 4 bytes
             Total overhead - 28 bytes
    
    (PowerOpen (AIX))
    a. moving 32 bit offset of mmu_helper's TOC into a GPR1 ; 8 bytes
    b. loading 32 bit function pointer into GPR2            ; 4 bytes
    c. moving GPR2 to CTR/LR                                ; 4 bytes
    d. loading 32 bit small area pointer into R2            ; 4 bytes
    e. (finally) branching to CTR/LR                        ; 4 bytes
    
    r3 setting              - 4 bytes
    call                    - 24 bytes
    dummy jump over retaddr - 4 bytes
    embedded retaddr        - 4 bytes
             Total overhead - 36 bytes
    
    Following is done to trim the code size of slow path sections:
    
    In tcg_target_qemu_prologue trampolines are emitted that look like this:
    
    trampoline:
    mfspr r3, LR
    addi  r3, r3, 4
    mtspr LR, r3      ; fixup LR to point over embedded retaddr
    mr    r3, r27
    <jump mmu_helper> ; tail call of sorts
    
    And slow path becomes:
    
    slow_path:
     <pre process>
     <call trampoline>
     .long done
     <post process>
     b done
    
    call                    - 4 bytes (trampoline is within code gen buffer
                                       and most likely accessible via
                                       direct branch)
    embedded retaddr        - 4 bytes
             Total overhead - 8 bytes
    
    In the end the icache pressure is decreased by 20 bytes (SysV and
    Darwin) or 28 bytes (PowerOpen) per slow path, at the cost of an extra
    jump to the trampoline and adjusting LR (to skip over the embedded
    retaddr) once inside.
    
    Signed-off-by: malc <av1474 at comtv.ru>

diff --git a/exec-all.h b/exec-all.h
index 94ed613..6b3272a 100644
--- a/exec-all.h
+++ b/exec-all.h
@@ -337,7 +337,7 @@ extern uintptr_t tci_tb_ptr;
                                     *(int32_t *)((void *)GETRA() + 3) - 1))
 # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64)
 #  define GETRA() ((uintptr_t)__builtin_return_address(0))
-#  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() + 4)) - 1))
+#  define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1))
 # else
 #  error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!"
 # endif
diff --git a/tcg/ppc/tcg-target.c b/tcg/ppc/tcg-target.c
index a1c74ce..34a0693 100644
--- a/tcg/ppc/tcg-target.c
+++ b/tcg/ppc/tcg-target.c
@@ -569,6 +569,9 @@ static const void * const qemu_st_helpers[4] = {
     helper_stq_mmu,
 };
 
+static void *ld_trampolines[4];
+static void *st_trampolines[4];
+
 static void tcg_out_tlb_check (TCGContext *s, int r0, int r1, int r2,
                                int addr_reg, int addr_reg2, int s_bits,
                                int offset1, int offset2, uint8_t **label_ptr)
@@ -848,8 +851,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
 
     /* slow path */
-    ir = 3;
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
+    ir = 4;
 #if TARGET_LONG_BITS == 32
     tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
 #else
@@ -860,8 +862,7 @@ static void tcg_out_qemu_ld_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
 #endif
     tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) qemu_ld_helpers[s_bits], 1);
-    tcg_out32 (s, B | 8);
+    tcg_out_call (s, (tcg_target_long) ld_trampolines[s_bits], 1);
     tcg_out32 (s, (tcg_target_long) raddr);
     switch (opc) {
     case 0|4:
@@ -916,8 +917,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     reloc_pc14 (label_ptr[0], (tcg_target_long) s->code_ptr);
 
     /* slow path */
-    ir = 3;
-    tcg_out_mov (s, TCG_TYPE_I32, ir++, TCG_AREG0);
+    ir = 4;
 #if TARGET_LONG_BITS == 32
     tcg_out_mov (s, TCG_TYPE_I32, ir++, addr_reg);
 #else
@@ -959,8 +959,7 @@ static void tcg_out_qemu_st_slow_path (TCGContext *s, TCGLabelQemuLdst *label)
     ir++;
 
     tcg_out_movi (s, TCG_TYPE_I32, ir, mem_index);
-    tcg_out_call (s, (tcg_target_long) qemu_st_helpers[opc], 1);
-    tcg_out32 (s, B | 8);
+    tcg_out_call (s, (tcg_target_long) st_trampolines[opc], 1);
     tcg_out32 (s, (tcg_target_long) raddr);
     tcg_out_b (s, 0, (tcg_target_long) raddr);
 }
@@ -983,6 +982,15 @@ void tcg_out_tb_finalize(TCGContext *s)
 }
 #endif
 
+static void emit_ldst_trampoline (TCGContext *s, const void *ptr)
+{
+    tcg_out32 (s, MFSPR | RT (3) | LR);
+    tcg_out32 (s, ADDI | RT (3) | RA (3) | 4);
+    tcg_out32 (s, MTSPR | RS (3) | LR);
+    tcg_out_mov (s, TCG_TYPE_I32, 3, TCG_AREG0);
+    tcg_out_b (s, 0, (tcg_target_long) ptr);
+}
+
 static void tcg_target_qemu_prologue (TCGContext *s)
 {
     int i, frame_size;
@@ -1043,6 +1051,14 @@ static void tcg_target_qemu_prologue (TCGContext *s)
     tcg_out32 (s, MTSPR | RS (0) | LR);
     tcg_out32 (s, ADDI | RT (1) | RA (1) | frame_size);
     tcg_out32 (s, BCLR | BO_ALWAYS);
+
+    for (i = 0; i < 4; ++i) {
+        ld_trampolines[i] = s->code_ptr;
+        emit_ldst_trampoline (s, qemu_ld_helpers[i]);
+
+        st_trampolines[i] = s->code_ptr;
+        emit_ldst_trampoline (s, qemu_st_helpers[i]);
+    }
 }
 
 static void tcg_out_ld (TCGContext *s, TCGType type, TCGReg ret, TCGReg arg1,