Bug 1499507 - Allow the compiler to generate a non-atomic increment instruction for the stack pointer increment. r=njn

This change reduces the binary size on macOS x64 by around 50KB.

Here's a diff of the impact on the code generated for Attr_Binding::get_specified
in the Mac build. It's a bit hard to read because %r12 and %rbx swap their
function, but what happens in this method is that "movq       %r12, %rcx" goes
away, and the two instructions "leal       0x1(%r12) %eax" and
"movl       %eax, 0x10(%rbx)" turn into an "incl       0x10(%r12)".
So the old code was preserving the original value of profilingStack->stackPointer
in a register, and then using it later to compute the incremented stackPointer.
The new code uses an "incl" instruction for the stackPointer increment and
doesn't worry that the stackPointer value might have changed since the stack
size check at the start of the function. (It can't have changed.)

before: %rbx has the ProfilingStack*, %r12 has profilingStack->stackPointer
after: %r12 has the ProfilingStack*, %rbx has profilingStack->stackPointer

@@ -3,37 +3,35 @@
    movq       %rsp, %rbp
    pushq      %r15
    pushq      %r14
    pushq      %r12
    pushq      %rbx
    subq       $0x10, %rsp
    movq       %rcx, %r14
    movq       %rdx, %r15
-   movq       0x80(%rdi), %rbx
-   movq       %rbx, -40(%rbp)
-   testq      %rbx, %rbx
+   movq       0x80(%rdi), %r12
+   movq       %r12, -40(%rbp)
+   testq      %r12, %r12
    je         loc_xxxxx

-   movl       0x10(%rbx), %r12d
-   cmpl       (%rbx), %r12d
+   movl       0x10(%r12), %ebx
+   cmpl       (%r12), %ebx
    jae        loc_xxxxx

-   movq       0x8(%rbx), %rax
-   movq       %r12, %rcx
-   shlq       $0x5, %rcx
-   leaq       aAttr, %rdx                                 ; "Attr"
-   movq       %rdx, (%rax,%rcx)
-   leaq       aSpecified, %rdx                            ; "specified"
-   movq       %rdx, 0x8(%rax,%rcx)
-   leaq       -40(%rbp), %rdx
-   movq       %rdx, 0x10(%rax,%rcx)
-   movl       $0x3a1, 0x1c(%rax,%rcx)
-   leal       0x1(%r12), %eax
-   movl       %eax, 0x10(%rbx)
+   movq       0x8(%r12), %rax
+   shlq       $0x5, %rbx
+   leaq       aAttr, %rcx                                 ; "Attr"
+   movq       %rcx, (%rax,%rbx)
+   leaq       aSpecified, %rcx                            ; "specified"
+   movq       %rcx, 0x8(%rax,%rbx)
+   leaq       -40(%rbp), %rcx
+   movq       %rcx, 0x10(%rax,%rbx)
+   movl       $0x3a1, 0x1c(%rax,%rbx)
+   incl       0x10(%r12)

    movq       %r15, %rdi
    call       __ZNK7mozilla3dom4Attr9SpecifiedEv          ; mozilla::dom::Attr::Specified() const
    movzxl     %al, %eax
    movabsq    $0xfff9000000000000, %rcx
    orq        %rax, %rcx
    movq       %rcx, (%r14)
    movq       -40(%rbp), %rax
@@ -47,11 +45,11 @@
    popq       %rbx
    popq       %r12
    popq       %r14
    popq       %r15
    popq       %rbp
    ret
                         ; endp

-   movq       %rbx, %rdi
+   movq       %r12, %rdi
    call       __ZN14ProfilingStack18ensureCapacitySlowEv  ; ProfilingStack::ensureCapacitySlow()
    jmp        loc_xxxxx

Depends on D9205

Differential Revision: https://phabricator.services.mozilla.com/D9206

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Markus Stange 2018-11-05 20:58:39 +00:00
parent 42d8bddb50
commit ed18047939

View File

@ -394,12 +394,17 @@ class ProfilingStack final
void pushLabelFrame(const char* label, const char* dynamicString, void* sp,
js::ProfilingStackFrame::Category category,
uint32_t flags = 0) {
uint32_t oldStackPointer = stackPointer;
// This thread is the only one that ever changes the value of
// stackPointer.
// Store the value of the atomic in a non-atomic local variable so that
// the compiler won't generate two separate loads from the atomic for
// the size check and the frames[] array indexing operation.
uint32_t stackPointerVal = stackPointer;
if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
if (MOZ_UNLIKELY(stackPointerVal >= capacity)) {
ensureCapacitySlow();
}
frames[oldStackPointer].initLabelFrame(label, dynamicString, sp,
frames[stackPointerVal].initLabelFrame(label, dynamicString, sp,
category, flags);
// This must happen at the end! The compiler will not reorder this
@ -408,9 +413,10 @@ class ProfilingStack final
// Do the read and the write as two separate statements, in order to
// make it clear that we don't need an atomic increment, which would be
// more expensive on x86 than the separate operations done here.
// This thread is the only one that ever changes the value of
// stackPointer.
stackPointer = oldStackPointer + 1;
// However, don't use stackPointerVal here; instead, allow the compiler
// to turn this store into a non-atomic increment instruction which
// takes up less code size.
stackPointer = stackPointer + 1;
}
void pushSpMarkerFrame(void* sp) {
@ -427,6 +433,8 @@ class ProfilingStack final
void pushJsFrame(const char* label, const char* dynamicString, JSScript* script,
jsbytecode* pc) {
// This thread is the only one that ever changes the value of
// stackPointer. Only load the atomic once.
uint32_t oldStackPointer = stackPointer;
if (MOZ_UNLIKELY(oldStackPointer >= capacity)) {
@ -435,7 +443,7 @@ class ProfilingStack final
frames[oldStackPointer].initJsFrame(label, dynamicString, script, pc);
// This must happen at the end, see the comment in pushLabelFrame.
stackPointer = oldStackPointer + 1;
stackPointer = stackPointer + 1;
}
void pop() {