diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 9c68a9ce9ca..6d9bb8c15f7 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -1008,23 +1008,47 @@ void Emitter::emitInstruction(MachineInstr &MI, // If this is a pseudo instruction, lower it. switch (Desc->getOpcode()) { - case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; - case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; - case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; - case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; - case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; - case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; - case X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; - case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; - case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; - case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; - case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; - case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; - case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; - case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; - case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; - case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; - case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; + case X86::ADD16rr_DB: Desc = UpdateOp(MI, II, X86::OR16rr); break; + case X86::ADD32rr_DB: Desc = UpdateOp(MI, II, X86::OR32rr); break; + case X86::ADD64rr_DB: Desc = UpdateOp(MI, II, X86::OR64rr); break; + case X86::ADD16ri_DB: Desc = UpdateOp(MI, II, X86::OR16ri); break; + case X86::ADD32ri_DB: Desc = UpdateOp(MI, II, X86::OR32ri); break; + case X86::ADD64ri32_DB: Desc = UpdateOp(MI, II, X86::OR64ri32); break; + case 
X86::ADD16ri8_DB: Desc = UpdateOp(MI, II, X86::OR16ri8); break; + case X86::ADD32ri8_DB: Desc = UpdateOp(MI, II, X86::OR32ri8); break; + case X86::ADD64ri8_DB: Desc = UpdateOp(MI, II, X86::OR64ri8); break; + case X86::ACQUIRE_MOV8rm: Desc = UpdateOp(MI, II, X86::MOV8rm); break; + case X86::ACQUIRE_MOV16rm: Desc = UpdateOp(MI, II, X86::MOV16rm); break; + case X86::ACQUIRE_MOV32rm: Desc = UpdateOp(MI, II, X86::MOV32rm); break; + case X86::ACQUIRE_MOV64rm: Desc = UpdateOp(MI, II, X86::MOV64rm); break; + case X86::RELEASE_MOV8mr: Desc = UpdateOp(MI, II, X86::MOV8mr); break; + case X86::RELEASE_MOV16mr: Desc = UpdateOp(MI, II, X86::MOV16mr); break; + case X86::RELEASE_MOV32mr: Desc = UpdateOp(MI, II, X86::MOV32mr); break; + case X86::RELEASE_MOV64mr: Desc = UpdateOp(MI, II, X86::MOV64mr); break; + case X86::RELEASE_MOV8mi: Desc = UpdateOp(MI, II, X86::MOV8mi); break; + case X86::RELEASE_MOV16mi: Desc = UpdateOp(MI, II, X86::MOV16mi); break; + case X86::RELEASE_MOV32mi: Desc = UpdateOp(MI, II, X86::MOV32mi); break; + case X86::RELEASE_MOV64mi32: Desc = UpdateOp(MI, II, X86::MOV64mi32); break; + case X86::RELEASE_ADD8mi: Desc = UpdateOp(MI, II, X86::ADD8mi); break; + case X86::RELEASE_ADD32mi: Desc = UpdateOp(MI, II, X86::ADD32mi); break; + case X86::RELEASE_ADD64mi32: Desc = UpdateOp(MI, II, X86::ADD64mi32); break; + case X86::RELEASE_AND8mi: Desc = UpdateOp(MI, II, X86::AND8mi); break; + case X86::RELEASE_AND32mi: Desc = UpdateOp(MI, II, X86::AND32mi); break; + case X86::RELEASE_AND64mi32: Desc = UpdateOp(MI, II, X86::AND64mi32); break; + case X86::RELEASE_OR8mi: Desc = UpdateOp(MI, II, X86::OR8mi); break; + case X86::RELEASE_OR32mi: Desc = UpdateOp(MI, II, X86::OR32mi); break; + case X86::RELEASE_OR64mi32: Desc = UpdateOp(MI, II, X86::OR64mi32); break; + case X86::RELEASE_XOR8mi: Desc = UpdateOp(MI, II, X86::XOR8mi); break; + case X86::RELEASE_XOR32mi: Desc = UpdateOp(MI, II, X86::XOR32mi); break; + case X86::RELEASE_XOR64mi32: Desc = UpdateOp(MI, II, X86::XOR64mi32); 
break; + case X86::RELEASE_INC8m: Desc = UpdateOp(MI, II, X86::INC8m); break; + case X86::RELEASE_INC16m: Desc = UpdateOp(MI, II, X86::INC16m); break; + case X86::RELEASE_INC32m: Desc = UpdateOp(MI, II, X86::INC32m); break; + case X86::RELEASE_INC64m: Desc = UpdateOp(MI, II, X86::INC64m); break; + case X86::RELEASE_DEC8m: Desc = UpdateOp(MI, II, X86::DEC8m); break; + case X86::RELEASE_DEC16m: Desc = UpdateOp(MI, II, X86::DEC16m); break; + case X86::RELEASE_DEC32m: Desc = UpdateOp(MI, II, X86::DEC32m); break; + case X86::RELEASE_DEC64m: Desc = UpdateOp(MI, II, X86::DEC64m); break; } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 4f9bec6b191..7ad8d789787 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -751,18 +751,88 @@ defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add", IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>, TB, LOCK; -def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR8:$dst, (atomic_load_8 addr:$src))]>; -def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR16:$dst, (atomic_load_16 addr:$src))]>; -def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR32:$dst, (atomic_load_32 addr:$src))]>; -def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), - "#ACQUIRE_MOV PSEUDO!", - [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +/* The following multiclass tries to make sure that in code like + * x.store (immediate op x.load(acquire), release) + * an operation directly on memory is generated instead of wasting a register. 
+ * It is not automatic as atomic_store/load are only lowered to MOV instructions + * extremely late to prevent them from being accidentally reordered in the backend + * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions) + */ +multiclass RELEASE_BINOP_MI<string op> { + def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_8 addr:$dst, (!cast<SDNode>(op) + (atomic_load_8 addr:$dst), (i8 imm:$src)))]>; + // NAME#16 is not generated as 16-bit arithmetic instructions are considered + // costly and avoided as far as possible by this backend anyway + def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_32 addr:$dst, (!cast<SDNode>(op) + (atomic_load_32 addr:$dst), (i32 imm:$src)))]>; + def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_BINOP PSEUDO!", + [(atomic_store_64 addr:$dst, (!cast<SDNode>(op) + (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>; +} +defm RELEASE_ADD : RELEASE_BINOP_MI<"add">; +defm RELEASE_AND : RELEASE_BINOP_MI<"and">; +defm RELEASE_OR : RELEASE_BINOP_MI<"or">; +defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">; +// Note: we don't deal with sub, because subtractions of constants are +// optimized into additions before this code can run + +multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { + def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_8 addr:$dst, dag8)]>; + def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_16 addr:$dst, dag16)]>; + def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_32 addr:$dst, dag32)]>; + def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst), + "#RELEASE_UNOP PSEUDO!", + [(atomic_store_64 addr:$dst, dag64)]>; +} + +defm RELEASE_INC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 1)), + (add (atomic_load_16 addr:$dst), (i16 1)), + (add (atomic_load_32 addr:$dst), (i32 1)), +
(add (atomic_load_64 addr:$dst), (i64 1))>; +defm RELEASE_DEC : RELEASE_UNOP< + (add (atomic_load_8 addr:$dst), (i8 -1)), + (add (atomic_load_16 addr:$dst), (i16 -1)), + (add (atomic_load_32 addr:$dst), (i32 -1)), + (add (atomic_load_64 addr:$dst), (i64 -1))>; +/* +TODO: These don't work because the type inference of TableGen fails. +TODO: find a way to fix it. +defm RELEASE_NEG : RELEASE_UNOP< + (ineg (atomic_load_8 addr:$dst)), + (ineg (atomic_load_16 addr:$dst)), + (ineg (atomic_load_32 addr:$dst)), + (ineg (atomic_load_64 addr:$dst))>; +defm RELEASE_NOT : RELEASE_UNOP< + (not (atomic_load_8 addr:$dst)), + (not (atomic_load_16 addr:$dst)), + (not (atomic_load_32 addr:$dst)), + (not (atomic_load_64 addr:$dst))>; +*/ + +def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; +def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; +def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; +def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), + "#RELEASE_MOV PSEUDO !", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), "#RELEASE_MOV PSEUDO!", @@ -777,11 +847,22 @@ def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src), "#RELEASE_MOV PSEUDO!", [(atomic_store_64 addr:$dst, GR64:$src)]>; +def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR8:$dst, (atomic_load_8 addr:$src))]>; +def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR16:$dst, (atomic_load_16 addr:$src))]>; +def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), + "#ACQUIRE_MOV PSEUDO!", + 
[(set GR32:$dst, (atomic_load_32 addr:$src))]>; +def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), + "#ACQUIRE_MOV PSEUDO!", + [(set GR64:$dst, (atomic_load_64 addr:$src))]>; //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions. //===----------------------------------------------------------------------===// - // CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after // instruction selection into a branch sequence. let Uses = [EFLAGS], usesCustomInserter = 1 in { diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 892396bc4ed..ded84fc28f1 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -583,14 +583,38 @@ ReSimplify: // Atomic load and store require a separate pseudo-inst because Acquire // implies mayStore and Release implies mayLoad; fix these to regular MOV // instructions here - case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; - case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; - case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; - case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; - case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; - case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; - case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; - case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify; + case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify; + case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify; + case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify; + case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify; + case X86::RELEASE_MOV16mr: 
OutMI.setOpcode(X86::MOV16mr); goto ReSimplify; + case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify; + case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify; + case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify; + case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify; + case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify; + case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify; + case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify; + case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify; + case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify; + case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify; + case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify; + case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify; + case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify; + case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify; + case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify; + case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify; + case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify; + case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify; + case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify; + case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify; + case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify; + case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify; + case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify; + case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify; + case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify; + case X86::RELEASE_DEC64m: 
OutMI.setOpcode(X86::DEC64m); goto ReSimplify; // We don't currently select the correct instruction form for instructions // which have a short %eax, etc. form. Handle this by custom lowering, for diff --git a/test/CodeGen/X86/atomic_mi.ll b/test/CodeGen/X86/atomic_mi.ll new file mode 100644 index 00000000000..36d62375524 --- /dev/null +++ b/test/CodeGen/X86/atomic_mi.ll @@ -0,0 +1,502 @@ +; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s --check-prefix X64 +; RUN: llc < %s -march=x86 -verify-machineinstrs | FileCheck %s --check-prefix X32 + +; This file checks that atomic (non-seq_cst) stores of immediate values are +; done in one mov instruction and not 2. More precisely, it makes sure that the +; immediate is not first copied uselessly into a register. + +; Similarly, it checks that a binary operation of an immediate with an atomic +; variable that is stored back in that variable is done as a single instruction. +; For example: x.store(42 + x.load(memory_order_acquire), memory_order_release) +; should be just an add instruction, instead of loading x into a register, doing +; an add and storing the result back. +; The binary operations supported are currently add, and, or, xor. +; sub is not supported because it is translated into an addition of the +; negated immediate. +; Finally, we also check the same kind of pattern for inc/dec. + +; seq_cst stores are left as (lock) xchgl, but we try to check every other +; attribute at least once. + +; Please note that these operations do not require the lock prefix: only +; sequentially consistent stores require this kind of protection on X86. +; And even for seq_cst operations, llvm uses the xchg instruction which has +; an implicit lock prefix, so making it explicit is not required.
+ +define void @store_atomic_imm_8(i8* %p) { +; X64-LABEL: store_atomic_imm_8 +; X64: movb +; X64-NOT: movb +; X32-LABEL: store_atomic_imm_8 +; X32: movb +; X32-NOT: movb + store atomic i8 42, i8* %p release, align 1 + ret void +} + +define void @store_atomic_imm_16(i16* %p) { +; X64-LABEL: store_atomic_imm_16 +; X64: movw +; X64-NOT: movw +; X32-LABEL: store_atomic_imm_16 +; X32: movw +; X32-NOT: movw + store atomic i16 42, i16* %p monotonic, align 2 + ret void +} + +define void @store_atomic_imm_32(i32* %p) { +; X64-LABEL: store_atomic_imm_32 +; X64: movl +; X64-NOT: movl +; On 32 bits, there is an extra movl for each of those functions +; (it loads the pointer argument from the stack). +; X32-LABEL: store_atomic_imm_32 +; X32: movl 4(%esp), %eax +; X32: movl +; X32-NOT: movl + store atomic i32 42, i32* %p release, align 4 + ret void +} + +define void @store_atomic_imm_64(i64* %p) { +; X64-LABEL: store_atomic_imm_64 +; X64: movq +; X64-NOT: movq +; These are implemented with a CAS loop on 32 bit architectures, and thus +; cannot be optimized in the same way as the others. +; X32-LABEL: store_atomic_imm_64 +; X32: cmpxchg8b + store atomic i64 42, i64* %p release, align 8 + ret void +} + +; If an immediate is too big to fit in 32 bits, it cannot be stored in one mov, +; even on X64, one must use movabsq that can only target a register.
+define void @store_atomic_imm_64_big(i64* %p) { +; X64-LABEL: store_atomic_imm_64_big +; X64: movabsq +; X64: movq + store atomic i64 100000000000, i64* %p monotonic, align 8 + ret void +} + +; It would be incorrect to replace a lock xchgl by a movl +define void @store_atomic_imm_32_seq_cst(i32* %p) { +; X64-LABEL: store_atomic_imm_32_seq_cst +; X64: xchgl +; X32-LABEL: store_atomic_imm_32_seq_cst +; X32: xchgl + store atomic i32 42, i32* %p seq_cst, align 4 + ret void +} + +; ----- ADD ----- + +define void @add_8(i8* %p) { +; X64-LABEL: add_8 +; X64-NOT: lock +; X64: addb +; X64-NOT: movb +; X32-LABEL: add_8 +; X32-NOT: lock +; X32: addb +; X32-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = add i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @add_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treats 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: add_16 +; X64-NOT: addw +; X32-LABEL: add_16 +; X32-NOT: addw + %1 = load atomic i16* %p acquire, align 2 + %2 = add i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @add_32(i32* %p) { +; X64-LABEL: add_32 +; X64-NOT: lock +; X64: addl +; X64-NOT: movl +; X32-LABEL: add_32 +; X32-NOT: lock +; X32: addl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = add i32 %1, 2 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @add_64(i64* %p) { +; X64-LABEL: add_64 +; X64-NOT: lock +; X64: addq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'addq'.
+; X32-LABEL: add_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = add i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @add_32_seq_cst(i32* %p) { +; X64-LABEL: add_32_seq_cst +; X64: xchgl +; X32-LABEL: add_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = add i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- AND ----- + +define void @and_8(i8* %p) { +; X64-LABEL: and_8 +; X64-NOT: lock +; X64: andb +; X64-NOT: movb +; X32-LABEL: and_8 +; X32-NOT: lock +; X32: andb +; X32-NOT: movb + %1 = load atomic i8* %p monotonic, align 1 + %2 = and i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @and_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treats 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: and_16 +; X64-NOT: andw +; X32-LABEL: and_16 +; X32-NOT: andw + %1 = load atomic i16* %p acquire, align 2 + %2 = and i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @and_32(i32* %p) { +; X64-LABEL: and_32 +; X64-NOT: lock +; X64: andl +; X64-NOT: movl +; X32-LABEL: and_32 +; X32-NOT: lock +; X32: andl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = and i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @and_64(i64* %p) { +; X64-LABEL: and_64 +; X64-NOT: lock +; X64: andq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'andq'.
+; X32-LABEL: and_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = and i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @and_32_seq_cst(i32* %p) { +; X64-LABEL: and_32_seq_cst +; X64: xchgl +; X32-LABEL: and_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = and i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- OR ----- + +define void @or_8(i8* %p) { +; X64-LABEL: or_8 +; X64-NOT: lock +; X64: orb +; X64-NOT: movb +; X32-LABEL: or_8 +; X32-NOT: lock +; X32: orb +; X32-NOT: movb + %1 = load atomic i8* %p acquire, align 1 + %2 = or i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @or_16(i16* %p) { +; X64-LABEL: or_16 +; X64-NOT: orw +; X32-LABEL: or_16 +; X32-NOT: orw + %1 = load atomic i16* %p acquire, align 2 + %2 = or i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @or_32(i32* %p) { +; X64-LABEL: or_32 +; X64-NOT: lock +; X64: orl +; X64-NOT: movl +; X32-LABEL: or_32 +; X32-NOT: lock +; X32: orl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = or i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @or_64(i64* %p) { +; X64-LABEL: or_64 +; X64-NOT: lock +; X64: orq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'orq'. 
+; X32-LABEL: or_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = or i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @or_32_seq_cst(i32* %p) { +; X64-LABEL: or_32_seq_cst +; X64: xchgl +; X32-LABEL: or_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = or i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- XOR ----- + +define void @xor_8(i8* %p) { +; X64-LABEL: xor_8 +; X64-NOT: lock +; X64: xorb +; X64-NOT: movb +; X32-LABEL: xor_8 +; X32-NOT: lock +; X32: xorb +; X32-NOT: movb + %1 = load atomic i8* %p acquire, align 1 + %2 = xor i8 %1, 2 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @xor_16(i16* %p) { +; X64-LABEL: xor_16 +; X64-NOT: xorw +; X32-LABEL: xor_16 +; X32-NOT: xorw + %1 = load atomic i16* %p acquire, align 2 + %2 = xor i16 %1, 2 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @xor_32(i32* %p) { +; X64-LABEL: xor_32 +; X64-NOT: lock +; X64: xorl +; X64-NOT: movl +; X32-LABEL: xor_32 +; X32-NOT: lock +; X32: xorl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = xor i32 %1, 2 + store atomic i32 %2, i32* %p release, align 4 + ret void +} + +define void @xor_64(i64* %p) { +; X64-LABEL: xor_64 +; X64-NOT: lock +; X64: xorq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'xorq'. 
+; X32-LABEL: xor_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = xor i64 %1, 2 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @xor_32_seq_cst(i32* %p) { +; X64-LABEL: xor_32_seq_cst +; X64: xchgl +; X32-LABEL: xor_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = xor i32 %1, 2 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- INC ----- + +define void @inc_8(i8* %p) { +; X64-LABEL: inc_8 +; X64-NOT: lock +; X64: incb +; X64-NOT: movb +; X32-LABEL: inc_8 +; X32-NOT: lock +; X32: incb +; X32-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = add i8 %1, 1 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @inc_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treats 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: inc_16 +; X64-NOT: incw +; X32-LABEL: inc_16 +; X32-NOT: incw + %1 = load atomic i16* %p acquire, align 2 + %2 = add i16 %1, 1 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @inc_32(i32* %p) { +; X64-LABEL: inc_32 +; X64-NOT: lock +; X64: incl +; X64-NOT: movl +; X32-LABEL: inc_32 +; X32-NOT: lock +; X32: incl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = add i32 %1, 1 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @inc_64(i64* %p) { +; X64-LABEL: inc_64 +; X64-NOT: lock +; X64: incq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'incq'.
+; X32-LABEL: inc_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = add i64 %1, 1 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @inc_32_seq_cst(i32* %p) { +; X64-LABEL: inc_32_seq_cst +; X64: xchgl +; X32-LABEL: inc_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = add i32 %1, 1 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +} + +; ----- DEC ----- + +define void @dec_8(i8* %p) { +; X64-LABEL: dec_8 +; X64-NOT: lock +; X64: decb +; X64-NOT: movb +; X32-LABEL: dec_8 +; X32-NOT: lock +; X32: decb +; X32-NOT: movb + %1 = load atomic i8* %p seq_cst, align 1 + %2 = sub i8 %1, 1 + store atomic i8 %2, i8* %p release, align 1 + ret void +} + +define void @dec_16(i16* %p) { +; Currently the transformation is not done on 16 bit accesses, as the backend +; treats 16 bit arithmetic as expensive on X86/X86_64. +; X64-LABEL: dec_16 +; X64-NOT: decw +; X32-LABEL: dec_16 +; X32-NOT: decw + %1 = load atomic i16* %p acquire, align 2 + %2 = sub i16 %1, 1 + store atomic i16 %2, i16* %p release, align 2 + ret void +} + +define void @dec_32(i32* %p) { +; X64-LABEL: dec_32 +; X64-NOT: lock +; X64: decl +; X64-NOT: movl +; X32-LABEL: dec_32 +; X32-NOT: lock +; X32: decl +; X32-NOT: movl + %1 = load atomic i32* %p acquire, align 4 + %2 = sub i32 %1, 1 + store atomic i32 %2, i32* %p monotonic, align 4 + ret void +} + +define void @dec_64(i64* %p) { +; X64-LABEL: dec_64 +; X64-NOT: lock +; X64: decq +; X64-NOT: movq +; We do not check X86-32 as it cannot do 'decq'. +; X32-LABEL: dec_64 + %1 = load atomic i64* %p acquire, align 8 + %2 = sub i64 %1, 1 + store atomic i64 %2, i64* %p release, align 8 + ret void +} + +define void @dec_32_seq_cst(i32* %p) { +; X64-LABEL: dec_32_seq_cst +; X64: xchgl +; X32-LABEL: dec_32_seq_cst +; X32: xchgl + %1 = load atomic i32* %p monotonic, align 4 + %2 = sub i32 %1, 1 + store atomic i32 %2, i32* %p seq_cst, align 4 + ret void +}