; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s

; One dimensional loop with load that can be hoisted outside of loop
;   for (int i = 0; i < N; ++i)
;     if (!memcmp(a[i], b, 4))
;       sum += 1;
;
define i64 @one_dimensional(ptr %a, ptr %b, i64 %N) {
; CHECK-LABEL: one_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB0_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x10, [x0], #8
; CHECK-NEXT:    ldr w10, [x10]
; CHECK-NEXT:    cmp w10, w9
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x2, x2, #1
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Returns the number of iterations i in [0, N) for which the 4 bytes at a[i]
; equal the 4 bytes at %b.
; In the expected assembly above, the 32-bit load of %b (ldr w9, [x1]) sits in
; the entry block ahead of the loop, and no call to bcmp remains.
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  ; %i.06 is the induction variable; %sum.05 is the running match count.
  %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  ; Load the pointer a[i]; this address changes every iteration.
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
  %0 = load ptr, ptr %arrayidx, align 8
  ; Constant-length (4 byte) comparison whose second operand, %b, does not
  ; change inside the loop.
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
  ; Add 1 to the count when the buffers are equal (bcmp returned 0).
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.05, %add
  ; The exit test runs after the increment, so the body executes at least once.
  %inc = add nuw i64 %i.06, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                 ; preds = %for.body
  ret i64 %spec.select
}

; Same but loop is two dimensional. Load is hoisted outside of both loops
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       if (!memcmp(a[i][j], b, 4))
;         sum += 1;
;
define i64 @two_dimensional(ptr %a, ptr %b, i64 %N, i64 %M) {
; CHECK-LABEL: two_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB1_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB1_2 Depth 2
; CHECK-NEXT:    ldr x11, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x12, x3
; CHECK-NEXT:  .LBB1_2: // %for.body4
; CHECK-NEXT:    // Parent Loop BB1_1 Depth=1
; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
; CHECK-NEXT:    ldr x13, [x11], #8
; CHECK-NEXT:    ldr w13, [x13]
; CHECK-NEXT:    cmp w13, w10
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x12, x12, #1
; CHECK-NEXT:    b.ne .LBB1_2
; CHECK-NEXT:  // %bb.3: // %for.cond1.for.exit3_crit_edge
; CHECK-NEXT:    // in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB1_1
; CHECK-NEXT:  // %bb.4: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Returns the number of (i, j) pairs, i in [0, N) and j in [0, M), for which
; the 4 bytes at a[i][j] equal the 4 bytes at %b.
; In the expected assembly above, the 32-bit load of %b (ldr w10, [x1]) is in
; the entry block, i.e. outside both the depth-1 and the depth-2 loop.
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                           ; preds = %entry, %for.cond1.for.exit3_crit_edge
  ; Outer loop header: %i.019 is the outer index, %sum.018 the count so far.
  %i.019 = phi i64 [ %inc7, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
  %sum.018 = phi i64 [ %spec.select, %for.cond1.for.exit3_crit_edge ], [ 0, %entry ]
  ; Row pointer a[i], loaded once per outer iteration.
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.019
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.body4

for.body4:                                     ; preds = %for.cond1.preheader, %for.body4
  ; Inner loop: %j.016 is the inner index, %sum.115 the running count.
  %j.016 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
  %sum.115 = phi i64 [ %sum.018, %for.cond1.preheader ], [ %spec.select, %for.body4 ]
  ; Element pointer a[i][j].
  %arrayidx5 = getelementptr inbounds ptr, ptr %0, i64 %j.016
  %1 = load ptr, ptr %arrayidx5, align 8
  ; 4-byte comparison against %b, which is unchanged in both loops.
  %bcmp = tail call i32 @bcmp(ptr %1, ptr %b, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.115, %add
  %inc = add nuw i64 %j.016, 1
  %exitcond = icmp eq i64 %inc, %M
  br i1 %exitcond, label %for.cond1.for.exit3_crit_edge, label %for.body4

for.cond1.for.exit3_crit_edge:         ; preds = %for.body4
  ; Outer loop latch: advance i and test against %N.
  %inc7 = add nuw i64 %i.019, 1
  %exitcond22 = icmp eq i64 %inc7, %N
  br i1 %exitcond22, label %for.exit, label %for.cond1.preheader

for.exit:                                 ; preds = %for.cond1.for.exit3_crit_edge
  ret i64 %spec.select
}

; Same but loop is three dimensional. Load is hoisted outside of all three loops
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       for (int k = 0; k < K; ++k)
;         if (!memcmp(a[i][j][k], b, 4))
;           sum += 1;
;
define i64 @three_dimensional(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: three_dimensional:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB2_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB2_2 Depth 2
; CHECK-NEXT:    // Child Loop BB2_3 Depth 3
; CHECK-NEXT:    ldr x11, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x12, xzr
; CHECK-NEXT:  .LBB2_2: // %for.cond5.preheader
; CHECK-NEXT:    // Parent Loop BB2_1 Depth=1
; CHECK-NEXT:    // => This Loop Header: Depth=2
; CHECK-NEXT:    // Child Loop BB2_3 Depth 3
; CHECK-NEXT:    ldr x13, [x11, x12, lsl #3]
; CHECK-NEXT:    mov x14, x4
; CHECK-NEXT:  .LBB2_3: // %for.body8
; CHECK-NEXT:    // Parent Loop BB2_1 Depth=1
; CHECK-NEXT:    // Parent Loop BB2_2 Depth=2
; CHECK-NEXT:    // => This Inner Loop Header: Depth=3
; CHECK-NEXT:    ldr x15, [x13], #8
; CHECK-NEXT:    ldr w15, [x15]
; CHECK-NEXT:    cmp w15, w10
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x14, x14, #1
; CHECK-NEXT:    b.ne .LBB2_3
; CHECK-NEXT:  // %bb.4: // %for.cond5.for.cond
; CHECK-NEXT:    // in Loop: Header=BB2_2 Depth=2
; CHECK-NEXT:    add x12, x12, #1
; CHECK-NEXT:    cmp x12, x3
; CHECK-NEXT:    b.ne .LBB2_2
; CHECK-NEXT:  // %bb.5: // %for.cond1.for.cond
; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB2_1
; CHECK-NEXT:  // %bb.6: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Returns the number of (i, j, k) triples, with i in [0, N), j in [0, M) and
; k in [0, K), for which the 4 bytes at a[i][j][k] equal the 4 bytes at %b.
; In the expected assembly above, the 32-bit load of %b (ldr w10, [x1]) is in
; the entry block, i.e. outside all three loops.
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
  ; Outermost loop header (index %i.033); loads the plane pointer a[i].
  %i.033 = phi i64 [ %inc15, %for.cond1.for.cond ], [ 0, %entry ]
  %sum.032 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.033
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.cond5.preheader

for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
  ; Middle loop header (index %j.029); loads the row pointer a[i][j].
  %j.029 = phi i64 [ 0, %for.cond1.preheader ], [ %inc12, %for.cond5.for.cond ]
  %sum.128 = phi i64 [ %sum.032, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
  %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.029
  %1 = load ptr, ptr %arrayidx9, align 8
  br label %for.body8

for.body8:                               ; preds = %for.body8, %for.cond5.preheader
  ; Innermost loop (index %k.026); %sum.225 is the running match count.
  %k.026 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
  %sum.225 = phi i64 [ %sum.128, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
  ; Element pointer a[i][j][k].
  %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.026
  %2 = load ptr, ptr %arrayidx10, align 8
  ; 4-byte comparison against %b, which is unchanged in all three loops.
  %bcmp = tail call i32 @bcmp(ptr %2, ptr %b, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.225, %add
  %inc = add nuw i64 %k.026, 1
  %exitcond = icmp eq i64 %inc, %K
  br i1 %exitcond, label %for.cond5.for.cond, label %for.body8

for.cond5.for.cond:   ; preds = %for.body8
  ; Middle loop latch: advance j and test against %M.
  %inc12 = add nuw i64 %j.029, 1
  %exitcond44 = icmp eq i64 %inc12, %M
  br i1 %exitcond44, label %for.cond1.for.cond, label %for.cond5.preheader

for.cond1.for.cond: ; preds = %for.cond5.for.cond
  ; Outermost loop latch: advance i and test against %N.
  %inc15 = add nuw i64 %i.033, 1
  %exitcond45 = icmp eq i64 %inc15, %N
  br i1 %exitcond45, label %for.exit, label %for.cond1.preheader

for.exit:                                 ; preds = %for.cond1.for.cond
  ret i64 %spec.select
}

; Three dimensional loop but `b[j]` is invariant only relative to the innermost
; loop. Make sure that the load is hoisted only out of the innermost loop
;   for (int i = 0; i < N; ++i)
;     for (int j = 0; j < M; ++j)
;       for (int k = 0; k < K; ++k)
;         if (!memcmp(a[i][j][k], b[j], 4))
;           sum += 1;
;
define i64 @three_dimensional_middle(ptr %a, ptr %b, i64 %N, i64 %M, i64 %K) {
; CHECK-LABEL: three_dimensional_middle:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB3_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB3_2 Depth 2
; CHECK-NEXT:    // Child Loop BB3_3 Depth 3
; CHECK-NEXT:    ldr x10, [x0, x9, lsl #3]
; CHECK-NEXT:    mov x11, xzr
; CHECK-NEXT:  .LBB3_2: // %for.cond5.preheader
; CHECK-NEXT:    // Parent Loop BB3_1 Depth=1
; CHECK-NEXT:    // => This Loop Header: Depth=2
; CHECK-NEXT:    // Child Loop BB3_3 Depth 3
; CHECK-NEXT:    ldr x13, [x1, x11, lsl #3]
; CHECK-NEXT:    ldr x12, [x10, x11, lsl #3]
; CHECK-NEXT:    mov x14, x4
; CHECK-NEXT:    ldr w13, [x13]
; CHECK-NEXT:  .LBB3_3: // %for.body8
; CHECK-NEXT:    // Parent Loop BB3_1 Depth=1
; CHECK-NEXT:    // Parent Loop BB3_2 Depth=2
; CHECK-NEXT:    // => This Inner Loop Header: Depth=3
; CHECK-NEXT:    ldr x15, [x12], #8
; CHECK-NEXT:    ldr w15, [x15]
; CHECK-NEXT:    cmp w15, w13
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x14, x14, #1
; CHECK-NEXT:    b.ne .LBB3_3
; CHECK-NEXT:  // %bb.4: // %for.cond5.for.cond
; CHECK-NEXT:    // in Loop: Header=BB3_2 Depth=2
; CHECK-NEXT:    add x11, x11, #1
; CHECK-NEXT:    cmp x11, x3
; CHECK-NEXT:    b.ne .LBB3_2
; CHECK-NEXT:  // %bb.5: // %for.cond1.for.cond
; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    cmp x9, x2
; CHECK-NEXT:    b.ne .LBB3_1
; CHECK-NEXT:  // %bb.6: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Returns the number of (i, j, k) triples, with i in [0, N), j in [0, M) and
; k in [0, K), for which the 4 bytes at a[i][j][k] equal the 4 bytes at b[j].
; The right-hand buffer b[j] depends on the middle index. In the expected
; assembly above, the 32-bit load through it (ldr w13, [x13]) is in the
; depth-2 loop header, ahead of the depth-3 loop, and not in the entry block.
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                        ; preds = %entry, %for.cond1.for.cond
  ; Outermost loop header (index %i.035); loads the plane pointer a[i].
  %i.035 = phi i64 [ %inc16, %for.cond1.for.cond ], [ 0, %entry ]
  %sum.034 = phi i64 [ %spec.select, %for.cond1.for.cond ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.035
  %0 = load ptr, ptr %arrayidx, align 8
  br label %for.cond5.preheader

for.cond5.preheader:                     ; preds = %for.cond5.for.cond, %for.cond1.preheader
  ; Middle loop header (index %j.031).
  %j.031 = phi i64 [ 0, %for.cond1.preheader ], [ %inc13, %for.cond5.for.cond ]
  %sum.130 = phi i64 [ %sum.034, %for.cond1.preheader ], [ %spec.select, %for.cond5.for.cond ]
  ; Row pointer a[i][j].
  %arrayidx9 = getelementptr inbounds ptr, ptr %0, i64 %j.031
  %1 = load ptr, ptr %arrayidx9, align 8
  ; Comparison buffer b[j]: a new pointer on every middle-loop iteration,
  ; fixed for the duration of the innermost loop.
  %arrayidx11 = getelementptr inbounds ptr, ptr %b, i64 %j.031
  %2 = load ptr, ptr %arrayidx11, align 8
  br label %for.body8

for.body8:                               ; preds = %for.body8, %for.cond5.preheader
  ; Innermost loop (index %k.028); %sum.227 is the running match count.
  %k.028 = phi i64 [ 0, %for.cond5.preheader ], [ %inc, %for.body8 ]
  %sum.227 = phi i64 [ %sum.130, %for.cond5.preheader ], [ %spec.select, %for.body8 ]
  ; Element pointer a[i][j][k].
  %arrayidx10 = getelementptr inbounds ptr, ptr %1, i64 %k.028
  %3 = load ptr, ptr %arrayidx10, align 8
  ; 4-byte comparison of a[i][j][k] against b[j].
  %bcmp = tail call i32 @bcmp(ptr %3, ptr %2, i64 4)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.227, %add
  %inc = add nuw i64 %k.028, 1
  %exitcond = icmp eq i64 %inc, %K
  br i1 %exitcond, label %for.cond5.for.cond, label %for.body8

for.cond5.for.cond:   ; preds = %for.body8
  ; Middle loop latch: advance j and test against %M.
  %inc13 = add nuw i64 %j.031, 1
  %exitcond46 = icmp eq i64 %inc13, %M
  br i1 %exitcond46, label %for.cond1.for.cond, label %for.cond5.preheader

for.cond1.for.cond: ; preds = %for.cond5.for.cond
  ; Outermost loop latch: advance i and test against %N.
  %inc16 = add nuw i64 %i.035, 1
  %exitcond47 = icmp eq i64 %inc16, %N
  br i1 %exitcond47, label %for.exit, label %for.cond1.preheader

for.exit:                                 ; preds = %for.cond1.for.cond
  ret i64 %spec.select
}

; Make sure that store inside loop prevents hoisting invariant loads
;   for (int i = 0; i < N; ++i)
;     c[i] = memcmp(a[i], b, 4);
;
define void @one_dimensional_with_store(ptr %a, ptr %b, ptr %c, i32 %N) {
; CHECK-LABEL: one_dimensional_with_store:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w8, w3
; CHECK-NEXT:  .LBB4_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x9, [x0], #8
; CHECK-NEXT:    ldr w10, [x1]
; CHECK-NEXT:    ldr w9, [x9]
; CHECK-NEXT:    rev w10, w10
; CHECK-NEXT:    rev w9, w9
; CHECK-NEXT:    cmp w9, w10
; CHECK-NEXT:    cset w9, hi
; CHECK-NEXT:    cset w10, lo
; CHECK-NEXT:    subs x8, x8, #1
; CHECK-NEXT:    sub w9, w9, w10
; CHECK-NEXT:    strb w9, [x2], #1
; CHECK-NEXT:    b.ne .LBB4_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    ret
;
; For each i in [0, zext(N)), stores the low 8 bits of memcmp(a[i], b, 4) to
; c[i].
; In the expected assembly above, the 32-bit load of %b (ldr w10, [x1]) stays
; inside the loop body, alongside the byte store to %c.
; NOTE(review): presumably the load stays because the store through %c may
; alias %b -- confirm against the hoisting pass.
entry:
  br label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Widen the 32-bit trip count once so the loop can use a 64-bit index.
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  ; Load the pointer a[i].
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
  %0 = load ptr, ptr %arrayidx, align 8
  ; This test calls memcmp rather than bcmp: the result is stored instead of
  ; being compared with zero.
  %call = tail call i32 @memcmp(ptr %0, ptr %b, i64 4)
  %conv = trunc i32 %call to i8
  ; Store into c[i]; this is the memory write the test is about.
  %arrayidx2 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv
  store i8 %conv, ptr %arrayidx2, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.exit, label %for.body

for.exit:                                 ; preds = %for.body
  ret void
}

; Make sure that call inside loop prevents hoisting invariant loads
;   for (int i = 0; i < N; ++i) {
;     if (!memcmp(a[i], b, 4))
;       sum += 1;
;     func();
;   }
;
define i32 @one_dimensional_with_call(ptr %a, ptr %b, i32 %N) {
; CHECK-LABEL: one_dimensional_with_call:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-NEXT:    stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w19, -8
; CHECK-NEXT:    .cfi_offset w20, -16
; CHECK-NEXT:    .cfi_offset w21, -24
; CHECK-NEXT:    .cfi_offset w22, -32
; CHECK-NEXT:    .cfi_offset w30, -48
; CHECK-NEXT:    mov x19, x1
; CHECK-NEXT:    mov x21, x0
; CHECK-NEXT:    mov w20, wzr
; CHECK-NEXT:    mov w22, w2
; CHECK-NEXT:  .LBB5_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x8, [x21], #8
; CHECK-NEXT:    ldr w9, [x19]
; CHECK-NEXT:    ldr w8, [x8]
; CHECK-NEXT:    cmp w8, w9
; CHECK-NEXT:    cinc w20, w20, eq
; CHECK-NEXT:    bl func
; CHECK-NEXT:    subs x22, x22, #1
; CHECK-NEXT:    b.ne .LBB5_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov w0, w20
; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT:    ret
;
; Returns the number of iterations i in [0, zext(N)) for which the 4 bytes at
; a[i] equal the 4 bytes at %b, calling @func once per iteration.
; In the expected assembly above, the 32-bit load of %b (ldr w9, [x19]) stays
; inside the loop body, in the same block as the call (bl func).
entry:
  br label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Widen the 32-bit trip count once so the loop can use a 64-bit index.
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; %indvars.iv is the loop index; %sum.05 is the running match count.
  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %sum.05 = phi i32 [ 0, %for.body.preheader ], [ %spec.select, %for.body ]
  ; Load the pointer a[i].
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %indvars.iv
  %0 = load ptr, ptr %arrayidx, align 8
  ; 4-byte comparison against %b; add 1 to the count on equality.
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 4)
  %tobool.not = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool.not to i32
  %spec.select = add nuw nsw i32 %sum.05, %add
  ; Call to an external function with no visible body; this is the call the
  ; test is about.
  tail call void @func()
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.exit, label %for.body

for.exit:                                 ; preds = %for.body
  ret i32 %spec.select
}

; One dimensional loop with memcmp size equal six.
; The test shows that several loads can be hoisted at the same time.
;   for (int i = 0; i < N; ++i)
;     if (!memcmp(a[i], b, 6))
;       sum += 1;
;
define i64 @one_dimensional_two_loads(ptr %a, ptr %b, i64 %N) {
; CHECK-LABEL: one_dimensional_two_loads:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w9, [x1]
; CHECK-NEXT:    ldrh w10, [x1, #4]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:  .LBB6_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x11, [x0], #8
; CHECK-NEXT:    ldr w12, [x11]
; CHECK-NEXT:    ldrh w11, [x11, #4]
; CHECK-NEXT:    cmp w12, w9
; CHECK-NEXT:    ccmp w11, w10, #0, eq
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x2, x2, #1
; CHECK-NEXT:    b.ne .LBB6_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Returns the number of iterations i in [0, N) for which the 6 bytes at a[i]
; equal the 6 bytes at %b.
; In the expected assembly above, the 6-byte comparison reads %b with a 32-bit
; load (ldr w9, [x1]) and a 16-bit load at offset 4 (ldrh w10, [x1, #4]); both
; are in the entry block, ahead of the loop.
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  ; %i.06 is the induction variable; %sum.05 is the running match count.
  %i.06 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum.05 = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  ; Load the pointer a[i].
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %i.06
  %0 = load ptr, ptr %arrayidx, align 8
  ; Constant length 6 rather than 4, so one 32-bit load per buffer does not
  ; cover the comparison.
  %bcmp = tail call i32 @bcmp(ptr %0, ptr %b, i64 6)
  %tobool = icmp eq i32 %bcmp, 0
  %add = zext i1 %tobool to i64
  %spec.select = add i64 %sum.05, %add
  %inc = add nuw i64 %i.06, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                 ; preds = %for.body
  ret i64 %spec.select
}

; See issue https://github.com/llvm/llvm-project/issues/72855
;
; When hoisting an instruction out of the loop, ensure that loads are not
; removed by common subexpression elimination. In this example pointer %c may
; alias pointer %b, so when hoisting the `%y = load i64, ptr %b` instruction we
; can't replace it with `%b.val = load i64, ptr %b`
;
define i64 @hoisting_no_cse(ptr %a, ptr %b, ptr %c, i64 %N) {
; CHECK-LABEL: hoisting_no_cse:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr x8, [x1]
; CHECK-NEXT:    add x8, x8, #1
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    ldr x9, [x1]
; CHECK-NEXT:  .LBB7_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr x10, [x0], #8
; CHECK-NEXT:    ldr x10, [x10]
; CHECK-NEXT:    cmp x10, x9
; CHECK-NEXT:    cinc x8, x8, eq
; CHECK-NEXT:    subs x3, x3, #1
; CHECK-NEXT:    b.ne .LBB7_1
; CHECK-NEXT:  // %bb.2: // %for.exit
; CHECK-NEXT:    mov x0, x8
; CHECK-NEXT:    ret
;
; Writes *%b + 1 to *%c, then returns the number of iterations i in [0, N) for
; which the i64 at a[i] equals the i64 at %b.
; In the expected assembly above, the entry block has two separate loads of %b:
; one before the store to %c (ldr x8, [x1]) and one after it (ldr x9, [x1]).
; The second is the load taken out of the loop; it is not replaced by the value
; read before the store.
entry:
  ; Read *%b, then write a different value through %c, which is not known to
  ; be distinct from %b.
  %b.val = load i64, ptr %b
  %b.val.changed = add i64 %b.val, 1
  store i64 %b.val.changed, ptr %c
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  ; %idx is the induction variable; %sum is the running match count.
  %idx = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %sum = phi i64 [ %spec.select, %for.body ], [ 0, %entry ]
  ; Load the pointer a[i], then the i64 it points to.
  %arrayidx = getelementptr inbounds ptr, ptr %a, i64 %idx
  %0 = load ptr, ptr %arrayidx, align 8
  %x = load i64, ptr %0
  ; Same address as %b.val in the entry block, but read after the store to %c.
  %y = load i64, ptr %b
  %cmp = icmp eq i64 %x, %y
  %add = zext i1 %cmp to i64
  %spec.select = add i64 %sum, %add
  %inc = add nuw i64 %idx, 1
  %exitcond = icmp eq i64 %inc, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:                                 ; preds = %for.body
  ret i64 %spec.select
}

; External i32 global; @load_between_memory_barriers loads it inside its loop.
@a = external local_unnamed_addr global i32, align 4

; Make sure the load is not hoisted out of the loop across memory barriers.
define i32 @load_between_memory_barriers() {
; CHECK-LABEL: load_between_memory_barriers:
; CHECK:       // %bb.0:
; CHECK-NEXT:    adrp x8, :got:a
; CHECK-NEXT:    ldr x8, [x8, :got_lo12:a]
; CHECK-NEXT:  .LBB8_1: // %loop
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    //MEMBARRIER
; CHECK-NEXT:    ldr w0, [x8]
; CHECK-NEXT:    //MEMBARRIER
; CHECK-NEXT:    cbz w0, .LBB8_1
; CHECK-NEXT:  // %bb.2: // %exit
; CHECK-NEXT:    ret
;
; Re-reads the global @a until it is non-zero and returns that value.
; In the expected assembly above, only the address of @a (adrp + GOT load) is
; computed ahead of the loop. The load of the value (ldr w0, [x8]) stays inside
; the loop, between the two MEMBARRIER markers.
  ; Unnamed entry block: falls straight into the loop.
  br label %loop

loop:
  ; The load is bracketed by two single-thread acq_rel fences.
  fence syncscope("singlethread") acq_rel
  %l = load i32, ptr @a, align 4
  fence syncscope("singlethread") acq_rel
  ; Repeat while the loaded value is zero.
  %c = icmp eq i32 %l, 0
  br i1 %c, label %loop, label %exit

exit:
  ret i32 %l
}

; External callees referenced by the test functions in this file.
; bcmp and memcmp are only called with a constant length here (4 or 6 bytes).
declare i32 @bcmp(ptr, ptr, i64)
declare i32 @memcmp(ptr, ptr, i64)
; Argument-less external function called from @one_dimensional_with_call.
declare void @func()
