; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

; TODO: add i1 and <6 x i8> vector tests.

; TODO: add test for vectors that exceed 128-bit length
; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.

; TODO: generate PTX that preserves Concurrent Forward Progress
;       for atomic operations to local statespace
;       by generating atomic or volatile operations.

; TODO: design exposure for atomic operations on vector types.

; TODO: add weak, atomic, volatile, and atomic volatile tests
;       for .const and .param statespaces.

;; generic statespace

; generic

; TODO: make the lowering of these weak vector ops consistent with
;       the ones of the next tests. This test lowers to a weak PTX
;       vector op, but the next test lowers to a weak PTX scalar op.
; Loads <2 x i8> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v2.b8, two add.s16, one st.v2.b8.
define void @generic_2xi8(ptr %a) {
; CHECK-LABEL: generic_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi8_param_0];
; CHECK-NEXT:    ld.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i8>, ptr %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store <2 x i8> %a.add, ptr %a
  ret void
}

; TODO: make the lowering of these weak vector ops consistent with
;       the ones of the previous test. This test lowers to a weak
;       PTX scalar op, but the prior test lowers to a vector PTX op.
; Loads <4 x i8> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: a single 32-bit ld.b32 / st.b32 pair; in between, four
; prmt.b32 + cvt + add.s16 + cvt sequences (one per byte) and three further
; prmt.b32 instructions that combine the results into the stored register.
define void @generic_4xi8(ptr %a) {
; CHECK-LABEL: generic_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi8_param_0];
; CHECK-NEXT:    ld.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load <4 x i8>, ptr %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store <4 x i8> %a.add, ptr %a
  ret void
}

; Loads <8 x i8> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v2.b32 / st.v2.b32 pair; each of the two loaded 32-bit
; registers goes through four per-byte prmt.b32 + cvt + add.s16 + cvt sequences
; and is reassembled with prmt.b32 before the store.
define void @generic_8xi8(ptr %a) {
; CHECK-LABEL: generic_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi8_param_0];
; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i8>, ptr %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %a.add, ptr %a
  ret void
}

; Loads <16 x i8> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v4.b32 / st.v4.b32 pair; each of the four loaded 32-bit
; registers goes through four per-byte prmt.b32 + cvt + add.s16 + cvt sequences
; and is reassembled with prmt.b32 before the store.
define void @generic_16xi8(ptr %a) {
; CHECK-LABEL: generic_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_16xi8_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i8>, ptr %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %a.add, ptr %a
  ret void
}

; Loads <2 x i16> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v2.b16, two add.s16, one st.v2.b16.
define void @generic_2xi16(ptr %a) {
; CHECK-LABEL: generic_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi16_param_0];
; CHECK-NEXT:    ld.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i16>, ptr %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store <2 x i16> %a.add, ptr %a
  ret void
}

; Loads <4 x i16> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v4.b16, four add.s16, one st.v4.b16.
define void @generic_4xi16(ptr %a) {
; CHECK-LABEL: generic_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi16_param_0];
; CHECK-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i16>, ptr %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store <4 x i16> %a.add, ptr %a
  ret void
}

; Loads <8 x i16> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v4.b32 / st.v4.b32 pair; each 32-bit register is split
; into two 16-bit registers with mov.b32, incremented with add.s16, and
; re-packed with mov.b32.
define void @generic_8xi16(ptr %a) {
; CHECK-LABEL: generic_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_8xi16_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i16>, ptr %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store <8 x i16> %a.add, ptr %a
  ret void
}

; Loads <2 x i32> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v2.b32, two add.s32, one st.v2.b32.
define void @generic_2xi32(ptr %a) {
; CHECK-LABEL: generic_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi32_param_0];
; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i32>, ptr %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store <2 x i32> %a.add, ptr %a
  ret void
}

; Loads <4 x i32> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v4.b32, four add.s32, one st.v4.b32.
define void @generic_4xi32(ptr %a) {
; CHECK-LABEL: generic_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xi32_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i32>, ptr %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %a.add, ptr %a
  ret void
}

; Loads <2 x i64> through `ptr %a`, adds 1 to each element, and stores the
; result back to %a (no volatile/atomic qualifiers on either access).
; Expected PTX: one ld.v2.b64, two add.s64, one st.v2.b64.
define void @generic_2xi64(ptr %a) {
; CHECK-LABEL: generic_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xi64_param_0];
; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i64>, ptr %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store <2 x i64> %a.add, ptr %a
  ret void
}

; Loads <2 x float> through `ptr %a`, adds 1.0 to each element with fadd, and
; stores the result back to %a (no volatile/atomic qualifiers).
; Expected PTX: one ld.v2.b32, two add.rn.f32, one st.v2.b32.
define void @generic_2xfloat(ptr %a) {
; CHECK-LABEL: generic_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xfloat_param_0];
; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x float>, ptr %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store <2 x float> %a.add, ptr %a
  ret void
}

; Loads <4 x float> through `ptr %a`, adds 1.0 to each element with fadd, and
; stores the result back to %a (no volatile/atomic qualifiers).
; Expected PTX: one ld.v4.b32, four add.rn.f32, one st.v4.b32.
define void @generic_4xfloat(ptr %a) {
; CHECK-LABEL: generic_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_4xfloat_param_0];
; CHECK-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x float>, ptr %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store <4 x float> %a.add, ptr %a
  ret void
}

; Loads <2 x double> through `ptr %a`, adds 1.0 to each element with fadd, and
; stores the result back to %a (no volatile/atomic qualifiers).
; Expected PTX: one ld.v2.b64, two add.rn.f64, one st.v2.b64.
define void @generic_2xdouble(ptr %a) {
; CHECK-LABEL: generic_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_2xdouble_param_0];
; CHECK-NEXT:    ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x double>, ptr %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store <2 x double> %a.add, ptr %a
  ret void
}

; generic_volatile

; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
; Currently, LLVM:
; - does not allow atomic operations on vectors.
; - allows volatile operations on vectors, but it is not clear what that means.
; Both of the following semantics make sense in general, and PTX supports both:
; - volatile/atomic/volatile atomic applies to the whole vector
; - volatile/atomic/volatile atomic applies elementwise
; Actions required:
; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those
;   Below tests show that the current implementation picks the semantics in an inconsistent way
;   * volatile <2 x i8> lowers to "elementwise volatile"
;   * <4 x i8> lowers to "full vector volatile"
; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics
; - update tests in load-store-sm70.ll as well.

; TODO: make this operation consistent with the one for <4 x i8>
; This operation lowers to an "elementwise volatile PTX operation".
; Volatile load of <2 x i8> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b8, two add.s16, one st.volatile.v2.b8.
define void @generic_volatile_2xi8(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi8_param_0];
; CHECK-NEXT:    ld.volatile.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i8>, ptr %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store volatile <2 x i8> %a.add, ptr %a
  ret void
}

; TODO: make this operation consistent with the one for <2 x i8>
; This operation lowers to a "full vector volatile PTX operation".
; Volatile load of <4 x i8> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: a single 32-bit ld.volatile.b32 / st.volatile.b32 pair; in
; between, four per-byte prmt.b32 + cvt + add.s16 + cvt sequences and three
; further prmt.b32 instructions that combine the results.
define void @generic_volatile_4xi8(ptr %a) {
; CHECK-LABEL: generic_volatile_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi8_param_0];
; CHECK-NEXT:    ld.volatile.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.volatile.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i8>, ptr %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store volatile <4 x i8> %a.add, ptr %a
  ret void
}

; Volatile load of <8 x i8> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b32 / st.volatile.v2.b32 pair; each of the
; two loaded 32-bit registers goes through four per-byte prmt.b32 + cvt +
; add.s16 + cvt sequences and is reassembled with prmt.b32 before the store.
define void @generic_volatile_8xi8(ptr %a) {
; CHECK-LABEL: generic_volatile_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi8_param_0];
; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i8>, ptr %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <8 x i8> %a.add, ptr %a
  ret void
}

; Volatile load of <16 x i8> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v4.b32 / st.volatile.v4.b32 pair; each of the
; four loaded 32-bit registers goes through four per-byte prmt.b32 + cvt +
; add.s16 + cvt sequences and is reassembled with prmt.b32 before the store.
define void @generic_volatile_16xi8(ptr %a) {
; CHECK-LABEL: generic_volatile_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_16xi8_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i8>, ptr %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <16 x i8> %a.add, ptr %a
  ret void
}

; Volatile load of <2 x i16> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b16, two add.s16, one st.volatile.v2.b16.
define void @generic_volatile_2xi16(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi16_param_0];
; CHECK-NEXT:    ld.volatile.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i16>, ptr %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store volatile <2 x i16> %a.add, ptr %a
  ret void
}

; Volatile load of <4 x i16> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v4.b16, four add.s16, one st.volatile.v4.b16.
define void @generic_volatile_4xi16(ptr %a) {
; CHECK-LABEL: generic_volatile_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi16_param_0];
; CHECK-NEXT:    ld.volatile.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.volatile.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i16>, ptr %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store volatile <4 x i16> %a.add, ptr %a
  ret void
}

; Volatile load of <8 x i16> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v4.b32 / st.volatile.v4.b32 pair; each 32-bit
; register is split into two 16-bit registers with mov.b32, incremented with
; add.s16, and re-packed with mov.b32.
define void @generic_volatile_8xi16(ptr %a) {
; CHECK-LABEL: generic_volatile_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_8xi16_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i16>, ptr %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store volatile <8 x i16> %a.add, ptr %a
  ret void
}

; Volatile load of <2 x i32> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b32, two add.s32, one st.volatile.v2.b32.
define void @generic_volatile_2xi32(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi32_param_0];
; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i32>, ptr %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store volatile <2 x i32> %a.add, ptr %a
  ret void
}

; Volatile load of <4 x i32> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v4.b32, four add.s32, one st.volatile.v4.b32.
define void @generic_volatile_4xi32(ptr %a) {
; CHECK-LABEL: generic_volatile_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xi32_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i32>, ptr %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store volatile <4 x i32> %a.add, ptr %a
  ret void
}

; Volatile load of <2 x i64> through `ptr %a`, +1 on each element, and a
; volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b64, two add.s64, one st.volatile.v2.b64.
define void @generic_volatile_2xi64(ptr %a) {
; CHECK-LABEL: generic_volatile_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xi64_param_0];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i64>, ptr %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store volatile <2 x i64> %a.add, ptr %a
  ret void
}

; Volatile load of <2 x float> through `ptr %a`, fadd of 1.0 on each element,
; and a volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b32, two add.rn.f32, one st.volatile.v2.b32.
define void @generic_volatile_2xfloat(ptr %a) {
; CHECK-LABEL: generic_volatile_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xfloat_param_0];
; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x float>, ptr %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store volatile <2 x float> %a.add, ptr %a
  ret void
}

; Volatile load of <4 x float> through `ptr %a`, fadd of 1.0 on each element,
; and a volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v4.b32, four add.rn.f32, one st.volatile.v4.b32.
define void @generic_volatile_4xfloat(ptr %a) {
; CHECK-LABEL: generic_volatile_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_4xfloat_param_0];
; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x float>, ptr %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store volatile <4 x float> %a.add, ptr %a
  ret void
}

; Volatile load of <2 x double> through `ptr %a`, fadd of 1.0 on each element,
; and a volatile store of the result back to %a.
; Expected PTX: one ld.volatile.v2.b64, two add.rn.f64, one st.volatile.v2.b64.
define void @generic_volatile_2xdouble(ptr %a) {
; CHECK-LABEL: generic_volatile_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [generic_volatile_2xdouble_param_0];
; CHECK-NEXT:    ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.volatile.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x double>, ptr %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store volatile <2 x double> %a.add, ptr %a
  ret void
}

;; global statespace

; global

; Non-volatile load / add / store of <2 x i8> through an addrspace(1)
; pointer. The expected PTX is a ld.global.v2.b8 / st.global.v2.b8 pair; the
; two bytes live in 16-bit registers and are incremented with add.s16.
define void @global_2xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi8_param_0];
; CHECK-NEXT:    ld.global.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.global.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i8>, ptr addrspace(1) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store <2 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <4 x i8> through an addrspace(1)
; pointer. Unlike the <2 x i8> case, the expected PTX accesses memory with a
; single scalar ld.global.b32 / st.global.b32. Each byte is pulled out of the
; 32-bit word with prmt.b32, narrowed to a 16-bit register, incremented with
; add.s16, widened again, and the four results are merged back into one word
; by three further prmt.b32 instructions.
define void @global_4xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi8_param_0];
; CHECK-NEXT:    ld.global.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.global.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load <4 x i8>, ptr addrspace(1) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store <4 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <8 x i8> through an addrspace(1)
; pointer. The expected PTX moves the eight bytes as two 32-bit words
; (ld.global.v2.b32 / st.global.v2.b32). Each word goes through the same
; per-byte sequence: prmt.b32 to isolate a byte, add.s16 in a 16-bit
; register, then prmt.b32 to rebuild the word. The second loaded word (%r2)
; is processed before the first (%r1).
define void @global_8xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_8xi8_param_0];
; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i8>, ptr addrspace(1) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <16 x i8> through an addrspace(1)
; pointer. The expected PTX moves the sixteen bytes as four 32-bit words
; (ld.global.v4.b32 / st.global.v4.b32). Each word goes through the same
; per-byte sequence: prmt.b32 to isolate a byte, add.s16 in a 16-bit
; register, then prmt.b32 to rebuild the word. Words are processed from %r4
; down to %r1, so the store lists the rebuilt words in descending register
; order.
define void @global_16xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_16xi8_param_0];
; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i8>, ptr addrspace(1) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <2 x i16> through an addrspace(1)
; pointer. The expected PTX is a ld.global.v2.b16 / st.global.v2.b16 pair
; with one add.s16 per lane; no 32-bit registers are needed.
define void @global_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi16_param_0];
; CHECK-NEXT:    ld.global.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.global.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i16>, ptr addrspace(1) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store <2 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <4 x i16> through an addrspace(1)
; pointer. The expected PTX is a ld.global.v4.b16 / st.global.v4.b16 pair
; operating directly on 16-bit registers, with one add.s16 per lane.
define void @global_4xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi16_param_0];
; CHECK-NEXT:    ld.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i16>, ptr addrspace(1) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store <4 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <8 x i16> through an addrspace(1)
; pointer. The expected PTX moves the data as four 32-bit words
; (ld.global.v4.b32 / st.global.v4.b32). Each word is split into two 16-bit
; registers with mov.b32, both halves are incremented with add.s16, and the
; pair is recombined with mov.b32 before the store.
define void @global_8xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_8xi16_param_0];
; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i16>, ptr addrspace(1) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store <8 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <2 x i32> through an addrspace(1)
; pointer. The expected PTX is a ld.global.v2.b32 / st.global.v2.b32 pair
; with one add.s32 per lane.
define void @global_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi32_param_0];
; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i32>, ptr addrspace(1) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store <2 x i32> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <4 x i32> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.global.v4.b32 / st.global.v4.b32
; pair with one add.s32 per lane.
define void @global_4xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xi32_param_0];
; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i32>, ptr addrspace(1) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / add / store of <2 x i64> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.global.v2.b64 / st.global.v2.b64
; pair with one add.s64 per lane. The address and the data share the %rd
; register file, so only 64-bit registers are declared.
define void @global_2xi64(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xi64_param_0];
; CHECK-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i64>, ptr addrspace(1) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store <2 x i64> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / fadd / store of <2 x float> through an addrspace(1)
; pointer. The expected PTX is a ld.global.v2.b32 / st.global.v2.b32 pair
; with a scalar add.rn.f32 of 1.0 (printed as 0f3F800000) per lane.
define void @global_2xfloat(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xfloat_param_0];
; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x float>, ptr addrspace(1) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store <2 x float> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / fadd / store of <4 x float> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.global.v4.b32 / st.global.v4.b32
; pair with a scalar add.rn.f32 of 1.0 (printed as 0f3F800000) per lane.
define void @global_4xfloat(ptr addrspace(1) %a) {
; CHECK-LABEL: global_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_4xfloat_param_0];
; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x float>, ptr addrspace(1) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store <4 x float> %a.add, ptr addrspace(1) %a
  ret void
}

; Non-volatile load / fadd / store of <2 x double> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.global.v2.b64 / st.global.v2.b64
; pair with a scalar add.rn.f64 of 1.0 (printed as 0d3FF0000000000000) per
; lane.
define void @global_2xdouble(ptr addrspace(1) %a) {
; CHECK-LABEL: global_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_2xdouble_param_0];
; CHECK-NEXT:    ld.global.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.global.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x double>, ptr addrspace(1) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store <2 x double> %a.add, ptr addrspace(1) %a
  ret void
}

; global_volatile

; Volatile load / add / volatile store of <2 x i8> through an addrspace(1)
; pointer. The expected PTX matches the non-volatile global_2xi8 sequence,
; except that both memory instructions carry the .volatile qualifier
; (ld.volatile.global.v2.b8 / st.volatile.global.v2.b8).
define void @global_volatile_2xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi8_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.global.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store volatile <2 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <4 x i8> through an addrspace(1)
; pointer. The expected PTX accesses memory with a single scalar
; ld.volatile.global.b32 / st.volatile.global.b32. Each byte is pulled out of
; the 32-bit word with prmt.b32, incremented in a 16-bit register with
; add.s16, and the four results are merged back into one word with prmt.b32.
define void @global_volatile_4xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi8_param_0];
; CHECK-NEXT:    ld.volatile.global.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.volatile.global.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i8>, ptr addrspace(1) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store volatile <4 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <8 x i8> through an addrspace(1)
; pointer. The expected PTX moves the eight bytes as two 32-bit words
; (ld.volatile.global.v2.b32 / st.volatile.global.v2.b32). Each word goes
; through the same per-byte sequence: prmt.b32 to isolate a byte, add.s16 in
; a 16-bit register, then prmt.b32 to rebuild the word. The second loaded
; word (%r2) is processed before the first (%r1).
define void @global_volatile_8xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi8_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i8>, ptr addrspace(1) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <8 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <16 x i8> through an addrspace(1)
; pointer. The expected PTX moves the sixteen bytes as four 32-bit words
; (ld.volatile.global.v4.b32 / st.volatile.global.v4.b32). Each word goes
; through the same per-byte sequence: prmt.b32 to isolate a byte, add.s16 in
; a 16-bit register, then prmt.b32 to rebuild the word. Words are processed
; from %r4 down to %r1, so the store lists the rebuilt words in descending
; register order.
define void @global_volatile_16xi8(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_16xi8_param_0];
; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i8>, ptr addrspace(1) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <16 x i8> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <2 x i16> through an addrspace(1)
; pointer. The expected PTX is a ld.volatile.global.v2.b16 /
; st.volatile.global.v2.b16 pair with one add.s16 per lane.
define void @global_volatile_2xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi16_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.global.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i16>, ptr addrspace(1) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store volatile <2 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <4 x i16> through an addrspace(1)
; pointer. The expected PTX is a ld.volatile.global.v4.b16 /
; st.volatile.global.v4.b16 pair operating directly on 16-bit registers,
; with one add.s16 per lane.
define void @global_volatile_4xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi16_param_0];
; CHECK-NEXT:    ld.volatile.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.volatile.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i16>, ptr addrspace(1) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store volatile <4 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <8 x i16> through an addrspace(1)
; pointer. The expected PTX moves the data as four 32-bit words
; (ld.volatile.global.v4.b32 / st.volatile.global.v4.b32). Each word is split
; into two 16-bit registers with mov.b32, both halves are incremented with
; add.s16, and the pair is recombined with mov.b32 before the store.
define void @global_volatile_8xi16(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_8xi16_param_0];
; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i16>, ptr addrspace(1) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store volatile <8 x i16> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <2 x i32> through an addrspace(1)
; pointer. The expected PTX is a ld.volatile.global.v2.b32 /
; st.volatile.global.v2.b32 pair with one add.s32 per lane.
define void @global_volatile_2xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi32_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i32>, ptr addrspace(1) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store volatile <2 x i32> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <4 x i32> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.volatile.global.v4.b32 /
; st.volatile.global.v4.b32 pair with one add.s32 per lane.
define void @global_volatile_4xi32(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xi32_param_0];
; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i32>, ptr addrspace(1) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store volatile <4 x i32> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / add / volatile store of <2 x i64> through an addrspace(1)
; pointer. The expected PTX is a 128-bit ld.volatile.global.v2.b64 /
; st.volatile.global.v2.b64 pair with one add.s64 per lane. The address and
; the data share the %rd register file.
define void @global_volatile_2xi64(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xi64_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i64>, ptr addrspace(1) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store volatile <2 x i64> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / fadd / volatile store of <2 x float> through an
; addrspace(1) pointer. The expected PTX is a ld.volatile.global.v2.b32 /
; st.volatile.global.v2.b32 pair with a scalar add.rn.f32 of 1.0 (printed as
; 0f3F800000) per lane.
define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xfloat_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x float>, ptr addrspace(1) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store volatile <2 x float> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / fadd / volatile store of <4 x float> through an
; addrspace(1) pointer. The expected PTX is a 128-bit
; ld.volatile.global.v4.b32 / st.volatile.global.v4.b32 pair with a scalar
; add.rn.f32 of 1.0 (printed as 0f3F800000) per lane.
define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_4xfloat_param_0];
; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x float>, ptr addrspace(1) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store volatile <4 x float> %a.add, ptr addrspace(1) %a
  ret void
}

; Volatile load / fadd / volatile store of <2 x double> through an
; addrspace(1) pointer. The expected PTX is a 128-bit
; ld.volatile.global.v2.b64 / st.volatile.global.v2.b64 pair with a scalar
; add.rn.f64 of 1.0 (printed as 0d3FF0000000000000) per lane.
define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
; CHECK-LABEL: global_volatile_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [global_volatile_2xdouble_param_0];
; CHECK-NEXT:    ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.volatile.global.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x double>, ptr addrspace(1) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store volatile <2 x double> %a.add, ptr addrspace(1) %a
  ret void
}

;; shared statespace

; shared

; Non-volatile load / add / store of <2 x i8> through an addrspace(3)
; pointer. The expected PTX is a ld.shared.v2.b8 / st.shared.v2.b8 pair; the
; two bytes live in 16-bit registers and are incremented with add.s16.
define void @shared_2xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi8_param_0];
; CHECK-NEXT:    ld.shared.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.shared.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i8>, ptr addrspace(3) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store <2 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <4 x i8> through a shared (addrspace(3))
; pointer. The expected PTX moves the whole vector as one 32-bit word
; (ld.shared.b32 / st.shared.b32); the four bytes are processed one at a time
; with prmt.b32, cvt and add.s16, then recombined with prmt.b32.
define void @shared_4xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi8_param_0];
; CHECK-NEXT:    ld.shared.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.shared.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load <4 x i8>, ptr addrspace(3) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store <4 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <8 x i8> through a shared (addrspace(3))
; pointer. The expected PTX moves the vector as two 32-bit words
; (ld.shared.v2.b32 / st.shared.v2.b32); each word goes through the same
; per-byte prmt.b32 / cvt / add.s16 sequence and is repacked with prmt.b32.
define void @shared_8xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi8_param_0];
; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i8>, ptr addrspace(3) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <16 x i8> through a shared (addrspace(3))
; pointer. The expected PTX moves the vector as four 32-bit words
; (ld.shared.v4.b32 / st.shared.v4.b32); each word goes through the same
; per-byte prmt.b32 / cvt / add.s16 sequence and is repacked with prmt.b32.
define void @shared_16xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_16xi8_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i8>, ptr addrspace(3) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <2 x i16> through a shared (addrspace(3))
; pointer. The expected PTX uses one ld.shared.v2.b16 / st.shared.v2.b16 pair
; with a scalar add.s16 per lane.
define void @shared_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi16_param_0];
; CHECK-NEXT:    ld.shared.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.shared.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i16>, ptr addrspace(3) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store <2 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <4 x i16> through a shared (addrspace(3))
; pointer. The expected PTX uses one ld.shared.v4.b16 / st.shared.v4.b16 pair
; with a scalar add.s16 per lane.
define void @shared_4xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi16_param_0];
; CHECK-NEXT:    ld.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i16>, ptr addrspace(3) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store <4 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <8 x i16> through a shared (addrspace(3))
; pointer. The expected PTX moves the vector as four 32-bit words
; (ld.shared.v4.b32 / st.shared.v4.b32); each word is split into two 16-bit
; registers with mov.b32, incremented with add.s16, and packed back with
; mov.b32.
define void @shared_8xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_8xi16_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i16>, ptr addrspace(3) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store <8 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <2 x i32> through a shared (addrspace(3))
; pointer. The expected PTX uses one ld.shared.v2.b32 / st.shared.v2.b32 pair
; with a scalar add.s32 per lane.
define void @shared_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi32_param_0];
; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i32>, ptr addrspace(3) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store <2 x i32> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <4 x i32> through a shared (addrspace(3))
; pointer. The expected PTX uses one ld.shared.v4.b32 / st.shared.v4.b32 pair
; with a scalar add.s32 per lane.
define void @shared_4xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xi32_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i32>, ptr addrspace(3) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, add 1 per lane, store of <2 x i64> through a shared (addrspace(3))
; pointer. The expected PTX uses one ld.shared.v2.b64 / st.shared.v2.b64 pair
; with a scalar add.s64 per lane.
define void @shared_2xi64(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xi64_param_0];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i64>, ptr addrspace(3) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store <2 x i64> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, fadd 1.0 per lane, store of <2 x float> through a shared
; (addrspace(3)) pointer. The expected PTX uses one ld.shared.v2.b32 /
; st.shared.v2.b32 pair with a scalar add.rn.f32 per lane.
define void @shared_2xfloat(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xfloat_param_0];
; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x float>, ptr addrspace(3) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store <2 x float> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, fadd 1.0 per lane, store of <4 x float> through a shared
; (addrspace(3)) pointer. The expected PTX uses one ld.shared.v4.b32 /
; st.shared.v4.b32 pair with a scalar add.rn.f32 per lane.
define void @shared_4xfloat(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_4xfloat_param_0];
; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x float>, ptr addrspace(3) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store <4 x float> %a.add, ptr addrspace(3) %a
  ret void
}

; Load, fadd 1.0 per lane, store of <2 x double> through a shared
; (addrspace(3)) pointer. The expected PTX uses one ld.shared.v2.b64 /
; st.shared.v2.b64 pair with a scalar add.rn.f64 per lane.
define void @shared_2xdouble(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_2xdouble_param_0];
; CHECK-NEXT:    ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.shared.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x double>, ptr addrspace(3) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store <2 x double> %a.add, ptr addrspace(3) %a
  ret void
}

; shared_volatile

; Volatile load, add 1 per lane, volatile store of <2 x i8> through a shared
; (addrspace(3)) pointer. The expected PTX carries the volatile qualifier on
; both memory ops: ld.volatile.shared.v2.b8 / st.volatile.shared.v2.b8.
define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi8_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.shared.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i8>, ptr addrspace(3) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store volatile <2 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <4 x i8> through a shared
; (addrspace(3)) pointer. The expected PTX moves the vector as one 32-bit word
; (ld.volatile.shared.b32 / st.volatile.shared.b32); the four bytes are
; processed one at a time with prmt.b32, cvt and add.s16, then recombined with
; prmt.b32.
define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi8_param_0];
; CHECK-NEXT:    ld.volatile.shared.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.volatile.shared.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i8>, ptr addrspace(3) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store volatile <4 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <8 x i8> through a shared
; (addrspace(3)) pointer. The expected PTX moves the vector as two 32-bit
; words (ld.volatile.shared.v2.b32 / st.volatile.shared.v2.b32); each word
; goes through the same per-byte prmt.b32 / cvt / add.s16 sequence and is
; repacked with prmt.b32.
define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi8_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i8>, ptr addrspace(3) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <8 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <16 x i8> through a shared
; (addrspace(3)) pointer. The expected PTX moves the vector as four 32-bit
; words (ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32); each word
; goes through the same per-byte prmt.b32 / cvt / add.s16 sequence and is
; repacked with prmt.b32.
define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_16xi8_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <16 x i8> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <2 x i16> through a shared
; (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v2.b16 / st.volatile.shared.v2.b16 pair.
define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi16_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.volatile.shared.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store volatile <2 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <4 x i16> through a shared
; (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v4.b16 / st.volatile.shared.v4.b16 pair.
define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi16_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.volatile.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store volatile <4 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <8 x i16> through a shared
; (addrspace(3)) pointer. The expected PTX moves the vector as four 32-bit
; words (ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32); each word is
; split into two 16-bit registers with mov.b32, incremented with add.s16, and
; packed back with mov.b32.
define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_8xi16_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store volatile <8 x i16> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <2 x i32> through a shared
; (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v2.b32 / st.volatile.shared.v2.b32 pair.
define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi32_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store volatile <2 x i32> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <4 x i32> through a shared
; (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32 pair.
define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xi32_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store volatile <4 x i32> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, add 1 per lane, volatile store of <2 x i64> through a shared
; (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v2.b64 / st.volatile.shared.v2.b64 pair.
define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xi64_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store volatile <2 x i64> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, fadd 1.0 per lane, volatile store of <2 x float> through a
; shared (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v2.b32 / st.volatile.shared.v2.b32 pair.
define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xfloat_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.shared.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x float>, ptr addrspace(3) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store volatile <2 x float> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, fadd 1.0 per lane, volatile store of <4 x float> through a
; shared (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v4.b32 / st.volatile.shared.v4.b32 pair.
define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_4xfloat_param_0];
; CHECK-NEXT:    ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x float>, ptr addrspace(3) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store volatile <4 x float> %a.add, ptr addrspace(3) %a
  ret void
}

; Volatile load, fadd 1.0 per lane, volatile store of <2 x double> through a
; shared (addrspace(3)) pointer. The expected PTX uses one
; ld.volatile.shared.v2.b64 / st.volatile.shared.v2.b64 pair.
define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
; CHECK-LABEL: shared_volatile_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [shared_volatile_2xdouble_param_0];
; CHECK-NEXT:    ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.volatile.shared.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x double>, ptr addrspace(3) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store volatile <2 x double> %a.add, ptr addrspace(3) %a
  ret void
}

;; local statespace

; local

; Load, add 1 per lane, store of <2 x i8> through a local (addrspace(5))
; pointer. The expected PTX uses a two-element byte vector access
; (ld.local.v2.b8 / st.local.v2.b8) with the adds done in 16-bit registers.
define void @local_2xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi8_param_0];
; CHECK-NEXT:    ld.local.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.local.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i8>, ptr addrspace(5) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store <2 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Load, add 1 per lane, store of <4 x i8> through a local (addrspace(5))
; pointer. The expected PTX moves the whole vector as one 32-bit word
; (ld.local.b32 / st.local.b32); the four bytes are processed one at a time
; with prmt.b32, cvt and add.s16, then recombined with prmt.b32.
define void @local_4xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi8_param_0];
; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.local.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load <4 x i8>, ptr addrspace(5) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store <4 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <8 x i8> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX moves the data as two 32-bit words
; (ld.local.v2.b32 / st.local.v2.b32); within each word the bytes are
; extracted with prmt.b32, incremented in 16-bit registers, and repacked
; with prmt.b32.
define void @local_8xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi8_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i8>, ptr addrspace(5) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <8 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <16 x i8> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX moves the data as four 32-bit words
; (ld.local.v4.b32 / st.local.v4.b32); within each word the bytes are
; extracted with prmt.b32, incremented in 16-bit registers, and repacked
; with prmt.b32.
define void @local_16xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_16xi8_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load <16 x i8>, ptr addrspace(5) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store <16 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <2 x i16> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX uses a two-element 16-bit vector access
; (ld.local.v2.b16 / st.local.v2.b16) with one add.s16 per lane.
define void @local_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi16_param_0];
; CHECK-NEXT:    ld.local.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.local.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i16>, ptr addrspace(5) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store <2 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <4 x i16> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX uses a four-element 16-bit vector access
; (ld.local.v4.b16 / st.local.v4.b16) with one add.s16 per lane.
define void @local_4xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i16>, ptr addrspace(5) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store <4 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <8 x i16> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX moves the data as four 32-bit words
; (ld.local.v4.b32 / st.local.v4.b32); each word is split into two 16-bit
; registers with mov.b32, incremented with add.s16, and recombined with
; mov.b32.
define void @local_8xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_8xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <8 x i16>, ptr addrspace(5) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store <8 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <2 x i32> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX uses a two-element 32-bit vector access
; (ld.local.v2.b32 / st.local.v2.b32) with one add.s32 per lane.
define void @local_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi32_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i32>, ptr addrspace(5) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store <2 x i32> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <4 x i32> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX uses a four-element 32-bit vector access
; (ld.local.v4.b32 / st.local.v4.b32) with one add.s32 per lane.
define void @local_4xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xi32_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x i32>, ptr addrspace(5) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store <4 x i32> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <2 x i64> from addrspace(5), add of 1 to each lane, and store
; back. The expected PTX uses a two-element 64-bit vector access
; (ld.local.v2.b64 / st.local.v2.b64) with one add.s64 per lane.
define void @local_2xi64(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xi64_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x i64>, ptr addrspace(5) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store <2 x i64> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <2 x float> from addrspace(5), fadd of 1.0 to each lane, and
; store back. The expected PTX uses a two-element 32-bit vector access
; (ld.local.v2.b32 / st.local.v2.b32) with one add.rn.f32 per lane.
define void @local_2xfloat(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xfloat_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load <2 x float>, ptr addrspace(5) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store <2 x float> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <4 x float> from addrspace(5), fadd of 1.0 to each lane, and
; store back. The expected PTX uses a four-element 32-bit vector access
; (ld.local.v4.b32 / st.local.v4.b32) with one add.rn.f32 per lane.
define void @local_4xfloat(ptr addrspace(5) %a) {
; CHECK-LABEL: local_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_4xfloat_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load <4 x float>, ptr addrspace(5) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store <4 x float> %a.add, ptr addrspace(5) %a
  ret void
}

; Plain load of <2 x double> from addrspace(5), fadd of 1.0 to each lane, and
; store back. The expected PTX uses a two-element 64-bit vector access
; (ld.local.v2.b64 / st.local.v2.b64) with one add.rn.f64 per lane.
define void @local_2xdouble(ptr addrspace(5) %a) {
; CHECK-LABEL: local_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_2xdouble_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load <2 x double>, ptr addrspace(5) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store <2 x double> %a.add, ptr addrspace(5) %a
  ret void
}

; local_volatile

; Volatile load of <2 x i8> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain ld.local.v2.b8 / st.local.v2.b8,
; with the adds performed in 16-bit registers.
define void @local_volatile_2xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi8_param_0];
; CHECK-NEXT:    ld.local.v2.b8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.local.v2.b8 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i8>, ptr addrspace(5) %a
  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
  store volatile <2 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <4 x i8> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the vector is moved as one scalar 32-bit word
; (ld.local.b32 / st.local.b32), with bytes extracted and repacked by
; prmt.b32 and incremented in 16-bit registers.
define void @local_volatile_4xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_4xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b32 %r<13>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi8_param_0];
; CHECK-NEXT:    ld.local.b32 %r1, [%rd1];
; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r7, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
; CHECK-NEXT:    prmt.b32 %r9, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
; CHECK-NEXT:    st.local.b32 [%rd1], %r12;
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i8>, ptr addrspace(5) %a
  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
  store volatile <4 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <8 x i8> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the data is moved as two 32-bit words
; (ld.local.v2.b32 / st.local.v2.b32), with bytes extracted and repacked by
; prmt.b32 and incremented in 16-bit registers.
define void @local_volatile_8xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_8xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<25>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi8_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
; CHECK-NEXT:    prmt.b32 %r5, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r8, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
; CHECK-NEXT:    prmt.b32 %r10, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r14, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
; CHECK-NEXT:    prmt.b32 %r16, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r19, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
; CHECK-NEXT:    prmt.b32 %r21, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r24, %r13};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i8>, ptr addrspace(5) %a
  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <8 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <16 x i8> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the data is moved as four 32-bit words
; (ld.local.v4.b32 / st.local.v4.b32), with bytes extracted and repacked by
; prmt.b32 and incremented in 16-bit registers.
define void @local_volatile_16xi8(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_16xi8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<33>;
; CHECK-NEXT:    .reg .b32 %r<49>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_16xi8_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    prmt.b32 %r5, %r4, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
; CHECK-NEXT:    prmt.b32 %r7, %r4, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r10, %r4, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
; CHECK-NEXT:    prmt.b32 %r12, %r4, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r16, %r3, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
; CHECK-NEXT:    prmt.b32 %r18, %r3, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r21, %r3, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
; CHECK-NEXT:    prmt.b32 %r23, %r3, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r27, %r2, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
; CHECK-NEXT:    prmt.b32 %r29, %r2, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r32, %r2, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
; CHECK-NEXT:    prmt.b32 %r34, %r2, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
; CHECK-NEXT:    prmt.b32 %r38, %r1, 0, 0x7773U;
; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
; CHECK-NEXT:    prmt.b32 %r40, %r1, 0, 0x7772U;
; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r43, %r1, 0, 0x7771U;
; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
; CHECK-NEXT:    prmt.b32 %r45, %r1, 0, 0x7770U;
; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
; CHECK-NEXT:    ret;
  %a.load = load volatile <16 x i8>, ptr addrspace(5) %a
  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  store volatile <16 x i8> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <2 x i16> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v2.b16 / st.local.v2.b16 with one add.s16 per lane.
define void @local_volatile_2xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi16_param_0];
; CHECK-NEXT:    ld.local.v2.b16 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    st.local.v2.b16 [%rd1], {%rs4, %rs3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i16>, ptr addrspace(5) %a
  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
  store volatile <2 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <4 x i16> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v4.b16 / st.local.v4.b16 with one add.s16 per lane.
define void @local_volatile_4xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_4xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
; CHECK-NEXT:    st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i16>, ptr addrspace(5) %a
  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
  store volatile <4 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <8 x i16> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the data is moved as four 32-bit words
; (ld.local.v4.b32 / st.local.v4.b32), each split into two 16-bit registers
; with mov.b32, incremented with add.s16, and recombined with mov.b32.
define void @local_volatile_8xi16(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_8xi16(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<17>;
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_8xi16_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <8 x i16>, ptr addrspace(5) %a
  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  store volatile <8 x i16> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <2 x i32> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v2.b32 / st.local.v2.b32 with one add.s32 per lane.
define void @local_volatile_2xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi32_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.s32 %r3, %r2, 1;
; CHECK-NEXT:    add.s32 %r4, %r1, 1;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i32>, ptr addrspace(5) %a
  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
  store volatile <2 x i32> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <4 x i32> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v4.b32 / st.local.v4.b32 with one add.s32 per lane.
define void @local_volatile_4xi32(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_4xi32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xi32_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.s32 %r5, %r4, 1;
; CHECK-NEXT:    add.s32 %r6, %r3, 1;
; CHECK-NEXT:    add.s32 %r7, %r2, 1;
; CHECK-NEXT:    add.s32 %r8, %r1, 1;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x i32>, ptr addrspace(5) %a
  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
  store volatile <4 x i32> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <2 x i64> from addrspace(5), add of 1 to each lane, and
; volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v2.b64 / st.local.v2.b64 with one add.s64 per lane.
define void @local_volatile_2xi64(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xi64(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xi64_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x i64>, ptr addrspace(5) %a
  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
  store volatile <2 x i64> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <2 x float> from addrspace(5), fadd of 1.0 to each lane,
; and volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v2.b32 / st.local.v2.b32 with one add.rn.f32 per lane.
define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xfloat_param_0];
; CHECK-NEXT:    ld.local.v2.b32 {%r1, %r2}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r3, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
; CHECK-NEXT:    st.local.v2.b32 [%rd1], {%r4, %r3};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x float>, ptr addrspace(5) %a
  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
  store volatile <2 x float> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <4 x float> from addrspace(5), fadd of 1.0 to each lane,
; and volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v4.b32 / st.local.v4.b32 with one add.rn.f32 per lane.
define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_4xfloat(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_4xfloat_param_0];
; CHECK-NEXT:    ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT:    add.rn.f32 %r5, %r4, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r6, %r3, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r7, %r2, 0f3F800000;
; CHECK-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
; CHECK-NEXT:    st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
; CHECK-NEXT:    ret;
  %a.load = load volatile <4 x float>, ptr addrspace(5) %a
  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
  store volatile <4 x float> %a.add, ptr addrspace(5) %a
  ret void
}

; Volatile load of <2 x double> from addrspace(5), fadd of 1.0 to each lane,
; and volatile store back. Note that the expected PTX carries no .volatile
; qualifier: the access is emitted as plain
; ld.local.v2.b64 / st.local.v2.b64 with one add.rn.f64 per lane.
define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
; CHECK-LABEL: local_volatile_2xdouble(
; CHECK:       {
; CHECK-NEXT:    .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [local_volatile_2xdouble_param_0];
; CHECK-NEXT:    ld.local.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT:    add.rn.f64 %rd4, %rd3, 0d3FF0000000000000;
; CHECK-NEXT:    add.rn.f64 %rd5, %rd2, 0d3FF0000000000000;
; CHECK-NEXT:    st.local.v2.b64 [%rd1], {%rd5, %rd4};
; CHECK-NEXT:    ret;
  %a.load = load volatile <2 x double>, ptr addrspace(5) %a
  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
  store volatile <2 x double> %a.add, ptr addrspace(5) %a
  ret void
}
