; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mcpu=sm_50 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

; Constant rotate-left spelled with add: (x << 7) + (x >> 25).
; The two shift amounts sum to 32; the expected PTX is a single
; shf.l.wrap.b32 with both data operands equal and an immediate of 7.
define i32 @test_simple_rotl(i32 %x) {
; CHECK-LABEL: test_simple_rotl(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_simple_rotl_param_0];
; CHECK-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 7;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %hi = shl i32 %x, 7
  %lo = lshr i32 %x, 25
  %rot = add i32 %hi, %lo
  ret i32 %rot
}

; Constant rotate-right spelled with add: (x >> 7) + (x << 25).
; The expected PTX expresses it as a left wrap funnel shift of x with itself
; by 25 (i.e. 32 - 7).
define i32 @test_simple_rotr(i32 %x) {
; CHECK-LABEL: test_simple_rotr(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_simple_rotr_param_0];
; CHECK-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 25;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
; CHECK-NEXT:    ret;
  %lo = lshr i32 %x, 7
  %hi = shl i32 %x, 25
  %rot = add i32 %lo, %hi
  ret i32 %rot
}

; Variable rotate-left spelled with add: (x << y) + (x >> (32 - y)).
; The expected PTX is a single shf.l.wrap.b32 taking x twice and y as the
; shift amount.
define i32 @test_rotl_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotl_var(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_rotl_var_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_rotl_var_param_1];
; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r1, %r1, %r2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %hi = shl i32 %x, %y
  %inv = sub i32 32, %y
  %lo = lshr i32 %x, %inv
  %rot = add i32 %hi, %lo
  ret i32 %rot
}

; Variable rotate-right spelled with add: (x >> y) + (x << (32 - y)).
; The expected PTX is a single shf.r.wrap.b32 taking x twice and y as the
; shift amount.
define i32 @test_rotr_var(i32 %x, i32 %y) {
; CHECK-LABEL: test_rotr_var(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_rotr_var_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_rotr_var_param_1];
; CHECK-NEXT:    shf.r.wrap.b32 %r3, %r1, %r1, %r2;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %lo = lshr i32 %x, %y
  %inv = sub i32 32, %y
  %hi = shl i32 %x, %inv
  %rot = add i32 %lo, %hi
  ret i32 %rot
}

; Negative test: (x << y) + (x >> ((0 - y) & 31)).
; The complementary amount is masked to 0..31 instead of being 32 - y. For
; y == 0 both shifts are by 0 and the add produces x + x, which is not a
; rotate of x -- presumably why no funnel shift may be formed here. The
; expected PTX keeps the separate shl / neg / and / shr / add sequence.
; Value names follow the opcode that defines them (%shl is the shl result,
; %shr is the lshr result); the operand order of the add is unchanged.
define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) {
; CHECK-LABEL: test_invalid_rotl_var_and(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_invalid_rotl_var_and_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_invalid_rotl_var_and_param_1];
; CHECK-NEXT:    shl.b32 %r3, %r1, %r2;
; CHECK-NEXT:    neg.s32 %r4, %r2;
; CHECK-NEXT:    and.b32 %r5, %r4, 31;
; CHECK-NEXT:    shr.u32 %r6, %r1, %r5;
; CHECK-NEXT:    add.s32 %r7, %r6, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %shl = shl i32 %x, %y
  %sub = sub nsw i32 0, %y
  %and = and i32 %sub, 31
  %shr = lshr i32 %x, %and
  %add = add i32 %shr, %shl
  ret i32 %add
}

; Negative test: (x >> y) + (x << ((0 - y) & 31)).
; The complementary amount is masked to 0..31 instead of being 32 - y. For
; y == 0 both shifts are by 0 and the add produces x + x, which is not a
; rotate of x -- presumably why no funnel shift may be formed here. The
; expected PTX keeps the separate shr / neg / and / shl / add sequence.
define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) {
; CHECK-LABEL: test_invalid_rotr_var_and(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_invalid_rotr_var_and_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_invalid_rotr_var_and_param_1];
; CHECK-NEXT:    shr.u32 %r3, %r1, %r2;
; CHECK-NEXT:    neg.s32 %r4, %r2;
; CHECK-NEXT:    and.b32 %r5, %r4, 31;
; CHECK-NEXT:    shl.b32 %r6, %r1, %r5;
; CHECK-NEXT:    add.s32 %r7, %r3, %r6;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
  %lo = lshr i32 %x, %y
  %neg = sub nsw i32 0, %y
  %amt = and i32 %neg, 31
  %hi = shl i32 %x, %amt
  %sum = add i32 %lo, %hi
  ret i32 %sum
}

; Funnel-shift-left idiom with two different inputs, spelled with add:
;   (x0 << y) + ((x1 >> 1) >> (y ^ 31))
; For y in 0..31, y ^ 31 equals 31 - y, so x1 is shifted right by 32 - y in
; total without any single shift by 32. The expected PTX is one
; shf.l.wrap.b32 whose data operands are x1 then x0, with y as the amount.
define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshl_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fshl_special_case_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_fshl_special_case_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [test_fshl_special_case_param_2];
; CHECK-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %hi = shl i32 %x0, %y
  %half = lshr i32 %x1, 1
  %inv = xor i32 %y, 31
  %lo = lshr i32 %half, %inv
  %res = add i32 %hi, %lo
  ret i32 %res
}

; Funnel-shift-right idiom with two different inputs, spelled with add:
;   (x1 >> y) + ((x0 << 1) << (y ^ 31))
; For y in 0..31, y ^ 31 equals 31 - y, so x0 is shifted left by 32 - y in
; total without any single shift by 32. The expected PTX is one
; shf.r.wrap.b32 whose data operands are x1 then x0, with y as the amount.
; Value names follow the opcode that defines them (%lo is the lshr result,
; %dbl and %hi are shl results); the operand order of the add is unchanged.
define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) {
; CHECK-LABEL: test_fshr_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fshr_special_case_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [test_fshr_special_case_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [test_fshr_special_case_param_2];
; CHECK-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %lo = lshr i32 %x1, %y
  %dbl = shl i32 %x0, 1
  %inv = xor i32 %y, 31
  %hi = shl i32 %dbl, %inv
  %res = add i32 %lo, %hi
  ret i32 %res
}

; Rotate hidden behind two unsigned divisions: ((i / 3) << 60) + (i / 48).
; Since 48 == 3 * 16, i / 48 equals (i / 3) >> 4, and 60 + 4 == 64, so the
; sum is a 64-bit rotate of i / 3. The expected PTX computes the division
; once (mul.hi.u64 followed by shr.u64 by 1), splits the result into 32-bit
; halves, and combines them with two shf.l.wrap.b32 by 28.
define i64 @test_rotl_udiv_special_case(i64 %i) {
; CHECK-LABEL: test_rotl_udiv_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [test_rotl_udiv_special_case_param_0];
; CHECK-NEXT:    mul.hi.u64 %rd2, %rd1, -6148914691236517205;
; CHECK-NEXT:    shr.u64 %rd3, %rd2, 1;
; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd3;
; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r2, %r1, 28;
; CHECK-NEXT:    shf.l.wrap.b32 %r4, %r1, %r2, 28;
; CHECK-NEXT:    mov.b64 %rd4, {%r4, %r3};
; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
; CHECK-NEXT:    ret;
  %div3 = udiv i64 %i, 3
  %div48 = udiv i64 %i, 48
  %hi = shl i64 %div3, 60
  %rot = add i64 %hi, %div48
  ret i64 %rot
}

; Rotate hidden behind two multiplications: ((i * 9) >> 25) + (i * 1152).
; Since 1152 == 9 << 7, i * 1152 equals (i * 9) << 7, and 7 + 25 == 32, so
; the sum is a rotate-left by 7 of i * 9. The expected PTX is one mul.lo.s32
; by 9 followed by one shf.l.wrap.b32 by 7.
define i32 @test_rotl_mul_special_case(i32 %i) {
; CHECK-LABEL: test_rotl_mul_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_rotl_mul_special_case_param_0];
; CHECK-NEXT:    mul.lo.s32 %r2, %r1, 9;
; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r2, %r2, 7;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
; CHECK-NEXT:    ret;
  %mul9 = mul i32 %i, 9
  %mul1152 = mul i32 %i, 1152
  %lo = lshr i32 %mul9, 25
  %rot = add i32 %lo, %mul1152
  ret i32 %rot
}

; Masked variant of the multiply rotate: ((i * 1152) & 160) + ((i * 9) >> 57).
; i * 1152 equals (i * 9) << 7, so its low 7 bits are zero and the mask 160
; can only keep bit 7; (i * 9) >> 57 occupies bits 0..6. The expected PTX
; multiplies by 9, applies shf.l.wrap.b32 by 7 on 32-bit halves, and masks
; the recombined 64-bit value with 255.
define i64 @test_rotl_mul_with_mask_special_case(i64 %i) {
; CHECK-LABEL: test_rotl_mul_with_mask_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<7>;
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b64 %rd1, [test_rotl_mul_with_mask_special_case_param_0];
; CHECK-NEXT:    mul.lo.s64 %rd2, %rd1, 9;
; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
; CHECK-NEXT:    shf.l.wrap.b32 %r5, %r4, %r1, 7;
; CHECK-NEXT:    shf.l.wrap.b32 %r6, %r1, %r2, 7;
; CHECK-NEXT:    mov.b64 %rd3, {%r5, %r6};
; CHECK-NEXT:    and.b64 %rd4, %rd3, 255;
; CHECK-NEXT:    st.param.b64 [func_retval0], %rd4;
; CHECK-NEXT:    ret;
  %mul1152 = mul i64 %i, 1152
  %mul9 = mul i64 %i, 9
  %hi = and i64 %mul1152, 160
  %lo = lshr i64 %mul9, 57
  %res = add i64 %hi, %lo
  ret i64 %res
}

; Masked funnel shift: ((x | 1) << 5) + ((x >> 27) & 1).
; The left-shifted term has its low 5 bits clear and the right-shifted term
; keeps only bit 0, so the two addends share no set bits. The expected PTX
; is an or with 1, one shf.l.wrap.b32 by 5 taking x and (x | 1) as data
; operands, and a final and with -31 (all bits except bits 1..4).
define i32 @test_fshl_with_mask_special_case(i32 %x) {
; CHECK-LABEL: test_fshl_with_mask_special_case(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [test_fshl_with_mask_special_case_param_0];
; CHECK-NEXT:    or.b32 %r2, %r1, 1;
; CHECK-NEXT:    shf.l.wrap.b32 %r3, %r1, %r2, 5;
; CHECK-NEXT:    and.b32 %r4, %r3, -31;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %odd = or i32 %x, 1
  %hi = shl i32 %odd, 5
  %top = lshr i32 %x, 27
  %bit = and i32 %top, 1
  %res = add i32 %hi, %bit
  ret i32 %res
}
