#!/bin/sh
# Detect platform-specific compiler flags for robscale

# Probe the C++ compiler for -fopenmp-simd by compiling a trivial program.
# Sets OPENMP_SIMD_FLAG and OPENMP_SIMD_DEFINE together: both are filled in
# when the compile succeeds and both stay empty when it fails.  The values
# are substituted into src/Makevars at the end of this script.
# ${CXX:-c++} is deliberately unquoted so a CXX that carries extra flags
# (e.g. "g++ -std=gnu++17") is split into separate words.
printf '%s\n' "int main(){return 0;}" > conftest.cpp
if ! ${CXX:-c++} -fopenmp-simd -c conftest.cpp -o conftest.o 2>/dev/null; then
  OPENMP_SIMD_FLAG=""
  OPENMP_SIMD_DEFINE=""
  echo "  -fopenmp-simd not supported, skipping"
else
  OPENMP_SIMD_FLAG="-fopenmp-simd"
  OPENMP_SIMD_DEFINE="-DROBSCALE_HAS_OMP_SIMD"
  echo "  -fopenmp-simd supported"
fi
# Remove the probe source and object regardless of the outcome.
rm -f conftest.o conftest.cpp

# Detect the SLEEF vector math library.
# Outputs:
#   HAS_SLEEF     "yes" or "no"
#   SLEEF_CFLAGS  include path plus -DROBSCALE_HAS_SLEEF, or empty
#   SLEEF_LIBS    linker flags, or empty
# Both flag variables are reset up front: without that, values inherited
# from the caller's environment would be substituted into src/Makevars
# whenever detection is skipped (macOS) or finds nothing.
HAS_SLEEF="no"
SLEEF_CFLAGS=""
SLEEF_LIBS=""

# Skip SLEEF on macOS as Accelerate is faster and preferred
if [ "$(uname -s)" != "Darwin" ]; then
  # Check standard paths first; the header's presence decides the match.
  for prefix in /usr /usr/local /opt/homebrew; do
    if [ -f "${prefix}/include/sleef.h" ]; then
      SLEEF_CFLAGS="-I${prefix}/include -DROBSCALE_HAS_SLEEF"
      SLEEF_LIBS="-L${prefix}/lib -lsleef"
      HAS_SLEEF="yes"
      break
    fi
  done

  # Fallback to pkg-config if not found in standard paths.
  # stderr is silenced so a missing pkg-config binary is treated as "absent".
  if [ "${HAS_SLEEF}" = "no" ]; then
    if pkg-config --exists sleef 2>/dev/null; then
      SLEEF_CFLAGS="$(pkg-config --cflags sleef) -DROBSCALE_HAS_SLEEF"
      SLEEF_LIBS=$(pkg-config --libs sleef)
      HAS_SLEEF="yes"
    fi
  fi
fi

if [ "${HAS_SLEEF}" = "yes" ]; then
  echo "  SLEEF detected"
else
  echo "  SLEEF not detected, falling back to Accelerate/OpenMP"
fi

# Probe for glibc libmvec's _ZGVdN4v_tanh (Linux x86_64, glibc >= 2.35).
# libmvec is 25-50% faster than SLEEF for 4-wide double tanh on Zen/Skylake
# and is independent of SLEEF: it is a glibc-native AVX2 tanh that needs no
# third-party library.  ROBSCALE_HAS_AVX2_DISPATCH (compiler-derived in
# robscale_config.h) provides the target-attribute infrastructure.
#
# Outputs: HAS_GLIBC_MVEC ("yes"/"no"), GLIBC_MVEC_CFLAGS, GLIBC_MVEC_LIBS.
# The probe runs only when uname reports Linux and has two stages:
#   1. compile a program that #errors out unless glibc >= 2.35;
#   2. link a program that calls _ZGVdN4v_tanh against -lmvec.
HAS_GLIBC_MVEC="no"
GLIBC_MVEC_CFLAGS=""
GLIBC_MVEC_LIBS=""
case "$(uname -s)" in
  Linux)
    # Stage 1: the versioned symbol _ZGVdN4v_tanh@GLIBC_2.35 needs glibc >= 2.35
    cat > conftest_glibc.c << 'CFEOF'
#include <features.h>
#if !(__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 35))
#error "glibc < 2.35"
#endif
int main(void) { return 0; }
CFEOF
    if ! ${CC:-gcc} conftest_glibc.c -o conftest_glibc 2>/dev/null; then
      echo "  glibc < 2.35, libmvec not available"
    else
      # Stage 2: confirm the symbol actually links from libmvec
      cat > conftest_mvec.c << 'CFEOF'
typedef double v4d __attribute__((vector_size(32)));
extern v4d _ZGVdN4v_tanh(v4d);
int main(void) { v4d x = {0.5,0.5,0.5,0.5}; v4d r = _ZGVdN4v_tanh(x); return (int)r[0]; }
CFEOF
      if ! ${CC:-gcc} conftest_mvec.c -lmvec -o conftest_mvec 2>/dev/null; then
        echo "  libmvec not available"
      else
        HAS_GLIBC_MVEC="yes"
        GLIBC_MVEC_CFLAGS="-DROBSCALE_HAS_GLIBC_MVEC"
        GLIBC_MVEC_LIBS="-lmvec"
        echo "  glibc libmvec _ZGVdN4v_tanh detected (preferred AVX2 tanh)"
      fi
      rm -f conftest_mvec conftest_mvec.c
    fi
    # Stage-1 artifacts are removed whether or not stage 2 ran.
    rm -f conftest_glibc conftest_glibc.c
    ;;
esac

# Probe for glibc libmvec's _ZGVeN8v_tanh (AVX-512 8-wide tanh, Linux x86_64).
# The symbol comes from the same libmvec as the 4-wide variant, so no extra
# -l flag is needed once GLIBC_MVEC_LIBS is set.  On hardware without
# AVX-512 (Zen 3) the symbol links but is never called; CPUID dispatches to
# the AVX2 path.
#
# Outputs: HAS_AVX512_TANH ("yes"/"no") and AVX512_TANH_CFLAGS.
# The link test is attempted only when the 4-wide probe already succeeded
# (HAS_GLIBC_MVEC is "yes").
HAS_AVX512_TANH="no"
AVX512_TANH_CFLAGS=""
case "${HAS_GLIBC_MVEC}" in
  yes)
    cat > conftest_mvec512.c << 'CFEOF'
typedef double v8d __attribute__((vector_size(64)));
extern v8d _ZGVeN8v_tanh(v8d);
int main(void) { v8d x = {0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5}; v8d r = _ZGVeN8v_tanh(x); return (int)r[0]; }
CFEOF
    if ! ${CC:-gcc} conftest_mvec512.c -lmvec -o conftest_mvec512 2>/dev/null; then
      echo "  _ZGVeN8v_tanh not available (libmvec too old or not AVX-512 enabled)"
    else
      HAS_AVX512_TANH="yes"
      AVX512_TANH_CFLAGS="-DROBSCALE_HAS_AVX512_TANH"
      echo "  glibc libmvec _ZGVeN8v_tanh detected (AVX-512 8-wide tanh)"
    fi
    rm -f conftest_mvec512 conftest_mvec512.c
    ;;
esac

# macOS Accelerate framework (vForce for vectorized tanh).
# ACCELERATE_LIBS carries the framework link flag when uname reports Darwin
# and is empty on every other platform.
case "$(uname -s)" in
  Darwin)
    ACCELERATE_LIBS="-framework Accelerate"
    echo "  macOS detected, linking Accelerate framework"
    ;;
  *)
    ACCELERATE_LIBS=""
    ;;
esac

# ── TBB backend detection ────────────────────────────────────────────────────
# Priority 1: system oneTBB  (explicit -ltbb, modern scheduler)
# Priority 2: RcppParallel bundled TBB  (explicit link to its libtbb)
# Priority 3: OpenMP thread-level parallel (#pragma omp parallel fallback)
#
# Outputs:
#   TBB_DEFINE           preprocessor define selecting the backend (plus any
#                        include flags the backend needs)
#   TBB_LIBS             linker flags for TBB, empty for the OpenMP fallback
#   OMP_PARALLEL_CFLAGS  '$(SHLIB_OPENMP_CXXFLAGS)' for the OpenMP fallback
#   OMP_PARALLEL_LIBS    '$(SHLIB_OPENMP_CXXFLAGS)' for the OpenMP fallback
TBB_DEFINE=""
TBB_LIBS=""
OMP_PARALLEL_CFLAGS=""
OMP_PARALLEL_LIBS=""

SYSTEM_TBB_LIB=""
SYSTEM_TBB_LIBFILE=""
SYSTEM_TBB_HEADER=""
# Look for the shared library under both its ELF (.so) and Mach-O (.dylib)
# names and remember which file matched so the status message is accurate.
for libdir in /usr/lib /usr/lib64 /usr/lib/x86_64-linux-gnu \
              /usr/lib/aarch64-linux-gnu /usr/lib/arm-linux-gnueabihf \
              /usr/local/lib /opt/homebrew/lib; do
  for libname in libtbb.so libtbb.dylib; do
    if [ -f "${libdir}/${libname}" ]; then
      SYSTEM_TBB_LIB="${libdir}"
      SYSTEM_TBB_LIBFILE="${libdir}/${libname}"
      break 2
    fi
  done
done
# The oneapi/ header layout is what identifies a oneTBB installation here.
for incdir in /usr/include /usr/local/include /opt/homebrew/include; do
  if [ -f "${incdir}/oneapi/tbb/parallel_reduce.h" ]; then
    SYSTEM_TBB_HEADER="${incdir}"
    break
  fi
done

# pkg-config fallback if path search failed
if [ -z "${SYSTEM_TBB_LIB}" ] || [ -z "${SYSTEM_TBB_HEADER}" ]; then
  if pkg-config --exists tbb 2>/dev/null; then
    # The literal "pkg-config" is a sentinel telling the dispatch below that
    # TBB_DEFINE and TBB_LIBS are already final.
    SYSTEM_TBB_LIB="pkg-config"
    SYSTEM_TBB_HEADER="pkg-config"
    TBB_DEFINE="-DROBSCALE_HAS_SYSTEM_TBB $(pkg-config --cflags tbb)"
    TBB_LIBS="$(pkg-config --libs tbb)"
    echo "  System oneTBB detected via pkg-config"
  fi
fi

if [ "${SYSTEM_TBB_LIB}" = "pkg-config" ]; then
  : # TBB_DEFINE and TBB_LIBS already set above by pkg-config
elif [ -n "${SYSTEM_TBB_LIB}" ] && [ -n "${SYSTEM_TBB_HEADER}" ]; then
  TBB_DEFINE="-DROBSCALE_HAS_SYSTEM_TBB"
  # Headers found outside /usr/include (e.g. /opt/homebrew/include) are not
  # guaranteed to be on the compiler's default search path, so pass the
  # directory explicitly, mirroring what the pkg-config branch does.
  if [ "${SYSTEM_TBB_HEADER}" != "/usr/include" ]; then
    TBB_DEFINE="${TBB_DEFINE} -I${SYSTEM_TBB_HEADER}"
  fi
  TBB_LIBS="-L${SYSTEM_TBB_LIB} -ltbb"
  echo "  System oneTBB detected (${SYSTEM_TBB_LIBFILE}), using -ltbb"
else
  RCPP_PAR_LIB=""
  if [ -n "${R_HOME}" ]; then
    # Ask R where RcppParallel keeps its bundled libraries; any failure
    # (package missing, Rscript missing) yields an empty string.
    RCPP_PAR_LIB=$("${R_HOME}/bin/Rscript" --no-save -e \
      "cat(system.file('lib', package='RcppParallel', mustWork=FALSE))" \
      2>/dev/null || true)
  fi
  # Accept both library suffixes, consistent with the system search above.
  if [ -n "${RCPP_PAR_LIB}" ] && \
     { [ -f "${RCPP_PAR_LIB}/libtbb.so" ] || \
       [ -f "${RCPP_PAR_LIB}/libtbb.dylib" ]; }; then
    TBB_DEFINE="-DUSE_DIRECT_TBB"
    TBB_LIBS="-L${RCPP_PAR_LIB} -ltbb -Wl,-rpath,${RCPP_PAR_LIB}"
    echo "  RcppParallel TBB detected at ${RCPP_PAR_LIB}, using -ltbb"
  else
    TBB_DEFINE="-DROBSCALE_HAS_OMP_PARALLEL"
    # Single quotes keep these as make variables to be expanded by R's build.
    OMP_PARALLEL_CFLAGS='$(SHLIB_OPENMP_CXXFLAGS)'
    OMP_PARALLEL_LIBS='$(SHLIB_OPENMP_CXXFLAGS)'
    echo "  No TBB found, using OpenMP parallel fallback"
  fi
fi

# Force recompilation when flags change (make doesn't track PKG_CXXFLAGS)
rm -f src/*.o src/robscale.so

# Escape a value for use as the replacement text of a sed "s|...|...|"
# command.  Backslash, "&" (whole-match reference) and the "|" delimiter are
# the characters sed would otherwise interpret.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout
sed_escape() {
  printf '%s\n' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Generate Makevars from template.
# The template must exist; without this check the redirection below would
# silently produce an empty Makevars.
if [ ! -f src/Makevars.in ]; then
  echo "  ERROR: src/Makevars.in not found" >&2
  exit 1
fi

# Write to a temporary file first so a failed substitution never leaves a
# truncated src/Makevars behind, and report success only when both the
# substitution and the rename succeeded.
if sed -e "s|@OPENMP_SIMD_FLAG@|$(sed_escape "${OPENMP_SIMD_FLAG}")|" \
       -e "s|@OPENMP_SIMD_DEFINE@|$(sed_escape "${OPENMP_SIMD_DEFINE}")|" \
       -e "s|@ACCELERATE_LIBS@|$(sed_escape "${ACCELERATE_LIBS}")|" \
       -e "s|@SLEEF_CFLAGS@|$(sed_escape "${SLEEF_CFLAGS}")|" \
       -e "s|@SLEEF_LIBS@|$(sed_escape "${SLEEF_LIBS}")|" \
       -e "s|@TBB_DEFINE@|$(sed_escape "${TBB_DEFINE}")|" \
       -e "s|@TBB_LIBS@|$(sed_escape "${TBB_LIBS}")|" \
       -e "s|@OMP_PARALLEL_CFLAGS@|$(sed_escape "${OMP_PARALLEL_CFLAGS}")|" \
       -e "s|@OMP_PARALLEL_LIBS@|$(sed_escape "${OMP_PARALLEL_LIBS}")|" \
       -e "s|@GLIBC_MVEC_CFLAGS@|$(sed_escape "${GLIBC_MVEC_CFLAGS}")|" \
       -e "s|@GLIBC_MVEC_LIBS@|$(sed_escape "${GLIBC_MVEC_LIBS}")|" \
       -e "s|@AVX512_TANH_CFLAGS@|$(sed_escape "${AVX512_TANH_CFLAGS}")|" \
       src/Makevars.in > src/Makevars.tmp \
   && mv -f src/Makevars.tmp src/Makevars; then
  echo "  Generated src/Makevars"
else
  rm -f src/Makevars.tmp
  echo "  ERROR: failed to generate src/Makevars" >&2
  exit 1
fi
