#!/usr/bin/env bash
# =============================================================================
# Bonsai Ternary LLM — One-Shot Setup for CPU-Only (no GPU, no CUDA)
# Linux (Ubuntu/Debian/Fedora/Arch)
#
# Usage:
#   chmod +x setup.sh && ./setup.sh            # Q2_0 ternary (recommended)
#   ./setup.sh --quant q4_0                     # Q4_0-lossless (fork-free)
#   ./setup.sh --no-download                    # build only, skip the model
#
# What it does: installs deps, builds a clean CPU-only binary set from the
# Prism llama.cpp fork (-DGGML_CUDA=OFF, so the binaries have zero CUDA
# dependencies — nothing to put on LD_LIBRARY_PATH later), downloads a Bonsai
# model, writes a `bonsai-chat.sh` helper (aliased as `bonsai` in ~/.bashrc),
# and runs a test prompt. No GPU or NVIDIA driver required.
#
# The Prism fork loads BOTH the native Q2_0 ternary format and the
# Q4_0-lossless format, so this one build covers either quant.
# =============================================================================
set -euo pipefail

# ─── Config ──────────────────────────────────────────────────────────────────
# Filesystem layout: the fork's checkout/build tree and the model directory.
INSTALL_DIR="$HOME/llama.cpp-cpu-only"
MODELS_DIR="$HOME/models/bonsai"

# Defaults. --quant and --no-download override QUANT and DOWNLOAD.
QUANT="q2_0"         # q2_0 (recommended, ternary) | q4_0 (fork-free)
DOWNLOAD=1           # 1 = fetch the model, 0 = build only
THREADS="$(nproc)"   # provisional; re-derived from physical cores further down

# Model sources on Hugging Face and where each one is stored locally.
HF_BASE="https://huggingface.co"
Q2_NAME="Ternary-Bonsai-8B-Q2_0.gguf"
Q2_URL="${HF_BASE}/prism-ml/Ternary-Bonsai-8B-gguf/resolve/main/${Q2_NAME}?download=1"
Q2_FILE="${MODELS_DIR}/${Q2_NAME}"
# The Q4 model is saved under a different local name than its remote name.
Q4_URL="${HF_BASE}/Minarut/Ternary-Bonsai-8B-GGUF-llamacpp-compatible/resolve/main/Ternary-Bonsai-8B-Q4_0-lossless.gguf"
Q4_FILE="${MODELS_DIR}/Bonsai-8B-Q4_0-lossless.gguf"

# ─── Parse args ──────────────────────────────────────────────────────────────
# Parse command-line options into the QUANT and DOWNLOAD globals.
#
# Options:
#   --quant <q2_0|q4_0>  quantization to download and run; an empty value
#                        keeps the q2_0 default
#   --no-download        build only; sets DOWNLOAD=0
#
# A missing --quant value, an unsupported quant name, or an unknown option
# prints a message on stderr and exits with status 1.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --quant)
        # A trailing --quant would make `shift 2` fail; under `set -e` that
        # ends the script without any explanation, so report it explicitly.
        if [ $# -lt 2 ]; then
          printf '%s\n' "Missing value for --quant (expected q2_0 or q4_0)" >&2
          exit 1
        fi
        QUANT="${2:-q2_0}"
        # Reject anything else here: an unrecognized name would otherwise be
        # treated as q2_0 when the model is selected.
        case "$QUANT" in
          q2_0 | q4_0) ;;
          *)
            printf '%s\n' "Unsupported --quant value: $QUANT (expected q2_0 or q4_0)" >&2
            exit 1
            ;;
        esac
        shift 2
        ;;
      --no-download)
        DOWNLOAD=0
        shift
        ;;
      *)
        printf '%s\n' "Unknown arg: $1" >&2
        exit 1
        ;;
    esac
  done
}
parse_args "$@"

# ─── Color output ────────────────────────────────────────────────────────────
# ANSI color codes, stored as real escape bytes so they print with plain %s.
RED=$'\033[0;31m'; GREEN=$'\033[0;32m'; YELLOW=$'\033[1;33m'; NC=$'\033[0m'

# Logging helpers. Each takes the message as $1 and prints it verbatim:
# printf '%s' is used instead of `echo -e` so backslash sequences that happen
# to occur in the message (for example inside a path) are not interpreted.
#   info  — green  [✓] line on stdout
#   warn  — yellow [!] line on stdout
#   err   — red    [✗] line on stderr
#   step  — blank line followed by a yellow section banner on stdout
info()  { printf '%s[✓]%s %s\n' "$GREEN" "$NC" "${1-}"; }
warn()  { printf '%s[!]%s %s\n' "$YELLOW" "$NC" "${1-}"; }
err()   { printf '%s[✗]%s %s\n' "$RED" "$NC" "${1-}" >&2; }
step()  { printf '\n%s══ %s ══%s\n' "$YELLOW" "${1-}" "$NC"; }

# ─── 1. Install system deps ─────────────────────────────────────────────────
step "1/4 — Installing system dependencies"

# Run a command with root privileges: directly when the script is already
# running as root (so it also works where no sudo binary is installed),
# through sudo otherwise.
as_root() {
  if [ "$(id -u)" -eq 0 ]; then
    "$@"
  else
    sudo "$@"
  fi
}

# Use the first package manager found, in the order apt-get, dnf, pacman.
if command -v apt-get &>/dev/null; then
  # `env VAR=…` sets the variable for apt-get on both the root and sudo paths.
  as_root env DEBIAN_FRONTEND=noninteractive apt-get update -qq
  as_root env DEBIAN_FRONTEND=noninteractive apt-get install -y -qq git cmake build-essential curl
elif command -v dnf &>/dev/null; then
  as_root dnf install -y git cmake make gcc-c++ curl
elif command -v pacman &>/dev/null; then
  # --needed skips packages that are already installed and up to date.
  as_root pacman -S --needed --noconfirm git cmake base-devel curl
else
  warn "Unknown package manager — ensure git, cmake, make, g++, curl are installed"
fi
info "System dependencies ready (no CUDA needed)"

# Pick a sane default thread count: physical cores, not SMT threads.
# Generation is memory-bandwidth-bound and prefill peaks around physical-core
# count, so oversubscribing SMT threads only hurts.

# Print the physical core count (cores per socket × sockets) reported by lscpu.
# Returns 1 and prints nothing when lscpu is unavailable or does not report
# both fields as positive integers (e.g. a placeholder instead of a number),
# so a malformed value can never reach the arithmetic expansion.
physical_core_count() {
  local cores="" sockets=""
  command -v lscpu &>/dev/null || return 1
  # LC_ALL=C pins the English field labels the awk patterns match on.
  # A missing field is emitted as "x" so the two values cannot shift position
  # when they are read back.
  read -r cores sockets < <(
    LC_ALL=C lscpu 2>/dev/null | awk -F: '
      /^Core\(s\) per socket/ { gsub(/[ \t]/, "", $2); c = $2 }
      /^Socket\(s\)/          { gsub(/[ \t]/, "", $2); s = $2 }
      END { if (c == "") c = "x"; if (s == "") s = "x"; print c, s }'
  ) || true
  [[ "$cores" =~ ^[1-9][0-9]*$ && "$sockets" =~ ^[1-9][0-9]*$ ]] || return 1
  printf '%s\n' "$(( cores * sockets ))"
}

# Print the thread count to use: the physical core count when it is known and
# smaller than what nproc reports, otherwise nproc's figure. Capping at nproc
# keeps the count within the CPUs this process may actually use.
default_thread_count() {
  local logical physical
  logical="$(nproc)"
  if physical="$(physical_core_count)" && [ "$physical" -lt "$logical" ]; then
    printf '%s\n' "$physical"
  else
    printf '%s\n' "$logical"
  fi
}

THREADS="$(default_thread_count)"
# Last-resort guard: never continue with an empty or non-positive value.
[ "${THREADS:-0}" -ge 1 ] 2>/dev/null || THREADS="$(nproc)"
info "Using ${THREADS} inference threads (physical cores)"

# ─── 2. Clone and build the Prism llama.cpp fork (CPU-only) ─────────────────
step "2/4 — Building the Prism llama.cpp fork, CPU-only (3-10 min)"

# Always start from a fresh checkout: remove whatever an earlier run left.
[ ! -d "$INSTALL_DIR" ] || {
  warn "Removing existing build at $INSTALL_DIR"
  rm -rf "$INSTALL_DIR"
}

# Shallow clone of the fork's `prism` branch; later paths are relative to it.
FORK_URL="https://github.com/PrismML-Eng/llama.cpp.git"
git clone --depth 1 -b prism "$FORK_URL" "$INSTALL_DIR"
cd "$INSTALL_DIR"

# Configure options:
#   GGML_CUDA=OFF   → pure CPU binary, zero CUDA deps (no LD_LIBRARY_PATH later).
#   GGML_NATIVE=ON  → target this CPU's instruction set (AVX2 etc.), which the
#                     ternary Q2_0 kernel leans on.
BUILD_DIR="build-cpu"
CMAKE_FLAGS=(
  -DGGML_CUDA=OFF
  -DGGML_NATIVE=ON
  -DCMAKE_BUILD_TYPE=Release
)
cmake -B "$BUILD_DIR" "${CMAKE_FLAGS[@]}"

# One compile job per CPU reported by nproc; swap in -j4 on the next line if
# you want to keep the machine responsive during the build.
cmake --build "$BUILD_DIR" -j"$(nproc)" --target llama-cli llama-bench llama-server

# The rest of the script drives this binary, so stop here if it is missing.
CLI="$INSTALL_DIR/$BUILD_DIR/bin/llama-cli"
if [ ! -f "$CLI" ]; then
  err "Build failed — llama-cli not found"
  exit 1
fi

# Sanity-check: count "not found" lines in ldd output; none should remain.
# `|| true` absorbs grep's non-zero status when the count is zero.
MISSING=$(ldd "$CLI" 2>/dev/null | grep -c "not found" || true)
if [ "${MISSING:-0}" -eq 0 ]; then
  info "CPU-only build OK — no missing/CUDA libraries"
else
  warn "ldd reports ${MISSING} missing libraries — check the build"
fi

# ─── 3. Download model ──────────────────────────────────────────────────────
mkdir -p "$MODELS_DIR"

# Select the model file, its URL and the size shown to the user.
# Anything other than q4_0 selects the Q2_0 model.
if [ "$QUANT" = "q4_0" ]; then
  MODEL_FILE="$Q4_FILE"; MODEL_URL="$Q4_URL"; SIZE="4.3 GB"
else
  MODEL_FILE="$Q2_FILE"; MODEL_URL="$Q2_URL"; SIZE="2 GB"
fi

if [ "$DOWNLOAD" -eq 1 ]; then
  step "3/4 — Downloading Bonsai 8B ${QUANT} (${SIZE})"
  if [ -f "$MODEL_FILE" ]; then
    warn "Model already exists, skipping ($(du -h "$MODEL_FILE" | cut -f1))"
  else
    echo "  This is a ${SIZE} download — it will take a while."
    # Download into a sibling .part file and rename it only after curl
    # succeeds. Writing straight to $MODEL_FILE would leave a truncated file
    # after an interrupted transfer, and the existence check above would then
    # skip it on the next run instead of resuming.
    #   -f             fail on HTTP errors instead of saving the error page
    #   -C -           resume from the size of an existing .part file
    #   --progress-bar show a progress bar
    PARTIAL_FILE="${MODEL_FILE}.part"
    if ! curl -fL -C - --progress-bar -o "$PARTIAL_FILE" "$MODEL_URL"; then
      err "Download failed — re-run to resume from ${PARTIAL_FILE}"
      exit 1
    fi
    mv -f "$PARTIAL_FILE" "$MODEL_FILE"
    info "Downloaded: $(du -h "$MODEL_FILE" | cut -f1)"
  fi
else
  step "3/4 — Skipping model download (--no-download)"
fi

# ─── 4. Helper + test ───────────────────────────────────────────────────────
step "4/4 — Creating helper and running a test"

# Generate the wrapper script. The heredoc delimiter is unquoted on purpose:
# $CLI, $MODEL_FILE and ${THREADS} are expanded now and written as fixed
# values, while the backslash-escaped \$… expansions are left for the wrapper
# to evaluate each time it runs. The trailing backslash-newline is consumed by
# the heredoc, so the generated command ends up on a single line.
HELPER="$MODELS_DIR/bonsai-chat.sh"
cat > "$HELPER" << SCRIPT
#!/bin/bash
CLI="$CLI"
MODEL="$MODEL_FILE"
# -ngl 0 keeps every layer on the CPU. -t ${THREADS} matches physical cores.
# -fa 1 is flash attention. -e makes a one-shot -p prompt print and exit.
"\$CLI" -m "\$MODEL" -ngl 0 -t ${THREADS} -fa 1 \
  -p "\${*:-Hello! What is the capital of France?}" -n 200 -e
SCRIPT
chmod +x "$HELPER"

# Register the alias once: skip when ~/.bashrc already mentions the helper
# (a missing ~/.bashrc counts as "not mentioned" and gets created).
grep -q "bonsai-chat" "$HOME/.bashrc" 2>/dev/null || {
  printf '\n# Bonsai ternary LLM helper (CPU-only)\nalias bonsai="%s/bonsai-chat.sh"\n' "$MODELS_DIR" >> "$HOME/.bashrc"
  info "Added 'bonsai' alias to ~/.bashrc"
}

# Smoke test: only possible when the selected model file is on disk.
if [ ! -f "$MODEL_FILE" ]; then
  warn "No model present — download one, then run: $MODELS_DIR/bonsai-chat.sh \"Hi\""
else
  echo "--- Test prompt (CPU-only, this is slow — ~3 tok/s generation) ---"
  # stderr is merged into stdout and only the last six lines are shown.
  RUN_ARGS=(-m "$MODEL_FILE" -ngl 0 -t "${THREADS}" -fa 1)
  "$CLI" "${RUN_ARGS[@]}" \
    -p "Hello! What is the capital of France?" -n 64 -e --no-display-prompt 2>&1 | tail -6
  echo ""
  info "Done. Usage:  bonsai \"Your question\"   (run 'source ~/.bashrc' first)"
  printf '%s\n' \
    "  Watch CPU load during a run with:  htop" \
    "  Serve an OpenAI-compatible API:    $INSTALL_DIR/build-cpu/bin/llama-server \\" \
    "                                       -m $MODEL_FILE -ngl 0 -t ${THREADS} -fa 1 -c 4096 --port 8080"
fi