Functor GPU JIT (CUDA)¶
This notebook is about runtime GPU JIT for `functor_obj`: set `f.device = "gpu"` and ESCAPE will generate, nvcc-compile, and cache a CUDA kernel for vectorized array evaluation. There is no Python `compile(..., gpu=True)` and no separate "GPU compile" step in your code; the JIT runs inside the library when you select the GPU device.
- GPU path: `f.device = "gpu"` → JIT + kernel cache; arrays can run on the device.
- CPU path: `f.device = "cpu"` (default) → no CUDA launch for array eval.
Optional — CPU native extension: compiler.ipynb shows `compile(f)`, which runs a one-time C++ build of a CPU-only handler. That is unrelated to GPU JIT; many examples below still call `compile(f)` only to get a fast CPU baseline next to the JIT GPU lane.
Requirements¶
- NVIDIA GPU with a supported driver.
- CUDA Toolkit (`nvcc` on `PATH`, or `CUDA_HOME`/`CUDA_PATH`/`CUDAHOME` set).
- On Windows, Visual Studio build tools (the same as for building ESCAPE) so `nvcc` can invoke `cl.exe`.
The first time you use a functor on GPU after `f.device = "gpu"`, the JIT may take tens of seconds (CUDA codegen + nvcc + cache). Later calls reuse the cache. If you also use `compile(f)` for a CPU baseline, the first `compile(f)` costs a few seconds (CPU extension build only).
Note: if CUDA is missing, `fc.device = "gpu"` fails when the JIT tries to build the kernel (e.g. the toolkit is not on `PATH`).

Restart / Run all: the first code cell imports ESCAPE, runs `esc.require`, and prints whether CUDA was found — it does not stop the kernel. (Older versions raised `SystemExit`, which could kill the kernel, so no later cells ran and you saw no results.) If CUDA is not detected, add NVIDIA's `bin` directory to the environment Jupyter uses (the notebook's `PATH` may differ from your terminal's).
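If the notebook's environment lacks CUDA, one workaround is to prepend the toolkit's `bin` directory to `PATH` before importing ESCAPE. The path below is only an example of a typical Windows install location; adjust it to your system:

```python
import os

# Example location only: adjust drive and version directory to your install.
cuda_bin = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin"

if os.path.isdir(cuda_bin) and cuda_bin not in os.environ.get("PATH", ""):
    # Prepend so this nvcc is found first by subprocess launches.
    os.environ["PATH"] = cuda_bin + os.pathsep + os.environ["PATH"]
```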
import shutil
import os
import glob

def cuda_toolkit_available() -> bool:
    if shutil.which("nvcc") is not None:
        return True
    for k in ("CUDAHOME", "CUDA_HOME", "CUDA_PATH"):
        v = os.environ.get(k)
        if v and os.path.isdir(v):
            return True
    # Jupyter kernels often lack CUDA on PATH; probe default install locations (Windows)
    for base in (
        os.environ.get("CUDA_PATH", ""),
        os.path.join(os.environ.get("ProgramFiles", r"C:\Program Files"), "NVIDIA GPU Computing Toolkit", "CUDA"),
    ):
        if not base or not os.path.isdir(base):
            continue
        for nvcc in glob.glob(os.path.join(base, "v*", "bin", "nvcc.exe")):
            if os.path.isfile(nvcc):
                return True
    return False

HAVE_CUDA = cuda_toolkit_available()
if HAVE_CUDA:
    print("CUDA toolkit: OK (nvcc or CUDA install found).")
else:
    print(
        "WARNING: CUDA toolkit not detected. Add the CUDA `bin` folder to PATH or set CUDA_PATH. "
        "Cells that set fc.device = 'gpu' will fail until CUDA is visible."
    )
import numpy as np
import escape as esc
from escape.core.compilers import compile
esc.require("0.9.8")
CUDA toolkit: OK (nvcc or CUDA install found). Loading material database from C:\dev\escape-core\python\src\escape\scattering\..\data\mdb\materials.db
Quick start (JIT only)¶
Build an ordinary functor and switch the device to GPU — no compile() required for CUDA:
- Array evaluation can use the JIT kernel when `device` is `"gpu"`.
- Scalar calls still use the efficient CPU-side path for the expression tree (or a compiled handler if you used `compile()` elsewhere).
x = esc.var("x")
f_jit = esc.sin(x) + 1.0
f_jit.device = "gpu"
print("Scalar f_jit(0.5) =", f_jit(0.5))
print("device:", f_jit.device)
f_jit.show()
How it works (short)¶
- `f.device = "gpu"`: the functor handler enters GPU mode; on demand, ESCAPE emits CUDA, runs `nvcc`, `dlopen`s the module, and caches the result (JIT service).
- `compile(f)` (optional): the same CPU-only pipeline as compiler.ipynb; it builds a native host extension, not a GPU library.
- Arrays: with device `"gpu"`, vectorized eval can launch the JIT kernel; with `f.device = "cpu"`, there is no CUDA launch.
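ESCAPE's JIT internals are not exposed, but the pipeline described above (emit CUDA, run `nvcc`, `dlopen` the module) can be sketched in outline. Everything below is hypothetical illustration: the kernel, the `eval_array` launch stub, and the `jit_build` helper are invented names, and the demo only builds if `nvcc` is on `PATH`.

```python
import ctypes
import os
import shutil
import subprocess
import tempfile

# Hypothetical generated source: a kernel plus a host-side launch stub that
# performs the malloc/copy/launch/sync/copy/free sequence on every call.
CUDA_SRC = r"""
extern "C" {
__global__ void eval_kernel(const double* x, double* y, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = sin(x[i]) + 1.0;
}
void eval_array(const double* hx, double* hy, int n) {
    double *dx, *dy;
    cudaMalloc(&dx, n * sizeof(double));
    cudaMalloc(&dy, n * sizeof(double));
    cudaMemcpy(dx, hx, n * sizeof(double), cudaMemcpyHostToDevice);
    eval_kernel<<<(n + 255) / 256, 256>>>(dx, dy, n);
    cudaDeviceSynchronize();
    cudaMemcpy(hy, dy, n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dx);
    cudaFree(dy);
}
}
"""


def jit_build(src: str) -> str:
    """Compile CUDA source into a shared library; return the library path."""
    build = tempfile.mkdtemp()
    cu = os.path.join(build, "kernel.cu")
    lib = os.path.join(build, "kernel.so")
    with open(cu, "w") as fh:
        fh.write(src)
    subprocess.run(["nvcc", "--shared", "-Xcompiler", "-fPIC", "-o", lib, cu],
                   check=True)
    return lib


if shutil.which("nvcc"):
    module = ctypes.CDLL(jit_build(CUDA_SRC))  # dlopen the freshly built module
    print("built and loaded:", hasattr(module, "eval_array"))
else:
    print("nvcc not found; skipping the demo build")
```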
Use contiguous float64 arrays for benchmarks to avoid extra copies (ESCAPE may warn on non-owning views).
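A quick NumPy-only check that a benchmark input really is a contiguous, owning float64 buffer (strided views would otherwise force copies). The `as_bench_buffer` helper is just a convenience wrapper for this notebook's examples:

```python
import numpy as np

x = np.linspace(-8.0, 8.0, 1_000_000)
view = x[::2]  # strided, non-owning view: would force a copy on the way in


def as_bench_buffer(a: np.ndarray) -> np.ndarray:
    """Return `a` unchanged if it is already a contiguous float64 array,
    otherwise a contiguous float64 copy."""
    return np.ascontiguousarray(a, dtype=np.float64)


xb = as_bench_buffer(view)
print(xb.flags["C_CONTIGUOUS"], xb.dtype)  # True float64
```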
Why GPU timings can be slower than CPU in %timeit¶
This is expected with the current GPU JIT runtime, not a sign that the GPU is “broken”.
- Heavy fixed cost on every call: each array evaluation runs the JIT launch stub. It `cudaMalloc`s device buffers in the nvcc-built DLL, copies the input from NumPy to the device, launches the kernel, `cudaDeviceSynchronize()`s, copies the output back to the host array, then `cudaFree`s those buffers. That full sequence is what you time; with `-n1 -r1` it is a single run per `%timeit` line (no multi-repeat average).
- CPU compiled is extremely fast: `compile(f)` already generates a tight native loop over contiguous host memory. For moderate sizes (e.g. millions of points), that loop is often faster than a GPU path dominated by per-call setup.
- What would make GPU "win": much larger batches, reused device buffers (no alloc/free per call), async pipelines, and fewer global synchronizations. Those are future optimizations; the first implementation favors correctness and a simple memory model.
- How to interpret benchmarks: for a fair throughput sense, time one large evaluation (or wrap so setup happens outside the timed block). Compare interpreted `f(x)` vs CPU compiled `fc_cpu(x)` vs GPU `fc_gpu(x)` with that in mind.
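A generic way to keep setup outside the timed region when measuring one large evaluation; `np.sin(x) + 1.0` stands in for a functor call here:

```python
import time

import numpy as np

# Setup: allocation and data generation happen before timing starts.
x = np.ascontiguousarray(np.linspace(-8.0, 8.0, 2_000_000))

t0 = time.perf_counter()
y = np.sin(x) + 1.0          # the one evaluation we actually want to measure
elapsed = time.perf_counter() - t0

print(f"{x.size:,} points in {elapsed * 1e3:.1f} ms "
      f"({x.size / elapsed / 1e6:.0f} Mpts/s)")
```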
Note: `device` should read `"gpu"` after you assign `fc_gpu.device = "gpu"` (cluster mode still uses a `tcp://…` endpoint).
Timing cells: `%timeit` is run as `%timeit -n1 -r1` (one loop, one repeat), so each line times a single evaluation with no multi-repeat statistical average.
Example 1 — Gaussian peak (large arrays)¶
$$f(x) = A \exp\!\left(-\frac{(x - \mu)^2}{2\sigma^2}\right)$$
We evaluate `f_gauss` on the CPU twice (a reference plus a repeat check), then set `f_gauss.device = "gpu"` for a third evaluation — that turns on JIT for arrays. This is only to compare the JIT GPU path against the CPU path on identical inputs; the JIT path does not require `compile()` (see Quick start).
We use 1M, 5M, and 10M samples to stress throughput. Timings include host/device work as implemented (unified memory + kernel).
x = esc.var("x")
A = esc.par("A", 1.0)
mu = esc.par("mu", 0.0, userlim=[-1, 1])
sigma = esc.par("sigma", 1.0)
f_gauss = A * esc.exp(-(x - mu) ** 2 / (2 * sigma ** 2))
def bench_array(n: int):
    t = np.linspace(-8.0, 8.0, n, dtype=np.float64)
    xb = np.ascontiguousarray(t)
    # Always compare against CPU baseline (loop may leave device on "gpu").
    f_gauss.device = "cpu"
    y_orig = np.asarray(f_gauss(xb), dtype=np.float64)
    y_cpu = np.asarray(f_gauss(xb), dtype=np.float64)
    f_gauss.device = "gpu"
    y_gpu = np.asarray(f_gauss(xb), dtype=np.float64)
    err_cpu = float(np.max(np.abs(y_cpu - y_orig)))
    err_gpu = float(np.max(np.abs(y_gpu - y_orig)))
    return err_cpu, err_gpu

for n in (1_000_000, 5_000_000, 10_000_000):
    ec, eg = bench_array(n)
    print(f"n = {n:>10,}  max|cpu-orig| = {ec:.3e}  max|gpu-orig| = {eg:.3e}")
f_gauss.device = "gpu"
f_gauss.show()
n = 100_000_000
xb = np.ascontiguousarray(np.linspace(-8.0, 8.0, n, dtype=np.float64))
f_gauss.device = "cpu"
print(f"Array length {n:,} — interpreted:")
%timeit f_gauss(xb)
f_gauss.device = "gpu"
print(f"\nSame — GPU JIT (device gpu):")
%timeit f_gauss(xb)
yb = f_gauss(xb)
yb.size
Example 2 — Mixed trig + exp (matches unit test, larger grid)¶
$$f(x) = a \sin(b x) + e^{-x^2}$$
x = esc.var("x")
a = esc.par("a", 2.5)
b = esc.par("b", -1.3)
f_mix = a * esc.sin(b * x) + esc.exp(-x ** 2)
n = 8_000_000
xb = np.ascontiguousarray(np.linspace(-4.0, 4.0, n, dtype=np.float64))
f_mix.device = "cpu"
y_cpu = f_mix(xb)
f_mix.device = "gpu"
y_gpu = f_mix(xb)
print("max|gpu - cpu|:", float(np.max(np.abs(np.asarray(y_gpu) - np.asarray(y_cpu)))))
print(f"n = {n:,} — CPU (device cpu)")
f_mix.device = "cpu"
%timeit f_mix(xb)
f_mix.device = "gpu"
print(f"n = {n:,} — GPU JIT")
%timeit f_mix(xb)
Example 3 — Decaying exponential grid (10M points)¶
Same family as test_compile_gpu_large_array, with an even larger vector.
x = esc.var("x")
a = esc.par("a", 1.0)
f_exp = a * esc.exp(-x ** 2)
f_exp.device = "gpu"
n = 10_000_000
xb = np.ascontiguousarray(np.linspace(-12.0, 12.0, n, dtype=np.float64))
print(f"\nGPU functor — {n:,} points:")
%timeit f_exp(xb)
f_exp.device = "cpu"
%timeit f_exp(xb)
Example 4 — CPU fallback on the same GPU build¶
Setting device to "cpu" disables the CUDA array path for that functor; useful for debugging or A/B checks.
x = esc.var("x")
f = x * 2.0
fc = compile(f)
n = 2_000_000
xb = np.ascontiguousarray(np.linspace(-3.0, 3.0, n, dtype=np.float64))
y_ref = np.asarray(f(xb), dtype=np.float64)
fc.device = "gpu"
y_gpu = np.asarray(fc(xb), dtype=np.float64)
fc.device = "cpu"
y_cpu = np.asarray(fc(xb), dtype=np.float64)
print("max|y_gpu - ref|:", float(np.max(np.abs(y_gpu - y_ref))))
print("max|y_cpu - ref|:", float(np.max(np.abs(y_cpu - y_ref))))
fc.device = "gpu"
print(f"\nGPU device — {n:,} points:")
%timeit -n1 -r1 fc(xb)
fc.device = "cpu"
print(f"\nCPU device — {n:,} points:")
%timeit -n1 -r1 fc(xb)
Example 5 — Two variables (smaller grid)¶
2D inputs use a packed layout of length `n_points * n_vars`; for a quick check we use a moderate 2D grid (large full tensor-product grids get heavy on memory).
x = esc.var("x")
y = esc.var("y")
f2 = esc.exp(-x) * y + 1.0
f2.device = "gpu"
nx = ny = 10500
xs = np.linspace(-1.0, 1.0, nx, dtype=np.float64)
ys = np.linspace(-1.0, 1.0, ny, dtype=np.float64)
X, Y = np.meshgrid(xs, ys, indexing="xy")
# Packed layout: [x0, y0, x1, y1, ...] — length n_points * num_variables
pts = np.ascontiguousarray(np.column_stack([X.ravel(), Y.ravel()]).ravel())
z = np.asarray(f2(pts), dtype=np.float64)
z_ref = (np.exp(-X) * Y + 1.0).ravel()
print("max error:", float(np.max(np.abs(z - z_ref))))
print(f"grid {nx}x{ny} = {pts.size // 2:,} points")
%timeit -n1 -r1 f2(pts)
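The packed interleaved layout can be built and round-tripped with plain NumPy; this sketch checks that `[x0, y0, x1, y1, …]` unpacks back to the original grids:

```python
import numpy as np

xs = np.linspace(-1.0, 1.0, 4)
ys = np.linspace(-1.0, 1.0, 3)
X, Y = np.meshgrid(xs, ys, indexing="xy")

# Interleave: [x0, y0, x1, y1, ...], length n_points * n_vars
pts = np.ascontiguousarray(np.column_stack([X.ravel(), Y.ravel()]).ravel())

# Unpack by striding: even entries are x, odd entries are y
x_back, y_back = pts[0::2], pts[1::2]
print(np.array_equal(x_back, X.ravel()), np.array_equal(y_back, Y.ravel()))  # True True
```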
API reference¶
| Item | Description |
|---|---|
| `f.device = "gpu"` | Enable GPU JIT for array evaluation (nvcc + cache at runtime); requires the CUDA toolkit on PATH. |
| `f.device` | `"gpu"`, `"cpu"`, or a cluster `tcp://…` endpoint string. |
| `compile(f)` | Optional, CPU-only: native extension (see compiler.ipynb). Not required for GPU JIT. |
| `compile(f, keep_build=True)` | Keep CPU compile artifacts (`build_dir`). |
| `compile(f, build_dir=path)` | Fixed CPU build directory. |
JIT vs CPU compile()¶
GPU JIT and compile(f) are independent: JIT codegen runs when the functor uses the GPU device; compile() only builds a host extension. Supported subtree for CUDA/JIT matches what the emitter allows (same family as in compiler.ipynb for comparable ops).
See also¶
- `python/tests/core/test_compiler.py`: GPU JIT scenarios (e.g. `TestCompilerGPU`).
- CPU functor compiler: compiler.ipynb.
- This notebook file: `functor_gpu_jit.ipynb` (formerly `compiler_gpu.ipynb`).
Distribution Functors on GPU¶
Test all 6 distribution functions (normal, gamma, schulz, lognorm, uniform, triangular) with GPU JIT.
x = esc.var("x")
mean_p = esc.par("mean", 10.0)
sigma_p = esc.par("sigma", 2.0)
fwhm_p = esc.par("fwhm", 4.0)
distributions = {
"normal": esc.normal(x, mean_p, fwhm_p),
"gamma": esc.gamma(x, mean_p, sigma_p),
"schulz": esc.schulz(x, mean_p, sigma_p),
"lognorm": esc.lognorm(x, mean_p, sigma_p),
"uniform": esc.uniform(x, mean_p, fwhm_p),
"triangular": esc.triangular(x, mean_p, fwhm_p),
}
xarr = np.linspace(0.1, 25.0, 500)
for name, f in distributions.items():
    cpu_vals = f(xarr)
    f.device = "gpu"
    gpu_vals = f(xarr)
    f.device = "cpu"
    max_diff = np.max(np.abs(cpu_vals - gpu_vals))
    ok = max_diff < 1e-12
    print(f"{name:12s}  max|cpu-gpu| = {max_diff:.2e}  {'OK' if ok else 'MISMATCH'}")
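As a reference for the `fwhm` parameter used above: a normal distribution parameterized by FWHM (as the parameter name suggests) relates to σ by FWHM = 2√(2 ln 2)·σ ≈ 2.3548 σ. A plain NumPy version of this standard conversion (not necessarily ESCAPE's exact parameterization):

```python
import numpy as np

FWHM_TO_SIGMA = 1.0 / (2.0 * np.sqrt(2.0 * np.log(2.0)))  # ≈ 0.4247


def normal_fwhm(x, mean, fwhm):
    """Normal pdf parameterized by full width at half maximum."""
    sigma = fwhm * FWHM_TO_SIGMA
    return np.exp(-(x - mean) ** 2 / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))


# Check the defining property: value at mean ± FWHM/2 is half the peak value.
peak = normal_fwhm(10.0, 10.0, 4.0)
print(abs(normal_fwhm(12.0, 10.0, 4.0) - peak / 2) < 1e-9)  # True
```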
Convolution on GPU¶
Test convolution h(x) = ∫ f(τ) * g(x - τ) dτ with GPU JIT.
x = esc.var("x")
tau = esc.var("tau")
mean_f = esc.par("mean_f", 10.0)
sigma_f = esc.par("sigma_f", 1.0)
mean_g = esc.par("mean_g", 0.0)
sigma_g = esc.par("sigma_g", 1.5)
f_conv = esc.normal(tau, mean_f, sigma_f)
g_conv = esc.normal(x - tau, mean_g, sigma_g)
h_conv = esc.convolution(f_conv, g_conv, x, tau, 0.0, 30.0, 1e-8, 1e-8, 200)
xarr = np.linspace(2.0, 18.0, 200)
cpu_vals = h_conv(xarr)
h_conv.device = "gpu"
gpu_vals = h_conv(xarr)
h_conv.device = "cpu"
max_diff = np.max(np.abs(cpu_vals - gpu_vals))
print(f"convolution max|cpu-gpu| = {max_diff:.2e} {'OK' if max_diff < 1e-6 else 'MISMATCH'}")
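This setup can also be cross-checked without ESCAPE: the convolution of two normals is a normal with summed means and variances, and a direct NumPy quadrature of h(x) = ∫ f(τ) g(x − τ) dτ over the same range reproduces it:

```python
import numpy as np


def gauss(x, mu, sigma):
    return np.exp(-(x - mu) ** 2 / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))


dx = 0.01
tau = np.arange(0.0, 30.0, dx)        # same integration range as above
xq = np.linspace(2.0, 18.0, 9)        # a few query points

# h(x) = integral of f(tau) * g(x - tau) dtau, by direct quadrature
f_tau = gauss(tau, 10.0, 1.0)         # mean_f, sigma_f
h = np.array([dx * np.sum(f_tau * gauss(xk - tau, 0.0, 1.5)) for xk in xq])

# Analytic result: normal with summed mean and variance
h_ref = gauss(xq, 10.0 + 0.0, np.sqrt(1.0**2 + 1.5**2))
print("max error:", float(np.max(np.abs(h - h_ref))))
```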
Weighted Average (Distribution-based) on GPU¶
Test H(x) = ∫ f(τ) * g(x, τ) dτ with GPU JIT.
x = esc.var("x")
mean_v = esc.var("mean")
sigma_p = esc.par("sigma", 0.1, userlim=[0, 10])
f_avg = esc.sin(x*1000)**2
g_avg = esc.normal(x, mean_v, sigma_p * x)
h_avg = esc.average_normal(f_avg, sigma_p, x, epsrel=1e-8, epsabs=1e-8, maxiter=200, numpoints=15)
xarr = np.linspace(1.0, 10.0, 100)
cpu_vals = h_avg(xarr)
h_avg.device = "gpu"
gpu_vals = h_avg(xarr)
h_avg.device = "cpu"
max_diff = np.max(np.abs(cpu_vals - gpu_vals))
print(f"weighted avg max|cpu-gpu| = {max_diff:.2e} {'OK' if max_diff < 1e-6 else 'MISMATCH'}")
h_avg.device = "gpu"
h_avg.show(coordinates=np.linspace(-10, 10, 1000))