GPU Benchmark Results: Condensation & Coagulation¶
This notebook loads benchmark results produced by
particula/gpu/tests/benchmark_test.py and visualizes GPU vs CPU
scaling for the two main aerosol-dynamics kernels:
- Condensation -- per-particle mass transfer (embarrassingly parallel, O(n) work).
- Coagulation -- stochastic particle-resolved collisions (sequential pair selection per box, but parallelized across boxes).
Run the benchmarks first:
`pytest particula/gpu/tests/benchmark_test.py --benchmark -v -s`
The test suite writes its results to `gpu_benchmark_results.json` in the
repository root.
In [ ]:
Copied!
import json
from pathlib import Path
import matplotlib.pyplot as plt
import json
from pathlib import Path
import matplotlib.pyplot as plt
Load benchmark data¶
In [ ]:
Copied!
BENCHMARK_FILENAME = "gpu_benchmark_results.json"


def _find_benchmark_results() -> Path:
    """Locate the benchmark results JSON by walking up from the cwd.

    The resolved current working directory and then each of its ancestors
    is checked, in order, for a regular file named ``BENCHMARK_FILENAME``.
    The walk stops after the first directory that contains a
    ``pyproject.toml`` (treated as the repository root), so directories
    above the repository are never searched.

    Returns:
        Path of the first matching results file.

    Raises:
        FileNotFoundError: If no results file is found before the walk
            ends; the message includes the command that generates it.
    """
    start = Path.cwd().resolve()
    for directory in (start, *start.parents):
        candidate = directory / BENCHMARK_FILENAME
        # is_file() rather than exists(): a directory that happens to
        # share the name cannot be opened as JSON and must be skipped.
        if candidate.is_file():
            return candidate
        # Stop at the repo root (contains pyproject.toml)
        if (directory / "pyproject.toml").exists():
            break
    msg = (
        f"Could not find {BENCHMARK_FILENAME}. "
        "Run the benchmarks first:\n"
        "  pytest particula/gpu/tests/benchmark_test.py "
        "--benchmark -v -s"
    )
    raise FileNotFoundError(msg)
# Locate and parse the results file written by the benchmark suite.
results_path = _find_benchmark_results()
# JSON interchange is UTF-8; pin the encoding so decoding does not depend
# on the platform's locale default.
with open(results_path, encoding="utf-8") as f:
    data = json.load(f)

# Mapping of benchmark name -> result entry, consumed by the cells below.
benchmarks = data["benchmarks"]
print(f"Loaded {len(benchmarks)} benchmark entries from {results_path}")
print(f"Run started: {data.get('started_at', 'unknown')}")
BENCHMARK_FILENAME = "gpu_benchmark_results.json"


def _find_benchmark_results() -> Path:
    """Locate the benchmark results JSON by walking up from the cwd.

    The resolved current working directory and then each of its ancestors
    is checked, in order, for a regular file named ``BENCHMARK_FILENAME``.
    The walk stops after the first directory that contains a
    ``pyproject.toml`` (treated as the repository root), so directories
    above the repository are never searched.

    Returns:
        Path of the first matching results file.

    Raises:
        FileNotFoundError: If no results file is found before the walk
            ends; the message includes the command that generates it.
    """
    start = Path.cwd().resolve()
    for directory in (start, *start.parents):
        candidate = directory / BENCHMARK_FILENAME
        # is_file() rather than exists(): a directory that happens to
        # share the name cannot be opened as JSON and must be skipped.
        if candidate.is_file():
            return candidate
        # Stop at the repo root (contains pyproject.toml)
        if (directory / "pyproject.toml").exists():
            break
    msg = (
        f"Could not find {BENCHMARK_FILENAME}. "
        "Run the benchmarks first:\n"
        "  pytest particula/gpu/tests/benchmark_test.py "
        "--benchmark -v -s"
    )
    raise FileNotFoundError(msg)
# Locate and parse the results file written by the benchmark suite.
results_path = _find_benchmark_results()
# JSON interchange is UTF-8; pin the encoding so decoding does not depend
# on the platform's locale default.
with open(results_path, encoding="utf-8") as f:
    data = json.load(f)

# Mapping of benchmark name -> result entry, consumed by the cells below.
benchmarks = data["benchmarks"]
print(f"Loaded {len(benchmarks)} benchmark entries from {results_path}")
print(f"Run started: {data.get('started_at', 'unknown')}")
Condensation scaling¶
In [ ]:
Copied!
# Select the condensation benchmarks (keys tagged "condensation_...").
cond_entries = {
    k: v for k, v in benchmarks.items() if k.startswith("condensation_")
}

# Build one summary row per configuration, ordered by total particle count
# (n_boxes * n_particles) so the table and plots read small -> large.
cond_rows = []
for key, entry in sorted(
    cond_entries.items(),
    key=lambda x: x[1]["n_boxes"] * x[1]["n_particles"],
):
    gpu_s = entry["gpu_time_s"]
    cond_rows.append(
        {
            # Strip only the leading tag; str.replace would also remove
            # the substring if it appeared later in the key.
            "label": key.removeprefix("condensation_"),
            "n_boxes": entry["n_boxes"],
            "n_particles": entry["n_particles"],
            "total": entry["n_boxes"] * entry["n_particles"],
            "gpu_s": gpu_s,
            "gpu_per_step": gpu_s / entry["steps"],
            # CPU fields are optional: .get() yields None when absent.
            "cpu_s": entry.get("cpu_time_s"),
            "speedup": entry.get("speedup"),
        }
    )

# Print table
print(
    f"{'Config':<12} {'Boxes':>5} {'Particles':>10} "
    f"{'Total':>10} {'GPU (s)':>9} {'CPU (s)':>9} {'Speedup':>8}"
)
print("-" * 75)
for r in cond_rows:
    # Test against None, not truthiness: a measured 0.0 is a real value
    # and must not be shown as missing. This also matches the
    # `is not None` filter used when plotting speedups.
    cpu_str = f"{r['cpu_s']:.4f}" if r["cpu_s"] is not None else "--"
    spd_str = f"{r['speedup']:.1f}x" if r["speedup"] is not None else "--"
    print(
        f"{r['label']:<12} {r['n_boxes']:>5} {r['n_particles']:>10,} "
        f"{r['total']:>10,} {r['gpu_s']:>9.4f} {cpu_str:>9} {spd_str:>8}"
    )
# Select the condensation benchmarks (keys tagged "condensation_...").
cond_entries = {
    k: v for k, v in benchmarks.items() if k.startswith("condensation_")
}

# Build one summary row per configuration, ordered by total particle count
# (n_boxes * n_particles) so the table and plots read small -> large.
cond_rows = []
for key, entry in sorted(
    cond_entries.items(),
    key=lambda x: x[1]["n_boxes"] * x[1]["n_particles"],
):
    gpu_s = entry["gpu_time_s"]
    cond_rows.append(
        {
            # Strip only the leading tag; str.replace would also remove
            # the substring if it appeared later in the key.
            "label": key.removeprefix("condensation_"),
            "n_boxes": entry["n_boxes"],
            "n_particles": entry["n_particles"],
            "total": entry["n_boxes"] * entry["n_particles"],
            "gpu_s": gpu_s,
            "gpu_per_step": gpu_s / entry["steps"],
            # CPU fields are optional: .get() yields None when absent.
            "cpu_s": entry.get("cpu_time_s"),
            "speedup": entry.get("speedup"),
        }
    )

# Print table
print(
    f"{'Config':<12} {'Boxes':>5} {'Particles':>10} "
    f"{'Total':>10} {'GPU (s)':>9} {'CPU (s)':>9} {'Speedup':>8}"
)
print("-" * 75)
for r in cond_rows:
    # Test against None, not truthiness: a measured 0.0 is a real value
    # and must not be shown as missing. This also matches the
    # `is not None` filter used when plotting speedups.
    cpu_str = f"{r['cpu_s']:.4f}" if r["cpu_s"] is not None else "--"
    spd_str = f"{r['speedup']:.1f}x" if r["speedup"] is not None else "--"
    print(
        f"{r['label']:<12} {r['n_boxes']:>5} {r['n_particles']:>10,} "
        f"{r['total']:>10,} {r['gpu_s']:>9.4f} {cpu_str:>9} {spd_str:>8}"
    )
Condensation GPU scaling plot¶
In [ ]:
Copied!
# Two side-by-side panels: GPU scaling (left) and GPU-vs-CPU speedup (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left: GPU time vs total particles
totals = [r["total"] for r in cond_rows]
gpu_times = [r["gpu_per_step"] * 1000 for r in cond_rows]  # s -> ms
ax1.scatter(totals, gpu_times, s=50, zorder=5)
ax1.set_xscale("log")
ax1.set_yscale("log")
ax1.set_xlabel("Total particles (boxes x particles)")
ax1.set_ylabel("GPU time per step (ms)")
ax1.set_title("Condensation: GPU scaling")
ax1.grid(True, alpha=0.3)

# Right: Speedup (where CPU data exists)
cpu_rows = [r for r in cond_rows if r["speedup"] is not None]
ax2.set_title("Condensation: GPU vs CPU speedup")
if cpu_rows:
    labels = [r["label"] for r in cpu_rows]
    speedups = [r["speedup"] for r in cpu_rows]
    bars = ax2.barh(labels, speedups, color="steelblue")
    ax2.set_xlabel("GPU speedup vs CPU (x)")
    ax2.set_xscale("log")
    for bar, spd in zip(bars, speedups, strict=True):
        # Multiplicative offset keeps a constant visual gap on a log axis.
        ax2.text(
            bar.get_width() * 1.1,
            bar.get_y() + bar.get_height() / 2,
            f"{spd:.0f}x",
            va="center",
            fontsize=9,
        )
    ax2.grid(True, alpha=0.3, axis="x")
else:
    # No entry carried a CPU baseline: say so instead of leaving an
    # empty, unlabeled set of axes next to the scaling plot.
    ax2.text(
        0.5,
        0.5,
        "No CPU timings available",
        ha="center",
        va="center",
        transform=ax2.transAxes,
    )
    ax2.set_axis_off()

fig.tight_layout()
plt.show()
# Two side-by-side panels: GPU scaling (left) and GPU-vs-CPU speedup (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left: GPU time vs total particles
totals = [r["total"] for r in cond_rows]
gpu_times = [r["gpu_per_step"] * 1000 for r in cond_rows]  # s -> ms
ax1.scatter(totals, gpu_times, s=50, zorder=5)
ax1.set_xscale("log")
ax1.set_yscale("log")
ax1.set_xlabel("Total particles (boxes x particles)")
ax1.set_ylabel("GPU time per step (ms)")
ax1.set_title("Condensation: GPU scaling")
ax1.grid(True, alpha=0.3)

# Right: Speedup (where CPU data exists)
cpu_rows = [r for r in cond_rows if r["speedup"] is not None]
ax2.set_title("Condensation: GPU vs CPU speedup")
if cpu_rows:
    labels = [r["label"] for r in cpu_rows]
    speedups = [r["speedup"] for r in cpu_rows]
    bars = ax2.barh(labels, speedups, color="steelblue")
    ax2.set_xlabel("GPU speedup vs CPU (x)")
    ax2.set_xscale("log")
    for bar, spd in zip(bars, speedups, strict=True):
        # Multiplicative offset keeps a constant visual gap on a log axis.
        ax2.text(
            bar.get_width() * 1.1,
            bar.get_y() + bar.get_height() / 2,
            f"{spd:.0f}x",
            va="center",
            fontsize=9,
        )
    ax2.grid(True, alpha=0.3, axis="x")
else:
    # No entry carried a CPU baseline: say so instead of leaving an
    # empty, unlabeled set of axes next to the scaling plot.
    ax2.text(
        0.5,
        0.5,
        "No CPU timings available",
        ha="center",
        va="center",
        transform=ax2.transAxes,
    )
    ax2.set_axis_off()

fig.tight_layout()
plt.show()
Coagulation scaling¶
In [ ]:
Copied!
# Select the coagulation benchmarks (keys tagged "coagulation_...").
coag_entries = {
    k: v for k, v in benchmarks.items() if k.startswith("coagulation_")
}

# Build one summary row per configuration, ordered by total particle count
# (n_boxes * n_particles) so the table and plots read small -> large.
coag_rows = []
for key, entry in sorted(
    coag_entries.items(),
    key=lambda x: x[1]["n_boxes"] * x[1]["n_particles"],
):
    gpu_s = entry["gpu_time_s"]
    coag_rows.append(
        {
            # Strip only the leading tag; str.replace would also remove
            # the substring if it appeared later in the key.
            "label": key.removeprefix("coagulation_"),
            "n_boxes": entry["n_boxes"],
            "n_particles": entry["n_particles"],
            "total": entry["n_boxes"] * entry["n_particles"],
            "gpu_s": gpu_s,
            "gpu_per_step": gpu_s / entry["steps"],
            # CPU fields are optional: .get() yields None when absent.
            "cpu_s": entry.get("cpu_time_s"),
            "speedup": entry.get("speedup"),
        }
    )

# Print table
print(
    f"{'Config':<12} {'Boxes':>5} {'Particles':>10} "
    f"{'Total':>10} {'GPU (s)':>9} {'CPU (s)':>9} {'Speedup':>8}"
)
print("-" * 75)
for r in coag_rows:
    # Test against None, not truthiness: a measured 0.0 is a real value
    # and must not be shown as missing. This also matches the
    # `is not None` filter used when plotting speedups.
    cpu_str = f"{r['cpu_s']:.4f}" if r["cpu_s"] is not None else "--"
    spd_str = f"{r['speedup']:.1f}x" if r["speedup"] is not None else "--"
    print(
        f"{r['label']:<12} {r['n_boxes']:>5} {r['n_particles']:>10,} "
        f"{r['total']:>10,} {r['gpu_s']:>9.4f} {cpu_str:>9} {spd_str:>8}"
    )
# Select the coagulation benchmarks (keys tagged "coagulation_...").
coag_entries = {
    k: v for k, v in benchmarks.items() if k.startswith("coagulation_")
}

# Build one summary row per configuration, ordered by total particle count
# (n_boxes * n_particles) so the table and plots read small -> large.
coag_rows = []
for key, entry in sorted(
    coag_entries.items(),
    key=lambda x: x[1]["n_boxes"] * x[1]["n_particles"],
):
    gpu_s = entry["gpu_time_s"]
    coag_rows.append(
        {
            # Strip only the leading tag; str.replace would also remove
            # the substring if it appeared later in the key.
            "label": key.removeprefix("coagulation_"),
            "n_boxes": entry["n_boxes"],
            "n_particles": entry["n_particles"],
            "total": entry["n_boxes"] * entry["n_particles"],
            "gpu_s": gpu_s,
            "gpu_per_step": gpu_s / entry["steps"],
            # CPU fields are optional: .get() yields None when absent.
            "cpu_s": entry.get("cpu_time_s"),
            "speedup": entry.get("speedup"),
        }
    )

# Print table
print(
    f"{'Config':<12} {'Boxes':>5} {'Particles':>10} "
    f"{'Total':>10} {'GPU (s)':>9} {'CPU (s)':>9} {'Speedup':>8}"
)
print("-" * 75)
for r in coag_rows:
    # Test against None, not truthiness: a measured 0.0 is a real value
    # and must not be shown as missing. This also matches the
    # `is not None` filter used when plotting speedups.
    cpu_str = f"{r['cpu_s']:.4f}" if r["cpu_s"] is not None else "--"
    spd_str = f"{r['speedup']:.1f}x" if r["speedup"] is not None else "--"
    print(
        f"{r['label']:<12} {r['n_boxes']:>5} {r['n_particles']:>10,} "
        f"{r['total']:>10,} {r['gpu_s']:>9.4f} {cpu_str:>9} {spd_str:>8}"
    )
Coagulation GPU scaling plot¶
In [ ]:
Copied!
# Two side-by-side panels: GPU scaling (left) and GPU-vs-CPU speedup (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Separate single-box vs multi-box for color coding
single_box = [r for r in coag_rows if r["n_boxes"] == 1]
multi_box = [r for r in coag_rows if r["n_boxes"] > 1]

# Left: GPU time vs particles per box (colored by n_boxes)
for rows, color, label in [
    (single_box, "tab:blue", "1 box"),
    (multi_box, "tab:orange", "multi-box"),
]:
    if rows:
        x = [r["n_particles"] for r in rows]
        y = [r["gpu_per_step"] * 1000 for r in rows]  # s -> ms
        ax1.scatter(x, y, s=50, color=color, label=label, zorder=5)
ax1.set_xscale("log")
ax1.set_yscale("log")
ax1.set_xlabel("Particles per box")
ax1.set_ylabel("GPU time per step (ms)")
ax1.set_title("Coagulation: GPU scaling")
# Only draw a legend when a group was actually plotted; calling legend()
# with no labeled artists emits a warning and draws nothing useful.
legend_handles, _ = ax1.get_legend_handles_labels()
if legend_handles:
    ax1.legend()
ax1.grid(True, alpha=0.3)

# Right: Speedup (where CPU data exists)
cpu_rows = [r for r in coag_rows if r["speedup"] is not None]
ax2.set_title("Coagulation: GPU vs CPU speedup")
if cpu_rows:
    labels = [r["label"] for r in cpu_rows]
    speedups = [r["speedup"] for r in cpu_rows]
    # Same color coding as the left panel.
    colors = [
        "tab:blue" if r["n_boxes"] == 1 else "tab:orange" for r in cpu_rows
    ]
    bars = ax2.barh(labels, speedups, color=colors)
    ax2.set_xlabel("GPU speedup vs CPU (x)")
    for bar, spd in zip(bars, speedups, strict=True):
        # Additive offset: this axis is linear, unlike the condensation one.
        ax2.text(
            bar.get_width() + 0.5,
            bar.get_y() + bar.get_height() / 2,
            f"{spd:.1f}x",
            va="center",
            fontsize=9,
        )
    ax2.grid(True, alpha=0.3, axis="x")
else:
    # No entry carried a CPU baseline: say so instead of leaving an
    # empty, unlabeled set of axes next to the scaling plot.
    ax2.text(
        0.5,
        0.5,
        "No CPU timings available",
        ha="center",
        va="center",
        transform=ax2.transAxes,
    )
    ax2.set_axis_off()

fig.tight_layout()
plt.show()
# Two side-by-side panels: GPU scaling (left) and GPU-vs-CPU speedup (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Separate single-box vs multi-box for color coding
single_box = [r for r in coag_rows if r["n_boxes"] == 1]
multi_box = [r for r in coag_rows if r["n_boxes"] > 1]

# Left: GPU time vs particles per box (colored by n_boxes)
for rows, color, label in [
    (single_box, "tab:blue", "1 box"),
    (multi_box, "tab:orange", "multi-box"),
]:
    if rows:
        x = [r["n_particles"] for r in rows]
        y = [r["gpu_per_step"] * 1000 for r in rows]  # s -> ms
        ax1.scatter(x, y, s=50, color=color, label=label, zorder=5)
ax1.set_xscale("log")
ax1.set_yscale("log")
ax1.set_xlabel("Particles per box")
ax1.set_ylabel("GPU time per step (ms)")
ax1.set_title("Coagulation: GPU scaling")
# Only draw a legend when a group was actually plotted; calling legend()
# with no labeled artists emits a warning and draws nothing useful.
legend_handles, _ = ax1.get_legend_handles_labels()
if legend_handles:
    ax1.legend()
ax1.grid(True, alpha=0.3)

# Right: Speedup (where CPU data exists)
cpu_rows = [r for r in coag_rows if r["speedup"] is not None]
ax2.set_title("Coagulation: GPU vs CPU speedup")
if cpu_rows:
    labels = [r["label"] for r in cpu_rows]
    speedups = [r["speedup"] for r in cpu_rows]
    # Same color coding as the left panel.
    colors = [
        "tab:blue" if r["n_boxes"] == 1 else "tab:orange" for r in cpu_rows
    ]
    bars = ax2.barh(labels, speedups, color=colors)
    ax2.set_xlabel("GPU speedup vs CPU (x)")
    for bar, spd in zip(bars, speedups, strict=True):
        # Additive offset: this axis is linear, unlike the condensation one.
        ax2.text(
            bar.get_width() + 0.5,
            bar.get_y() + bar.get_height() / 2,
            f"{spd:.1f}x",
            va="center",
            fontsize=9,
        )
    ax2.grid(True, alpha=0.3, axis="x")
else:
    # No entry carried a CPU baseline: say so instead of leaving an
    # empty, unlabeled set of axes next to the scaling plot.
    ax2.text(
        0.5,
        0.5,
        "No CPU timings available",
        ha="center",
        va="center",
        transform=ax2.transAxes,
    )
    ax2.set_axis_off()

fig.tight_layout()
plt.show()
Key observations¶
Condensation¶
- The GPU condensation kernel is embarrassingly parallel -- each particle's mass transfer is independent. This gives near-constant GPU time up to ~100k particles, with linear scaling beyond that.
- Speedups range from ~13x at 1k particles to ~1,400x at 100k particles per box, demonstrating excellent GPU utilization.
- At 1M particles the GPU still completes a step in under 1 ms.
Coagulation¶
- The GPU coagulation kernel uses a single thread per box for the sequential acceptance-rejection pair selection loop. This means single-box performance is limited by the per-particle work within that one thread.
- Multi-box configurations show 25--30x speedups because the GPU parallelizes across boxes.
- Single-box speedups are modest (1.5--3x) because the GPU thread does essentially the same sequential work as the CPU, just with faster per-operation throughput.
- The `k_max` computation uses an O(n) min/max radius bound (matching the CPU's bin-pair strategy) rather than the original O(n^2) all-pairs scan. This reduced single-box GPU time from 29 s to 0.28 s at 2k particles.
When to use GPU acceleration¶
| Scenario | Recommendation |
|---|---|
| Condensation, any size | Always use GPU -- massive speedup |
| Coagulation, many boxes | Use GPU -- ~25x speedup |
| Coagulation, single box, <5k particles | Either -- modest GPU advantage |
| Coagulation, single box, >10k particles | GPU preferred for throughput |