13 — Profiling and Benchmarking Baseline¶
Phase 2 baseline measurements for the evaluation pipeline functions. Sweeps: N (maps), H (pixel side), T (thresholds), B (ℓ-bands). Pre-optimisation timings on the local machine (CPU, single core).
1 Setup¶
[ ]:
import cProfile, pstats, tracemalloc, timeit, os, sys
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
sys.path.insert(0, os.path.join(os.getcwd(), '..', '..'))
from foregrounds_diffusion.flatmaps import (
get_lpf_hpf, make_gaussian_realisation, map2cl,
)
from foregrounds_diffusion.moments import compute_cross_moments, compute_summed_moments
from foregrounds_diffusion.morphology import compute_minkowski_tensors
from foregrounds_diffusion.peak_counts import compute_peak_minima_counts
%matplotlib inline
plt.rcParams.update({'figure.dpi': 100, 'font.size': 10})
[ ]:
def profile_fn(fn, *args, n_repeat=3, **kwargs):
"""Return median wall-clock time and peak memory for one call of fn.""""
times = timeit.repeat(lambda: fn(*args, **kwargs), number=1, repeat=n_repeat)
tracemalloc.start()
fn(*args, **kwargs)
_, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
return {
'time_median_s': float(sorted(times)[n_repeat // 2]),
'time_min_s': float(min(times)),
'peak_mem_mb': peak / 1024 ** 2,
}
def sweep(fn, dim_values, make_args, n_repeat=3, label='N', verbose=True):
"""Sweep a single input dimension; return (times, peak_mems) arrays.""""
times, mems = [], []
for val in dim_values:
args, kwargs = make_args(val)
r = profile_fn(fn, *args, n_repeat=n_repeat, **kwargs)
times.append(r['time_median_s'])
mems.append(r['peak_mem_mb'])
if verbose:
print(f' {label}={val:>5} {r["time_median_s"]:.4f}s {r["peak_mem_mb"]:.1f} MB')
return np.array(times), np.array(mems)
def fit_slope(xs, ys):
"""Fit log-log power-law slope.""""
xs, ys = np.array(xs, float), np.array(ys, float)
mask = ys > 0
if mask.sum() < 2:
return float('nan')
slope, *_ = linregress(np.log(xs[mask]), np.log(ys[mask]))
return slope
[ ]:
_EL = np.arange(1, 5000)
_CL = 1e-5 * _EL.astype(float) ** (-2)
def make_maps(N, H):
np.random.seed(42)
params = [H, H, 1.40625, 1.40625]
return (
np.array([make_gaussian_realisation(params, _EL, _CL) for _ in range(N)]),
params,
)
def make_bp_filters(params, B):
edges = np.linspace(200, 7000, B + 1)
return [get_lpf_hpf(params, (edges[i], edges[i+1]), filter_type=2) for i in range(B)]
N_VALUES = [1, 5, 10, 50, 100]
H_VALUES = [32, 64, 128, 256]
T_VALUES = [5, 10, 25, 50, 100]
B_VALUES = [2, 4, 8, 16]
H_FIXED, T_FIXED, B_FIXED, N_FIXED = 64, 25, 4, 10
flatskymapparams = [H_FIXED, H_FIXED, 1.40625, 1.40625]
thresholds = np.linspace(-2, 2, T_FIXED)
bp_fixed = make_bp_filters(flatskymapparams, B_FIXED)
thresh_peaks = np.linspace(-1, 5, 20)
thresh_minima = np.linspace(-5, 1, 20)
maps_cache = {N: make_maps(N, H_FIXED)[0] for N in N_VALUES}
maps10, _ = make_maps(N_FIXED, H_FIXED)
print('Fixtures ready.')
2 Baseline measurements (pre-optimisation)¶
2a compute_minkowski_tensors¶
[ ]:
print('N-scaling (H=64, T=25)')
mink_N_times, mink_N_mems = sweep(
compute_minkowski_tensors, N_VALUES,
lambda N: ((maps_cache[N], lambda x: x, thresholds), {}), label='N')
print()
print('H-scaling (N=10, T=25)')
mink_H_times, mink_H_mems = [], []
for H in H_VALUES:
maps, params = make_maps(N_FIXED, H)
r = profile_fn(compute_minkowski_tensors, maps, lambda x: x, np.linspace(-2,2,T_FIXED))
mink_H_times.append(r['time_median_s']); mink_H_mems.append(r['peak_mem_mb'])
print(f' H={H:>4} {r["time_median_s"]:.4f}s {r["peak_mem_mb"]:.1f} MB')
mink_H_times, mink_H_mems = np.array(mink_H_times), np.array(mink_H_mems)
print()
print('T-scaling (N=10, H=64)')
mink_T_times, mink_T_mems = sweep(
compute_minkowski_tensors, T_VALUES,
lambda T: ((maps10, lambda x: x, np.linspace(-2,2,T)), {}), label='T')
print(f'\nN-slope={fit_slope(N_VALUES,mink_N_times):.2f} '
f'H-slope={fit_slope(H_VALUES,mink_H_times):.2f} '
f'T-slope={fit_slope(T_VALUES,mink_T_times):.2f}')
2b compute_cross_moments¶
[ ]:
print('N-scaling (H=64, B=4)')
cross_N_times, cross_N_mems = sweep(
compute_cross_moments, N_VALUES,
lambda N: ((maps_cache[N], maps_cache[N], bp_fixed), {}), label='N')
print()
print('B-scaling (N=10, H=64)')
cross_B_times, cross_B_mems = sweep(
compute_cross_moments, B_VALUES,
lambda B: ((maps10, maps10, make_bp_filters(flatskymapparams, B)), {}), label='B')
print(f'\nN-slope={fit_slope(N_VALUES,cross_N_times):.2f} '
f'B-slope={fit_slope(B_VALUES,cross_B_times):.2f}')
2c map2cl (per-map loop)¶
[ ]:
print('N-scaling (H=64)')
map2cl_N_times, map2cl_N_mems = [], []
for N in N_VALUES:
maps = maps_cache[N]
r = profile_fn(lambda maps=maps: [map2cl(flatskymapparams, m) for m in maps])
map2cl_N_times.append(r['time_median_s']); map2cl_N_mems.append(r['peak_mem_mb'])
print(f' N={N:>5} {r["time_median_s"]:.4f}s {r["peak_mem_mb"]:.1f} MB')
map2cl_N_times, map2cl_N_mems = np.array(map2cl_N_times), np.array(map2cl_N_mems)
print()
print('H-scaling (N=10)')
map2cl_H_times, map2cl_H_mems = [], []
for H in H_VALUES:
maps, params = make_maps(N_FIXED, H)
r = profile_fn(lambda maps=maps, p=params: [map2cl(p, m) for m in maps])
map2cl_H_times.append(r['time_median_s']); map2cl_H_mems.append(r['peak_mem_mb'])
print(f' H={H:>4} {r["time_median_s"]:.4f}s {r["peak_mem_mb"]:.1f} MB')
map2cl_H_times, map2cl_H_mems = np.array(map2cl_H_times), np.array(map2cl_H_mems)
print(f'\nN-slope={fit_slope(N_VALUES,map2cl_N_times):.2f} '
f'H-slope={fit_slope(H_VALUES,map2cl_H_times):.2f}')
2d compute_peak_minima_counts¶
[ ]:
print('N-scaling (H=64)')
peak_N_times, peak_N_mems = sweep(
compute_peak_minima_counts, N_VALUES,
lambda N: ((maps_cache[N], thresh_peaks, thresh_minima), {}), label='N')
print(f'\nN-slope={fit_slope(N_VALUES,peak_N_times):.2f}')
3 Figures (pre-optimisation)¶
[ ]:
# Figure 1 — Wall-clock time vs N (log-log), one panel per function
fig, ax = plt.subplots(figsize=(7, 4))
for label, times in [
('compute_minkowski_tensors', mink_N_times),
('compute_cross_moments', cross_N_times),
('map2cl (loop)', map2cl_N_times),
('compute_peak_minima_counts', peak_N_times),
]:
ax.loglog(N_VALUES, times, marker='o', label=label)
# Reference: ideal linear
ax.loglog(N_VALUES, np.array(N_VALUES, float) / N_VALUES[0] * mink_N_times[0],
'k--', lw=0.8, label='O(N) reference')
ax.set_xlabel('N (number of maps)')
ax.set_ylabel('Wall-clock time (s)')
ax.set_title('Figure 1 — Time vs N (H=64, T=25, B=4)')
ax.legend(fontsize=8)
fig.tight_layout()
os.makedirs('../../plots/benchmarks', exist_ok=True)
fig.savefig('../../plots/benchmarks/fig1_time_vs_N.png', dpi=120)
plt.show()
[ ]:
# Figure 2 — Wall-clock time vs map size H×W (log-log)
fig, ax = plt.subplots(figsize=(6, 4))
ax.loglog(H_VALUES, mink_H_times, marker='s', label='compute_minkowski_tensors')
ax.loglog(H_VALUES, map2cl_H_times, marker='^', label='map2cl (N=10 loop)')
# O(H²) and O(H² log H) references
h = np.array(H_VALUES, float)
ax.loglog(h, h**2 / h[0]**2 * mink_H_times[0], 'k--', lw=0.8, label='O(H²) ref')
ax.set_xlabel('Map side length H (pixels)')
ax.set_ylabel('Wall-clock time (s)')
ax.set_title('Figure 2 — Time vs H (N=10)')
ax.legend(fontsize=8)
fig.tight_layout()
fig.savefig('../../plots/benchmarks/fig2_time_vs_H.png', dpi=120)
plt.show()
[ ]:
# Figure 3 — Peak memory vs N
fig, ax = plt.subplots(figsize=(7, 4))
for label, mems in [
('compute_minkowski_tensors', mink_N_mems),
('compute_cross_moments', cross_N_mems),
('map2cl', map2cl_N_mems),
]:
ax.plot(N_VALUES, mems, marker='o', label=label)
ax.set_xlabel('N (number of maps)')
ax.set_ylabel('Peak memory (MB)')
ax.set_title('Figure 3 — Peak memory vs N (H=64)')
ax.legend(fontsize=8)
fig.tight_layout()
fig.savefig('../../plots/benchmarks/fig3_mem_vs_N.png', dpi=120)
plt.show()
7 Scaling law summary table (pre-optimisation)¶
[ ]:
import pandas as pd
rows = [
('compute_minkowski_tensors',
fit_slope(N_VALUES, mink_N_times), fit_slope(H_VALUES, mink_H_times),
fit_slope(T_VALUES, mink_T_times), '—',
mink_N_times[-1], '—'),
('compute_cross_moments',
fit_slope(N_VALUES, cross_N_times), '—', '—',
fit_slope(B_VALUES, cross_B_times),
cross_N_times[-1], '—'),
('map2cl (loop)',
fit_slope(N_VALUES, map2cl_N_times), fit_slope(H_VALUES, map2cl_H_times),
'—', '—',
map2cl_N_times[-1], '—'),
('compute_peak_minima_counts',
fit_slope(N_VALUES, peak_N_times), '—', '—', '—',
peak_N_times[-1], '—'),
]
df = pd.DataFrame(rows, columns=[
'Function', 'N-slope', 'H-slope', 'T-slope', 'B-slope',
'Time @N=100,H=64 (s)', 'Post-opt time (s)'
])
df = df.set_index('Function')
print(df.to_string())
4 Optimisations applied¶
To be filled in after §2.6 optimisations are implemented.
5 Post-optimisation measurements¶
To be filled in after §2.6 optimisations are implemented.
6 Before/after comparison figures¶
To be filled in after §2.6 optimisations are implemented.
8 Parallel scaling (§3.2)¶
Strong-scaling benchmark: fixed total work (N=50 maps, H=64²), varying worker count. All functions now accept n_jobs — see §3.2 implementation notes.
[ ]:
from multiprocessing import cpu_count
from foregrounds_diffusion.moments import compute_summed_moments, compute_cross_moments
from foregrounds_diffusion.peak_counts import compute_peak_minima_counts
from foregrounds_diffusion.morphology import compute_minkowski_tensors
N_PARALLEL = 50
MAX_WORKERS = min(cpu_count(), 8)
N_WORKERS = [w for w in [1, 2, 4, 8] if w <= MAX_WORKERS]
print(f'Logical CPUs available: {cpu_count()}, sweeping: {N_WORKERS}')
maps50, _ = make_maps(N_PARALLEL, H_FIXED)
thresholds_p = np.linspace(-2, 4, 15)
thresholds_m = np.linspace(-4, 2, 15)
bp4 = make_bp_filters(flatskymapparams, B_FIXED)
[ ]:
# Strong scaling — time vs n_workers for three functions
scale_fns = {
'compute_minkowski_tensors': lambda nj: compute_minkowski_tensors(
maps50, lambda x: x, thresholds, n_jobs=nj),
'compute_cross_moments': lambda nj: compute_cross_moments(
maps50, maps50, bp4, n_jobs=nj),
'compute_peak_minima_counts': lambda nj: compute_peak_minima_counts(
maps50, thresholds_p, thresholds_m,
smoothing_scales_arcmin=(2.5,), n_jobs=nj),
}
scale_times = {fn: [] for fn in scale_fns}
for fn_name, fn in scale_fns.items():
print(f'\n{fn_name}')
for nj in N_WORKERS:
r = profile_fn(fn, nj)
scale_times[fn_name].append(r['time_median_s'])
eff = scale_times[fn_name][0] / (nj * r['time_median_s']) * 100
print(f' n_jobs={nj:2d} {r["time_median_s"]:.3f}s eff={eff:.0f}%')
[ ]:
# Figure 11 — Strong scaling: wall-clock time vs n_workers
fig, ax = plt.subplots(figsize=(6, 4))
colors_scale = [plt.rcParams['axes.prop_cycle'].by_key()['color'][i] for i in range(3)]
for (fn_name, times), color in zip(scale_times.items(), colors_scale):
t1 = times[0]
ax.plot(N_WORKERS, times, 'o-', color=color,
label=fn_name.replace('_', '\\_') if False else fn_name)
# Ideal linear speedup reference (dashed)
t1_ref = max(t[0] for t in scale_times.values())
ideal = [t1_ref / nj for nj in N_WORKERS]
ax.plot(N_WORKERS, ideal, 'k--', lw=0.8, label='Ideal (linear)')
ax.set_xlabel('Number of workers (n_jobs)')
ax.set_ylabel('Wall-clock time (s)')
ax.set_title(f'Strong scaling — N={N_PARALLEL} maps, H={H_FIXED}²')
ax.legend(fontsize=7)
ax.set_xticks(N_WORKERS)
plt.tight_layout()
plt.savefig('plots/benchmarks/fig11_strong_scaling.pdf')
plt.savefig('plots/benchmarks/fig11_strong_scaling.png', dpi=150)
plt.show()
[ ]:
# Strong-scaling summary table
rows = []
for fn_name, times in scale_times.items():
t1 = times[0]
row = {'Function': fn_name, 'Serial (s)': f'{t1:.3f}'}
for nj, t in zip(N_WORKERS[1:], times[1:]):
eff = t1 / (nj * t) * 100
row[f'{nj} workers'] = f'{t:.3f}s ({eff:.0f}%)'
rows.append(row)
import pandas as pd
pd.set_option('display.max_colwidth', 40)
pd.DataFrame(rows).set_index('Function')