% Journal article (ACM TACO) on 3D-stacked cache for HPC workloads.
% Authors are stored in "Last, First" form so styles can sort and abbreviate them.
% Only the title words that must keep their case are brace-protected.
% NOTE(review): no volume/number/pages are recorded and the annotation still
% reads "accepted; to appear" -- update once final publication data is confirmed.
% NOTE(review): the keywords do not obviously match the abstract -- verify.
@article{jens_domke_at_2023,
  author     = {Domke, Jens and Vatai, Emil and Gerofi, Balazs and Kodama, Yuetsu and Wahib, Mohamed and Podobas, Artur and Mittal, Sparsh and Peric{\`a}s, Miquel and Zhang, Lingqi and Chen, Peng and Drozd, Aleksandr and Matsuoka, Satoshi},
  title      = {At the Locus of Performance: Quantifying the Effects of Copious {3D-Stacked} Cache on {HPC} Workloads},
  journal    = {ACM Transactions on Architecture and Code Optimization},
  year       = {2023},
  doi        = {10.1145/3629520},
  abstract   = {Over the last three decades, innovations in the memory subsystem were primarily targeted at overcoming the data movement bottleneck. In this paper, we focus on a specific market trend in memory technology: 3D-stacked memory and caches. We investigate the impact of extending the on-chip memory capabilities in future HPC-focused processors, particularly by 3D-stacked SRAM. First, we propose a method oblivious to the memory subsystem to gauge the upper-bound in performance improvements when data movement costs are eliminated. Then, using the gem5 simulator, we model two variants of a hypothetical LARge Cache processor (LARC), fabricated in 1.5\,nm and enriched with high-capacity 3D-stacked cache. With a volume of experiments involving a broad set of proxy-applications and benchmarks, we aim to reveal how HPC CPU performance will evolve, and conclude an average boost of 9.56x for cache-sensitive HPC applications, on a per-chip basis. Additionally, we exhaustively document our methodological exploration to motivate HPC centers to drive their own technological agenda through enhanced co-design.},
  keywords   = {GPUs,Large-scale FFT,mixed-precision,MPI},
  annotation = {accepted; to appear}
}