% Journal article (Scientific Data, 2025) presenting F-DATA, a public dataset of jobs executed on Fugaku.
% Title braces protect only the acronyms and the proper noun, so styles remain free to apply sentence casing.
@article{antici_f-data_2025,
  author   = {Antici, Francesco and Bartolini, Andrea and Domke, Jens and Kiziltan, Zeynep and Yamamoto, Keiji},
  title    = {{F-DATA}: A {Fugaku} Workload Dataset for Job-centric Predictive Modelling in {HPC} Systems},
  journal  = {Scientific Data},
  year     = {2025},
  month    = jul,
  volume   = {12},
  number   = {1},
  pages    = {1321},
  issn     = {2052-4463},
  doi      = {10.1038/s41597-025-05633-1},
  abstract = {In the last decades, High Performance Computing (HPC) systems have accelerated scientific discoveries and innovations across different domains, from epidemic studies to climate science. For sustainable development of HPC systems, it is fundamental to address their environmental impact regarding carbon footprint emission and energy requirement, while ensuring high system throughput. Analyzing and predicting HPC job execution characteristics is instrumental in developing workload management strategies to simultaneously optimize the system throughput and minimize the environmental impact. However, model development for accurate predictions is hindered by lack of voluminous public datasets. In this paper, we present F-DATA, a public dataset containing the information of around 24 million jobs executed on Fugaku, the most powerful supercomputer during the data collection phase. The data contains an extensive set of features, allowing for a multitude of job characteristics prediction. The sensitive job data appears both in anonymized and irreversibly encoded versions. The encoding is based on a Natural Language Processing model and retains sensitive but useful job information for prediction purposes without violating privacy concerns.},
}