Building software with slurm jobs using EasyBuild
This blogpost serves as inspiration and get you started running jobs based on some of the stuff I run on my clusters.
Do you want to conveniently build modules in a hurry? Then you will love --job combined with -r.
What is does
By specifying eb -r --job Foobar-1.2.3-foss-2025b.eb easybuild will
1. Resolves all dependencies
2. Pre-fetch all the sources
3. Submit jobs for each easyconfig that needs to be built
- Jobs have dependency tracking based on their dependencies
And then you'll see
$ squeue --me
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
9015294 vera Foobar-1 you PD 0:00 1 (Dependency)
9015292 vera Stuff-1. you R 0:06 1 node-01
9015293 vera Thing-2. you R 0:06 1 node-02
You can specify --job-cores=X to pick the size of your job.
Passing SLURM flags
What if your job needs a GPU partition, or specify a TRES?
Set them via environment variables:
SBATCH_GPUS_PER_NODE=H100:1 eb -r --job-cores=8 Foobar-1.2.3-foss-2025b-CUDA-12.8.0.eb
In fact, why not make an alias for it?
alias buildH100='SBATCH_GPUS_PER_NODE=H100:1 eb -r --job'
Depending on your cluster, you may find the following environment variables useful:
SBATCH_GPUS_PER_NODE
SBATCH_CONSTRAINT
SBATCH_PARTITION
SBATCH_ACCOUNT
but of course, many more exist.
Collect build logs
If you don't want job logs in your current working directory when you submit, collect them to one place using --job-output=/path/to/logs .
If you have multiple cluster or architectures, you may wish to place them in specific locations.
Remember that you can set this via environment variables as well, e.g:
Remember that you can set this via environment variables as well, e.g:
export EASYBUILD_JOB_OUTPUT_DIR=$HOME/log_${CLUSTER_NAME}_${ARCH}/
Debug failed builds
If you have a problem build and the job logs isn't that useful, you can redirect tmpdir to persistent storage:
--tmpdir=/your/centrestorage/path/eb-tmp
A handy alias to nicely keep track of your job queue can be made:
Simple
A handy alias to nicely keep track of your job queue can be made:
alias q='squeue --me -O jobid:10,tres-per-node:18,name:60,TimeUsed:10,reasonlist'
alias wq='watch -c \"squeue --me -O jobid:10,tres-per-node:18,name:60,TimeUsed:10,reasonlist\"'
Using `bat`, we can even color the output nicely:
### Colorful
Using `bat`, we can even color the output nicely:
```bash
alias wq="watch -c \"squeue -u c3-builder -O jobid:10,tres-per-node:18,name:60,TimeUsed:10,reasonlist | sed 's/^ *//g' | sed 's/ \+/,/g' | bat -f -l csv --style plain --theme=ansi | column -s, -t | bat --style grid\""
Advanced
Why not look at sacct as well? And color based on state?
| #!/usr/bin/env python3
import subprocess
from datetime import datetime, timedelta
from rich.console import Console
from rich.table import Table
SACCT_LIMIT=timedelta(hours=12)
def run(cmd):
return subprocess.check_output(cmd, text=True).strip().splitlines()
def parse_table(lines):
header = lines[0].split("|")
rows = []
for line in lines[1:]:
cols = line.split("|")
rows.append(dict(zip(header, cols)))
return rows
def parse_gpu(tres_str):
tres = [x.split('=')[0] for x in tres_str.split(',')]
for t in tres:
if 'gpu' in t:
return t.split(':')[1]
return ''
def dhms_to_hms(dhms: str) -> str:
"""
Convert Slurm elapsed time D-HH:MM:SS or HH:MM:SS into HHH:MM:SS
"""
if '-' in dhms:
days, hms = dhms.split('-', 1)
hours, minutes, seconds = map(int, hms.split(':'))
total_hours = int(days) * 24 + hours
return f"{total_hours:3}:{minutes:02}:{seconds:02}"
else:
return dhms
hours, minutes, seconds = map(int, dhms.split(':'))
total_hours = hours
return f"{total_hours:3}:{minutes:02}:{seconds:02}"
def fix_path(path, jobid) -> str:
path = path.replace('/cephyr/users/c3-builder/Vera', '~')
path = path.replace('/cephyr/users/c3-builder/Alvis', '~')
return path.replace('%j', jobid)
def get_squeue():
cmd = [
"squeue",
"--me",
"-o", "%i|%j|%T|%M|%R|%f|%b"
]
lines = run(cmd)
return parse_table(lines)
def get_sacct():
start = (datetime.now() - SACCT_LIMIT).strftime("%Y-%m-%dT%H:%M:%S")
cmd = [
"sacct",
"-X",
"-S", start,
"--parsable2",
"-o", "jobid,jobname,state,elapsed,nodelist,reason,constraints,reqtres,stdout"
]
lines = run(cmd)
return parse_table(lines)
def classify_arch(job):
c = job.get("constraints", "").lower()
tres = job.get("tres", "").lower()
nodes = job.get("node", "").lower()
cpu_model = 'UNK'
# explicit constraint wins
if "zen4" in c:
cpu_model = "ZEN"
if "icelake" in c:
cpu_model = "ICE"
if "skylake" in c:
cpu_model = "SKY"
gpu = parse_gpu(tres)
# heuristic: GPUs are on specific CPU models in my cluster
lookup = {'h100': 'ZEN', 't4': 'SKY', 'v100': 'SKY', 'a40': 'ICE', 'a100': 'ICE'}
return lookup.get(gpu, cpu_model), gpu.upper()
def normalize_squeue(jobs):
out = []
for j in jobs:
entry = {
"source": "squeue",
"jobid": j["JOBID"],
"name": j["NAME"],
"state": j["STATE"].split()[0],
"time": dhms_to_hms(j["TIME"]),
"node": j["NODELIST(REASON)"],
"tres": j["TRES_PER_NODE"],
"constraints": j["FEATURES"],
"stdout": "",
}
entry["arch"], entry["gpu"] = classify_arch(entry)
out.append(entry)
return out
def normalize_sacct(jobs):
out = []
for j in jobs:
entry = {
"source": "sacct",
"jobid": j["JobID"],
"name": j["JobName"],
"state": j["State"].split()[0],
"time": dhms_to_hms(j["Elapsed"]),
"node": j["NodeList"],
"tres": j["ReqTRES"],
"constraints": j["Constraints"],
"stdout": j["StdOut"],
}
entry["arch"], entry["gpu"] = classify_arch(entry)
out.append(entry)
return out
def main():
jobs = normalize_squeue(get_squeue())
sa = normalize_sacct(get_sacct())
active_jobids = [j['jobid'] for j in jobs] # dont double up on active jobs
jobs += [s for s in sa if s['jobid'] not in active_jobids]
console = Console(force_terminal=True)
table = Table(show_header=True, header_style="bold", box=None)
table.add_column("JOBID", justify="right")
table.add_column("STATE")
table.add_column("ARCH")
table.add_column("GPU")
table.add_column("TIME", justify="right")
table.add_column("NODE")
table.add_column("NAME")
table.add_column("OUT")
for j in jobs:
state = j['state']
stdout = ''
if state.upper() in ("FAILED", "TIMEOUT"):
state = f"[red]{state}[/red]"
stdout = fix_path(j['stdout'], j['jobid'])
elif state.upper() == "COMPLETED":
state = f"[green]{state}[/green]"
elif state.upper() == "CANCELLED":
state = f"[yellow]{state}[/yellow]"
elif state.upper() == "PENDING":
state = f"[#888888]{state}[/#888888]"
arch = j['arch']
if arch == 'ZEN':
arch = f"[red]{arch}[/red]"
else:
arch = f"[blue]{arch}[/blue]"
table.add_row(j['jobid'], state, arch, j['gpu'], j['time'], j['node'], j['name'], stdout)
console.print(table)
if __name__ == "__main__":
main()
|
example running with watch --color q_advanced: