Skip to content

Commit

Permalink
Cloudstats: Adding GPUs and block size in the table
Browse files Browse the repository at this point in the history
  • Loading branch information
guilbaults committed Oct 4, 2024
1 parent bdd6a47 commit bc45ab1
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 1 deletion.
4 changes: 4 additions & 0 deletions cloudstats/templates/cloudstats/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ <h4>{% translate "Projects" %}</h4>
<th>{% translate "Used cores" %}</th>
<th>{% translate "Running memory (GB)" %}</th>
<th>{% translate "Used memory (GB)" %}</th>
<th>{% translate "Running GPUs" %}</th>
<th>{% translate "Block capacity (GB)" %}</th>
</tr>
</thead>
<tbody>
Expand All @@ -48,6 +50,8 @@ <h4>{% translate "Projects" %}</h4>
<td>{{project.used_cores | floatformat:1}}</td>
<td>{{project.memory | floatformat:1}}</td>
<td>{{project.used_memory | floatformat:1}}</td>
<td>{{project.gpu_qty | floatformat:1}}</td>
<td>{{project.block_capacity | floatformat:1}}</td>
</tr>
{% endfor %}
</tbody>
Expand Down
31 changes: 30 additions & 1 deletion cloudstats/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,30 @@ def index(request):
for line in stats_used_memory:
all_projects[line['metric']['project_name']]['used_memory'] = statistics.mean(line['y'])

context['total_projects'] = {'cores': 0, 'used_cores': 0, 'memory': 0, 'used_memory': 0}
# Infer the number of GPUs from the instance type: count the series of
# libvirtd_domain_balloon_current per (project_name, instance_type) and
# multiply that count by the per-type GPU quantity configured in
# settings.CLOUD_INSTANCE_TYPE.
query_gpus = 'count(libvirtd_domain_balloon_current{{ {filter} }}) by (project_name, instance_type)'.format(
    filter=prom.get_filter('cloudstats'),
)
# Daily samples over the last 31 days; each returned line is read below as
# {'metric': {...labels...}, 'y': [...values...]}.
stats_gpus = prom.query_prometheus_multiple(query_gpus, datetime.now() - timedelta(days=31), datetime.now(), step='1d')
for line in stats_gpus:
    try:
        gpu_qty = settings.CLOUD_INSTANCE_TYPE[line['metric']['instance_type']]['gpu']
    except KeyError:
        # Instance type missing from the metric or from the settings table
        # (or listed without a 'gpu' entry): count it as having no GPU.
        gpu_qty = 0

    # The query groups by (project_name, instance_type), so one project can
    # appear in several lines; accumulate instead of overwriting. The
    # previous code used `=+` (assignment of a unary plus), which silently
    # replaced the running total with the last instance type seen.
    project_stats = all_projects[line['metric']['project_name']]
    project_stats['gpu_qty'] = project_stats.get('gpu_qty', 0) + statistics.mean(line['y']) * gpu_qty

# Block device capacity per project: the summed
# libvirtd_domain_block_capacity divided by 1024 three times, sampled with a
# one-day step over the last 31 days and averaged over the samples.
cloudstats_filter = prom.get_filter('cloudstats')
query_block_capacity = 'sum(libvirtd_domain_block_capacity{{ {filter} }}/1024/1024/1024) by (project_name)'.format(
    filter=cloudstats_filter,
)
stats_block_capacity = prom.query_prometheus_multiple(
    query_block_capacity,
    datetime.now() - timedelta(days=31),
    datetime.now(),
    step='1d',
)
for line in stats_block_capacity:
    # One line per project_name (the query groups on that label only), so a
    # plain assignment is enough here.
    mean_capacity = statistics.mean(line['y'])
    all_projects[line['metric']['project_name']]['block_capacity'] = mean_capacity

context['total_projects'] = {'cores': 0, 'used_cores': 0, 'memory': 0, 'used_memory': 0, 'gpu_qty': 0, 'block_capacity': 0}
for project in sorted(all_projects):
context['all_projects'].append({
'id': project,
Expand All @@ -58,11 +81,15 @@ def index(request):
'used_cores': all_projects[project]['used_cores'],
'memory': all_projects[project]['memory'],
'used_memory': all_projects[project]['used_memory'],
'gpu_qty': all_projects[project]['gpu_qty'],
'block_capacity': all_projects[project]['block_capacity'],
})
context['total_projects']['cores'] += all_projects[project]['cores']
context['total_projects']['used_cores'] += all_projects[project]['used_cores']
context['total_projects']['memory'] += all_projects[project]['memory']
context['total_projects']['used_memory'] += all_projects[project]['used_memory']
context['total_projects']['gpu_qty'] += all_projects[project]['gpu_qty']
context['total_projects']['block_capacity'] += all_projects[project]['block_capacity']

context['all_projects'].append({
'id': 'total',
Expand All @@ -71,6 +98,8 @@ def index(request):
'used_cores': context['total_projects']['used_cores'],
'memory': context['total_projects']['memory'],
'used_memory': context['total_projects']['used_memory'],
'gpu_qty': context['total_projects']['gpu_qty'],
'block_capacity': context['total_projects']['block_capacity'],
})

# Grab the hypervisors hostnames
Expand Down
14 changes: 14 additions & 0 deletions userportal/settings/30-cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,20 @@

CLOUD_ALLOCATIONS_FILE = '/var/www/userportal/projects-rac2022.yml'

# The exporter provides cpu/memory information but not the GPU count, so the
# number of GPUs attached to each instance type is listed here by hand.
# Instance types absent from this table are treated as having no GPU.
# In every entry below, the trailing "xN" of the name equals the 'gpu' value.
CLOUD_INSTANCE_TYPE = {
    # gpu12 / gpu24 / gpu48 family
    'gpu12-120-850gb-a100x1': {'gpu': 1},
    'gpu24-240-1700gb-a100x2': {'gpu': 2},
    'gpu48-480-3400gb-a100x4': {'gpu': 4},
    # gpu16 / gpu32 / gpu64 family
    'gpu16-240-3375gb-a100x1': {'gpu': 1},
    'gpu32-480-6750gb-a100x2': {'gpu': 2},
    'gpu64-960-13500gb-a100x4': {'gpu': 4},
    # a100-80g family
    'gpu13-240-2500gb-a100-80gx1': {'gpu': 1},
    'gpu26-480-5000gb-a100-80gx2': {'gpu': 2},
    'gpu52-960-10000gb-a100-80gx4': {'gpu': 4},
    'gpu104-1920-20000gb-a100-80gx8': {'gpu': 8},
}

# Links in the menu
EXTERNAL_LINKS = []
#EXTERNAL_LINKS = [
Expand Down

0 comments on commit bc45ab1

Please sign in to comment.