fix: stopping all scenarios when not enough resources plus websocket errors
FerTV committed Dec 18, 2024
1 parent cbf6d92 commit bc7126d
Showing 7 changed files with 151 additions and 97 deletions.
3 changes: 0 additions & 3 deletions Makefile
@@ -93,9 +93,6 @@ lock: ## Update the lock file
.PHONY: check
check: ## Run code quality tools
@echo "🛠️ Running code quality checks"
@echo "🔍 Checking uv lock file consistency"
@$(UV) sync
@echo "🚨 Linting code with pre-commit"
@$(UV) run pre-commit run -a

.PHONY: check-plus
43 changes: 23 additions & 20 deletions nebula/controller.py
@@ -12,7 +12,6 @@

import docker
import psutil
-import torch
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
@@ -52,90 +51,94 @@ async def read_root():
async def get_status():
return {"status": "NEBULA Controller API is running"}


@app.get("/resources")
async def get_resources():
devices = 0
gpu_memory_percent = []

# Obtain available RAM
memory_info = await asyncio.to_thread(psutil.virtual_memory)

if importlib.util.find_spec("pynvml") is not None:
try:
import pynvml

await asyncio.to_thread(pynvml.nvmlInit)
devices = await asyncio.to_thread(pynvml.nvmlDeviceGetCount)

# Obtain GPU info
for i in range(devices):
handle = await asyncio.to_thread(pynvml.nvmlDeviceGetHandleByIndex, i)
memory_info_gpu = await asyncio.to_thread(pynvml.nvmlDeviceGetMemoryInfo, handle)
memory_used_percent = (memory_info_gpu.used / memory_info_gpu.total) * 100
gpu_memory_percent.append(memory_used_percent)

except Exception: # noqa: S110
pass

return {
# "cpu_percent": psutil.cpu_percent(),
"gpus" : devices,
"memory_percent" : memory_info.percent,
"gpus": devices,
"memory_percent": memory_info.percent,
"gpu_memory_percent": gpu_memory_percent,
}


@app.get("/least_memory_gpu")
async def get_least_memory_gpu():
    gpu_with_least_memory_index = None
    max_memory_used_percent = 0  # highest GPU memory usage seen so far

if importlib.util.find_spec("pynvml") is not None:
try:
import pynvml

await asyncio.to_thread(pynvml.nvmlInit)
devices = await asyncio.to_thread(pynvml.nvmlDeviceGetCount)

# Obtain GPU info
for i in range(devices):
handle = await asyncio.to_thread(pynvml.nvmlDeviceGetHandleByIndex, i)
memory_info = await asyncio.to_thread(pynvml.nvmlDeviceGetMemoryInfo, handle)
memory_used_percent = (memory_info.used / memory_info.total) * 100

                # Track the GPU with the least free memory (highest usage)
if memory_used_percent > max_memory_used_percent:
max_memory_used_percent = memory_used_percent
gpu_with_least_memory_index = i

except Exception: # noqa: S110
pass

return {
"gpu_with_least_memory_index": gpu_with_least_memory_index,
}


@app.get("/available_gpus/")
async def get_available_gpu():
available_gpus = []

if importlib.util.find_spec("pynvml") is not None:
try:
import pynvml

await asyncio.to_thread(pynvml.nvmlInit)
devices = await asyncio.to_thread(pynvml.nvmlDeviceGetCount)

# Obtain GPU info
for i in range(devices):
handle = await asyncio.to_thread(pynvml.nvmlDeviceGetHandleByIndex, i)
memory_info = await asyncio.to_thread(pynvml.nvmlDeviceGetMemoryInfo, handle)
memory_used_percent = (memory_info.used / memory_info.total) * 100

# Obtain available GPUs
if memory_used_percent < 5:
available_gpus.append(i)

            return {
                "available_gpus": available_gpus,
            }
        except Exception:  # noqa: S110
            pass
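
For context, a rough sketch of how a client (for example the frontend) might poll these resource endpoints before deploying a scenario. The controller URL, port, and memory threshold below are illustrative assumptions, not values taken from this commit.

import requests

CONTROLLER_URL = "http://127.0.0.1:5050"  # hypothetical controller address and port

def enough_resources(max_memory_percent: float = 90.0) -> bool:
    """Ask the controller for current resource usage and decide whether a
    new scenario should be deployed. Threshold and URL are placeholders."""
    resources = requests.get(f"{CONTROLLER_URL}/resources", timeout=5).json()
    if resources["memory_percent"] >= max_memory_percent:
        return False
    gpu_usage = resources.get("gpu_memory_percent", [])
    # If GPUs are reported, also require at least one with spare memory
    if resources.get("gpus", 0) > 0 and gpu_usage and min(gpu_usage) >= max_memory_percent:
        return False
    return True

if __name__ == "__main__":
    print("Deploy allowed:", enough_resources())

A caller could also combine this with /available_gpus/ or /least_memory_gpu to pick a device index before starting node containers.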

@@ -683,7 +686,7 @@ def run_frontend(self):
f"{self.root_path}:/nebula",
"/var/run/docker.sock:/var/run/docker.sock",
f"{self.root_path}/nebula/frontend/config/nebula:/etc/nginx/sites-available/default",
f"{self.db_dir}/databases:/nebula/nebula/frontend/databases"
f"{self.db_dir}/databases:/nebula/nebula/frontend/databases",
],
extra_hosts={"host.docker.internal": "host-gateway"},
port_bindings={80: self.frontend_port, 8080: self.statistics_port},
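
The binds, port_bindings, and extra_hosts arguments above appear to follow docker-py's low-level host-config parameters. A minimal standalone sketch of that pattern is shown below; the image name, ports, and paths are placeholders for illustration, not NEBULA's actual values.

import docker

client = docker.APIClient(base_url="unix://var/run/docker.sock")

# Volume binds, port mappings, and extra hosts are grouped into a host config
host_config = client.create_host_config(
    binds=["/srv/app:/app"],  # host_path:container_path
    port_bindings={80: 6060},  # container_port: host_port
    extra_hosts={"host.docker.internal": "host-gateway"},
)

container = client.create_container(
    image="nginx:alpine",
    ports=[80],  # container ports to expose
    host_config=host_config,
    name="example-frontend",
)
client.start(container)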