Skip to content

Commit

Permalink
Check O2 alerts and improve error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
albireox committed Sep 18, 2024
1 parent 23bfcb1 commit b988eb0
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 44 deletions.
4 changes: 2 additions & 2 deletions src/lvmcryo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ notifications:
slack_mentions:
error: '@here'
email_recipients:
- [email protected]
- [email protected]
email_server: smtp-01.lco.cl
email_from: LVM LN2 system <[email protected]>
email_reply_to: [email protected]

api_routes:
slack: http://lvm-hub.lco.cl:8080/api/slack/message
alerts: http://lvm-hub.lco.cl:8080/api/alerts
alerts: http://localhost:8888/alerts
fill_data: http://lvm-hub.lco.cl:8080/api/spectrographs/fills/measurements
register_fill: http://lvm-hub.lco.cl:8080/api/spectrographs/fills/register

Expand Down
143 changes: 103 additions & 40 deletions src/lvmcryo/handlers/ln2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
import logging
from dataclasses import dataclass, field

from typing import Coroutine
from typing import Coroutine, Literal, NoReturn, overload

import httpx
import sshkeyboard
from pydantic import BaseModel, field_serializer
from rich import box
Expand All @@ -30,7 +29,7 @@
from lvmcryo.config import ValveConfig, get_internal_config
from lvmcryo.handlers.thermistor import ThermistorMonitor
from lvmcryo.handlers.valve import ValveHandler
from lvmcryo.tools import TimerProgressBar, cancel_task, get_fake_logger
from lvmcryo.tools import TimerProgressBar, cancel_task, get_fake_logger, o2_alert


def convert_datetime_to_iso_8601_with_z_suffix(dt: datetime.datetime) -> str:
Expand Down Expand Up @@ -117,7 +116,7 @@ def __post_init__(self):
self.valve_handlers: dict[str, ValveHandler] = {}
for camera in self.cameras + [self.purge_valve]:
if camera not in self.valve_info:
raise ValueError(f"Cannot find valve infor for {camera!r}.")
raise ValueError(f"Cannot find valve info for {camera!r}.")

actor = self.valve_info[camera].actor
outlet = self.valve_info[camera].outlet
Expand All @@ -141,6 +140,7 @@ def __post_init__(self):

self.failed: bool = False
self.aborted: bool = False
self.error: str | None = None

def get_specs(self):
"""Returns a list of spectrographs being handled."""
Expand Down Expand Up @@ -181,23 +181,34 @@ async def check(
"""

# Check O2 alarms.
if not self.alerts_route:
self.log.warning("No alerts route provided. Not checking O2 alarms.")
else:
try:
self.log.info("Checking for O2 alarms ...")
if await o2_alert(self.alerts_route):
self.fail("O2 alarm detected.")
else:
self.log.debug("No O2 alarms reported.")
except Exception as err:
self.fail(f"Error checking O2 alarms: {err}")

if max_temperature is not None:
self.log.info("Checking LN2 temperatures ...")
try:
spec_temperatures = await spectrograph_temperatures()
except Exception as err:
raise RuntimeError(f"Failed reading spectrograph temperatures: {err}")
self.fail(f"Failed reading spectrograph temperatures: {err}")
else:
for camera in self.cameras:
ln2_temp = spec_temperatures[f"{camera}_ln2"]

if ln2_temp is None:
self.fail()
raise RuntimeError(f"Failed retrieving {camera!r} temperature.")
self.fail(f"Failed retrieving {camera!r} temperature.")

if ln2_temp > max_temperature:
self.fail()
raise RuntimeError(
self.fail(
f"LN2 temperature for camera {camera} is {ln2_temp:.1f} C "
f"which is above the maximum allowed temperature "
f"({max_temperature:.1f} C)."
Expand All @@ -208,18 +219,16 @@ async def check(
try:
spec_pressures = await spectrograph_pressures()
except Exception as err:
raise RuntimeError(f"Failed reading spectrograph pressures: {err}")
self.fail(f"Failed reading spectrograph pressures: {err}")
else:
for camera in self.cameras:
pressure = spec_pressures[camera]

if pressure is None:
self.fail()
raise RuntimeError(f"Failed retrieving {camera!r} pressure.")
self.fail(f"Failed retrieving {camera!r} pressure.")

if pressure > max_pressure:
self.fail()
raise RuntimeError(
self.fail(
f"Pressure for camera {camera} is {pressure} Torr "
f"which is above the maximum allowed pressure "
f"({max_pressure} Torr)."
Expand All @@ -231,8 +240,7 @@ async def check(
try:
thermistors = await read_thermistors()
except Exception as err:
self.fail()
raise RuntimeError(f"Failed reading thermistors: {err}")
self.fail(f"Failed reading thermistors: {err}")

for valve in self.valve_handlers:
thermistor_info = self.valve_info[valve].thermistor
Expand All @@ -246,12 +254,11 @@ async def check(
continue

if thermistor_info.channel is None:
raise RuntimeError(f"Thermistor channel for {valve} not defined.")
self.fail(f"Thermistor channel for {valve} not defined.")

thermistor_value = thermistors[thermistor_info.channel]
if thermistor_value is True:
self.fail()
raise RuntimeError(f"Thermistor for valve {valve} is active.")
self.fail(f"Thermistor for valve {valve} is active.")

self.log.info("All pre-fill checks passed.")

Expand Down Expand Up @@ -360,15 +367,15 @@ async def fill(

cameras = cameras or self.cameras
if cameras is None or len(cameras) == 0:
raise RuntimeError("No cameras selected for filling.")
self.fail("No cameras selected for filling.")

fill_tasks: list[Coroutine] = []

for camera in cameras:
try:
valve_handler = self.valve_handlers[camera]
except KeyError:
raise RuntimeError(f"Unable to find valve for camera {camera!r}.")
self.fail(f"Unable to find valve for camera {camera!r}.")

fill_tasks.append(
valve_handler.start_fill(
Expand Down Expand Up @@ -445,11 +452,13 @@ async def monitor_keys(key: str):
return

if key == "x" or key == "X":
self.log.warning("Aborting.")

await self.close_valves(only_active=False)

self.abort() # Prevent any future actions.
self.log.warning("Aborting now.")
# No not raise an alert here.
await self.abort(
error="Aborted by user.",
close_valves=True,
raise_error=False,
)

elif key == "enter":
self.log.warning("Finishing purge/fill.")
Expand Down Expand Up @@ -485,23 +494,29 @@ async def monitor_alerts(self):
self.log.warning("No alerts route provided. Not monitoring alerts.")
return

n_failed: int = 0

while True:
try:
async with httpx.AsyncClient(follow_redirects=True) as client:
response = await client.get(self.alerts_route)
if await o2_alert(self.alerts_route):
await self.abort(
error="O2 alarm detected: closing valves and aborting.",
close_valves=True,
raise_error=False,
)

if response.status_code != 200:
self.log.warning(f"Error reading alerts: {response.text}")
else:
alerts = response.json()
if any(alerts["o2_room_alerts"].values()):
self.log.error("O2 alarm: closing valves and aborting.")
await self.close_valves(only_active=False)
self.abort()
return
n_failed = 0

except Exception as ee:
self.log.warning(f"Error reading alerts: {ee}")
n_failed += 1

if n_failed >= 10:
await self.abort(
error="Too many errors reading alerts. Aborting.",
close_valves=True,
raise_error=False,
)

await asyncio.sleep(3)

Expand Down Expand Up @@ -531,16 +546,64 @@ async def clear(self):

self._alerts_monitor_task = await cancel_task(self._alerts_monitor_task)

def fail(self):
@overload
def fail(self, error: str, raise_error: bool = True) -> NoReturn: ...

@overload
def fail(self, error=None, raise_error: bool = True) -> None: ...

def fail(
self,
error: str | None = None,
raise_error: bool = True,
) -> NoReturn | None:
"""Sets the fail flag and event time."""

self.failed = True
self.event_times.fail_time = get_now()

def abort(self):
if error:
self.error = error
self.log.error(error)
if raise_error:
raise RuntimeError(error)

@overload
async def abort(
self,
error: str | None,
close_valves: bool,
raise_error: bool = True,
) -> NoReturn: ...

@overload
async def abort(
self,
error: str | None,
close_valves: bool,
raise_error: Literal[True] = True,
) -> NoReturn: ...

@overload
async def abort(
self,
error: str | None,
close_valves: bool,
raise_error: Literal[False] = False,
) -> None: ...

async def abort(
self,
error: str | None = None,
close_valves: bool = False,
raise_error: bool = True,
) -> NoReturn | None:
"""Aborts the fill."""

self.aborted = True
self.event_times.abort_time = get_now()

self.fail()
if close_valves:
await self.close_valves(only_active=False)

self.fail(error=error or "Aborted.", raise_error=raise_error)
4 changes: 2 additions & 2 deletions src/lvmcryo/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ async def ln2_runner(

if handler.failed or handler.aborted:
await handler.clear()
raise RuntimeError("Purge failed or was aborted.")
raise RuntimeError(handler.error or "Purge failed or was aborted.")

if config.action == Actions.purge_fill or config.action == Actions.fill:
await notifier.post_to_slack("Starting fill.")
Expand All @@ -137,7 +137,7 @@ async def ln2_runner(

if handler.failed or handler.aborted:
await handler.clear()
raise RuntimeError("Fill failed or was aborted.")
raise RuntimeError(handler.error or "Fill failed or was aborted.")

await notifier.post_to_slack(f"LN₂ `{action}` completed successfully.")
await handler.clear()
Expand Down
18 changes: 18 additions & 0 deletions src/lvmcryo/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from typing import Any

import httpx
from jinja2 import Environment, FileSystemLoader
from rich.console import Console
from rich.progress import BarColumn, MofNCompleteColumn, Progress, TaskID, TextColumn
Expand Down Expand Up @@ -231,3 +232,20 @@ def get_fake_logger():
logger.disabled = True

return logger


async def o2_alert(route: str = "http://lvm-hub.lco.cl:8080/api/alerts"):
"""Is there an active O2 alert?"""

try:
async with httpx.AsyncClient(follow_redirects=True) as client:
response = await client.get(route)

if response.status_code != 200:
raise RuntimeError(response.text)
else:
alerts = response.json()
return alerts["o2_alert"]

except Exception as ee:
raise RuntimeError(f"Error reading alerts: {ee}")

0 comments on commit b988eb0

Please sign in to comment.