Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better energy checkpointing #1236

Merged
merged 13 commits into from
Nov 13, 2024
4 changes: 4 additions & 0 deletions c_common/models/chip_power_monitor/src/chip_power_monitor.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ static void sample_in_slot(UNUSED uint unused0, UNUSED uint unused1) {
if (simulation_is_finished()) {
simulation_handle_pause_resume(resume_callback);

if (sample_count > 0) {
record_aggregate_sample();
}

recording_finalise();

// Invert the time calculation so that any time read is correct
Expand Down
27 changes: 0 additions & 27 deletions spinn_front_end_common/data/fec_data_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ class _FecDataModel(object):
"_database_file_path",
"_database_socket_addresses",
"_ds_database_path",
"_energy_checkpoints",
"_executable_targets",
"_executable_types",
"_first_machine_time_step",
Expand Down Expand Up @@ -191,7 +190,6 @@ def _soft_reset(self) -> None:
self._first_machine_time_step = 0
self._run_step: Optional[int] = None
self._n_run_steps: Optional[int] = None
self._energy_checkpoints: List[int] = []

def _clear_notification_protocol(self) -> None:
if self._notification_protocol:
Expand Down Expand Up @@ -1357,28 +1355,3 @@ def iterate_live_output_devices(cls) -> Iterable[LiveOutputDevice]:
:rtype: iterable(LiveOutputDevice)
"""
return iter(cls.__fec_data._live_output_devices)

@classmethod
def add_energy_checkpoint(cls, checkpoint_ms: int):
"""
Add an energy checkpoint.

:param checkpoint: The checkpoint to be added in milliseconds
"""
cls.__fec_data._energy_checkpoints.append(checkpoint_ms)

@classmethod
def iterate_energy_checkpoints(cls) -> Iterable[int]:
"""
Iterate over energy checkpoints.

:rtype: iterable(int)
"""
return iter(cls.__fec_data._energy_checkpoints)

@classmethod
def clear_energy_checkpoints(cls) -> None:
"""
Clear all energy checkpoints.
"""
cls.__fec_data._energy_checkpoints.clear()
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from spinn_front_end_common.utilities.base_database import BaseDatabase

_SECONDS_TO_MICRO_SECONDS_CONVERSION = 1000
#: Name of the database in the data folder
PROVENANCE_CORE_KEY = "Power_Monitor_Core"


def _timestamp():
Expand Down Expand Up @@ -575,3 +575,20 @@ def get_core_name(self, x: int, y: int, p: int) -> Optional[str]:
""", (x, y, p)):
return str(row["core_name"], 'utf8')
return None

def get_power_monitor_core(self, x: int, y: int) -> int:
    """
    Gets the power monitor core for chip x, y

    The value is read from the ``monitor_provenance`` table, from the row
    for this chip whose description is :py:const:`PROVENANCE_CORE_KEY`.

    :param int x: x coordinate of the chip
    :param int y: y coordinate of the chip
    :return: the stored value for the chip, converted to an int
    :rtype: int
    :raises LookupError: if there is no matching row for the chip
    """
    for row in self.execute(
            """
            SELECT the_value
            FROM monitor_provenance
            WHERE x = ? AND y = ? AND description = ?
            """, (x, y, PROVENANCE_CORE_KEY)):
        # Only the first matching row is used
        return int(row["the_value"])
    raise LookupError(f"No power monitor core for {x=} {y=}")
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

from collections import defaultdict
from typing import Final, Optional, cast, Dict, Tuple
from spinn_utilities.config_holder import get_config_bool
import numpy
from spinn_utilities.config_holder import get_config_bool, get_config_int
from spinn_machine import Machine
from spinn_machine.version.abstract_version import (
AbstractVersion, ChipActiveTime, RouterPackets)
Expand All @@ -27,11 +28,17 @@
.load_data_specification import load_using_advanced_monitors
from spinn_front_end_common.utility_models\
.chip_power_monitor_machine_vertex import (
PROVENANCE_TIME_KEY, ChipPowerMonitorMachineVertex)
RECORDING_CHANNEL, ChipPowerMonitorMachineVertex)
from spinn_front_end_common.interface.buffer_management.storage_objects \
import BufferDatabase
from spinn_front_end_common.abstract_models import AbstractHasAssociatedBinary

#: milliseconds per second
_MS_PER_SECOND: Final = 1000.0
#: microseconds per millisecond
_US_PER_MS: Final = 1000.0
#: microseconds per second
_US_PER_SECOND: Final = 1000000.0


def compute_energy_used(checkpoint: Optional[int] = None) -> PowerUsed:
Expand Down Expand Up @@ -106,7 +113,8 @@ def compute_energy_used(checkpoint: Optional[int] = None) -> PowerUsed:
n_active_cores += 1
n_active_chips = len(active_cores)

run_chip_active_time = _extract_cores_active_time(checkpoint, active_cores)
run_chip_active_time = _extract_cores_active_time(
checkpoint, active_cores, version)
load_chip_active_time = _make_extra_monitor_core_use(
data_loading_ms, machine, version.n_scamp_cores + 2,
version.n_scamp_cores + 1)
Expand Down Expand Up @@ -163,15 +171,37 @@ def _extract_router_packets(


def _extract_cores_active_time(
checkpoint: Optional[int],
active_cores: Dict[Tuple[int, int], int]) -> ChipActiveTime:
key = PROVENANCE_TIME_KEY
if checkpoint is not None:
key = f"{PROVENANCE_TIME_KEY}_{checkpoint}"
with ProvenanceReader() as db:
data = {(x, y): (value, active_cores[x, y])
for (x, y, value) in db.get_monitor_by_chip(key)}
return data
checkpoint: Optional[int], active_cores: Dict[Tuple[int, int], int],
version: AbstractVersion) -> ChipActiveTime:
sampling_frequency = get_config_int("EnergyMonitor", "sampling_frequency")

chip_activity: ChipActiveTime = {}
with BufferDatabase() as buff_db:
for (x, y), n_cores in active_cores.items():
# Find the core that was used on this chip for power monitoring
p = buff_db.get_power_monitor_core(x, y)
# Time per sample in seconds (sampling period is in microseconds)
time_for_recorded_sample_s = sampling_frequency / _US_PER_SECOND
data, _missing = buff_db.get_recording(x, y, p, RECORDING_CHANNEL)
results = numpy.frombuffer(data, dtype=numpy.uint32).reshape(
-1, version.max_cores_per_chip + 1)
# Record times in milliseconds (sampling period is in microseconds)
record_times = results[:, 0] * sampling_frequency / _US_PER_MS
# The remaining columns are the counts of active / inactive at
# each sample point
activity = results[:, 1:].astype(numpy.float64)
# Set the activity of *this* core to 0, as we don't want to
# measure that!
physical_core = FecDataView.get_physical_core_id((x, y), p)
activity[:, physical_core] = 0
# Convert to actual active time, assuming the core is fully active
# or fully inactive between samples
activity_times = activity * time_for_recorded_sample_s
# If checkpoint is specified, filter the times
if checkpoint is not None:
activity_times = activity_times[record_times < checkpoint]
chip_activity[x, y] = (activity_times.sum(), n_cores)
return chip_activity


def _make_extra_monitor_core_use(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from spinn_utilities.config_holder import get_config_int
from spinn_utilities.progress_bar import ProgressBar
from pacman.model.placements import Placement, Placements
from spinn_front_end_common.data import FecDataView
Expand All @@ -29,10 +28,8 @@ def sample_chip_power_monitor() -> ChipPowerMonitorMachineVertex:

:rtype: ChipPowerMonitorMachineVertex
"""
sampling_frequency = get_config_int("EnergyMonitor", "sampling_frequency")
return ChipPowerMonitorMachineVertex(
"Sample ChipPowerMonitorMachineVertex",
sampling_frequency=sampling_frequency)
"Sample ChipPowerMonitorMachineVertex")


def insert_chip_power_monitors_to_graphs(placements: Placements):
Expand All @@ -41,15 +38,13 @@ def insert_chip_power_monitors_to_graphs(placements: Placements):

:param ~pacman.model.placements.Placements placements:
"""
sampling_frequency = get_config_int("EnergyMonitor", "sampling_frequency")
machine = FecDataView.get_machine()
# create progress bar
progress = ProgressBar(
machine.n_chips, "Adding Chip power monitors to Graph")

for chip in progress.over(machine.chips):
vertex = ChipPowerMonitorMachineVertex(
f"ChipPowerMonitor on {chip.x}, {chip.y}",
sampling_frequency=sampling_frequency)
f"ChipPowerMonitor on {chip.x}, {chip.y}")
p = pick_core_for_system_placement(placements, chip)
placements.add_placement(Placement(vertex, chip.x, chip.y, p))
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
from enum import IntEnum
from typing import List

import numpy

from spinn_utilities.config_holder import get_config_int
from spinn_utilities.log import FormatAdapter
from spinn_utilities.overrides import overrides
Expand All @@ -44,23 +42,21 @@
locate_memory_region_for_placement)
from spinn_front_end_common.interface.simulation.simulation_utilities import (
get_simulation_header_array)
from spinn_front_end_common.interface.provenance import (
AbstractProvidesProvenanceDataFromMachine)
from spinn_front_end_common.interface.buffer_management.storage_objects\
.buffer_database import PROVENANCE_CORE_KEY

logger = FormatAdapter(logging.getLogger(__name__))
BINARY_FILE_NAME = "chip_power_monitor.aplx"
PROVENANCE_COUNT_KEY = "Power_Monitor_Total_Activity_Count"
PROVENANCE_TIME_KEY = "Power_Monitor_Total_Activity_Time"
RECORDING_CHANNEL = 0

RECORDING_SIZE_PER_ENTRY = 18 * BYTES_PER_WORD
RECORDING_SIZE_PER_ENTRY = 19 * BYTES_PER_WORD
DEFAULT_MALLOCS_USED = 3
CONFIG_SIZE_IN_BYTES = 2 * BYTES_PER_WORD


class ChipPowerMonitorMachineVertex(
MachineVertex, AbstractHasAssociatedBinary,
AbstractGeneratesDataSpecification, AbstractReceiveBuffersToHost,
AbstractProvidesProvenanceDataFromMachine):
AbstractGeneratesDataSpecification, AbstractReceiveBuffersToHost):
"""
Machine vertex for C code representing functionality to record
idle times in a machine graph.
Expand All @@ -69,7 +65,7 @@ class ChipPowerMonitorMachineVertex(
This is an unusual machine vertex, in that it has no associated
application vertex.
"""
__slots__ = ("_sampling_frequency", "__n_samples_per_recording")
__slots__ = ("__sampling_frequency", "__n_samples_per_recording")

class _REGIONS(IntEnum):
# data regions
Expand All @@ -80,32 +76,24 @@ class _REGIONS(IntEnum):
#: which channel in the recording region has the recorded samples
_SAMPLE_RECORDING_CHANNEL = 0

def __init__(self, label: str, sampling_frequency: int):
def __init__(self, label: str):
"""
:param str label: vertex label
:param int sampling_frequency: how often to sample, in microseconds
"""
super().__init__(
label=label, app_vertex=None, vertex_slice=None)
self._sampling_frequency = sampling_frequency
self.__sampling_frequency = get_config_int(
"EnergyMonitor", "sampling_frequency")
self.__n_samples_per_recording = get_config_int(
"EnergyMonitor", "n_samples_per_recording_entry")

@property
def sampling_frequency(self) -> int:
"""
How often to sample, in microseconds.

:rtype: int
"""
return self._sampling_frequency

@property
@overrides(MachineVertex.sdram_required)
def sdram_required(self) -> AbstractSDRAM:
# The number of samples per step does not have to be an int
samples_per_step = (FecDataView.get_hardware_time_step_us() /
self._sampling_frequency)
self.__sampling_frequency)
recording_per_step = samples_per_step / self.__n_samples_per_recording
max_recording_per_step = math.ceil(recording_per_step)
overflow_recordings = max_recording_per_step - recording_per_step
Expand Down Expand Up @@ -146,6 +134,8 @@ def generate_data_specification(
# End-of-Spec:
spec.end_specification()

self.__write_recording_metadata(placement)

def _write_configuration_region(self, spec: DataSpecificationGenerator):
"""
Write the data needed by the C code to configure itself.
Expand All @@ -155,7 +145,7 @@ def _write_configuration_region(self, spec: DataSpecificationGenerator):
"""
spec.switch_write_focus(region=self._REGIONS.CONFIG)
spec.write_value(self.__n_samples_per_recording)
spec.write_value(self._sampling_frequency)
spec.write_value(self.__sampling_frequency)

def _write_setup_info(self, spec):
"""
Expand Down Expand Up @@ -228,63 +218,12 @@ def _deduce_sdram_requirements_per_timer_tick(self) -> int:
:rtype: int
"""
recording_time = (
self._sampling_frequency * self.__n_samples_per_recording)
self.__sampling_frequency * self.__n_samples_per_recording)
n_entries = math.floor(FecDataView.get_hardware_time_step_us() /
recording_time)
return int(math.ceil(n_entries * RECORDING_SIZE_PER_ENTRY))

def get_recorded_data(self, placement: Placement) -> numpy.ndarray:
"""
Get data from SDRAM given placement and buffer manager.
Also arranges for provenance data to be available.

:param ~pacman.model.placements.Placement placement:
the location on machine to get data from
:return: results, an array with 1 dimension of uint32 values
:rtype: ~numpy.ndarray
"""
# for buffering output info is taken form the buffer manager
# get raw data as a byte array
buffer_manager = FecDataView.get_buffer_manager()
record_raw, data_missing = buffer_manager.get_recording(
placement, self._SAMPLE_RECORDING_CHANNEL)
if data_missing:
logger.warning(
"Chip Power monitor has lost data on chip({}, {})",
placement.x, placement.y)
results = numpy.frombuffer(record_raw, dtype="uint32").reshape(-1, 19)
return results

@overrides(AbstractProvidesProvenanceDataFromMachine
.get_provenance_data_from_machine)
def get_provenance_data_from_machine(self, placement: Placement):
# We do this to make sure we actually store the data
results = self.get_recorded_data(placement)
# Get record times in milliseconds
record_times = results[:, 0] * self._sampling_frequency / 1000
activity = results[:, 1:].astype("float")
physical_p = FecDataView().get_physical_core_id(
placement.xy, placement.p)
# Set the activity of *this* core to 0, as we don't want to measure
# that!
activity[:, physical_p] = 0
time_for_recorded_sample_s = self._sampling_frequency / 1000000
activity_times = activity * time_for_recorded_sample_s
for checkpoint in FecDataView().iterate_energy_checkpoints():
# Find all activity up to the check point
activity_before = activity[record_times < checkpoint].sum()
activity_time = activity_times[record_times < checkpoint].sum()
with ProvenanceWriter() as db:
db.insert_monitor(
placement.x, placement.y,
f"{PROVENANCE_COUNT_KEY}_{checkpoint}", activity_before)
db.insert_monitor(
placement.x, placement.y,
f"{PROVENANCE_TIME_KEY}_{checkpoint}", activity_time)
activity_count = activity.sum()
activity_time = activity_times.sum()
def __write_recording_metadata(self, placement: Placement) -> None:
with ProvenanceWriter() as db:
db.insert_monitor(
placement.x, placement.y, PROVENANCE_COUNT_KEY, activity_count)
db.insert_monitor(
placement.x, placement.y, PROVENANCE_TIME_KEY, activity_time)
placement.x, placement.y, PROVENANCE_CORE_KEY, placement.p)