Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

linux: assign CPU temperatures by package/core or CCD #1352

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
82 changes: 79 additions & 3 deletions linux/LibSensors.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ in the source distribution for its full text.
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand All @@ -35,6 +36,7 @@ in the source distribution for its full text.
#define sym_sensors_get_features sensors_get_features
#define sym_sensors_get_subfeature sensors_get_subfeature
#define sym_sensors_get_value sensors_get_value
#define sym_sensors_get_label sensors_get_label

#else

Expand All @@ -44,6 +46,7 @@ static const sensors_chip_name* (*sym_sensors_get_detected_chips)(const sensors_
static const sensors_feature* (*sym_sensors_get_features)(const sensors_chip_name*, int*);
static const sensors_subfeature* (*sym_sensors_get_subfeature)(const sensors_chip_name*, const sensors_feature*, sensors_subfeature_type);
static int (*sym_sensors_get_value)(const sensors_chip_name*, int, double*);
static char* (*sym_sensors_get_label)(const sensors_chip_name*, const sensors_feature *feature);

static void* dlopenHandle = NULL;

Expand Down Expand Up @@ -82,6 +85,7 @@ int LibSensors_init(void) {
resolve(sensors_get_features);
resolve(sensors_get_subfeature);
resolve(sensors_get_value);
resolve(sensors_get_label);

#undef resolve
}
Expand Down Expand Up @@ -153,6 +157,38 @@ static int tempDriverPriority(const sensors_chip_name* chip) {
return -1;
}

int LibSensors_countCCDs(void) {

#ifndef BUILD_STATIC
if (!dlopenHandle)
return 0;
#endif /* !BUILD_STATIC */

int ccds = 0;

int n = 0;
for (const sensors_chip_name* chip = sym_sensors_get_detected_chips(NULL, &n); chip; chip = sym_sensors_get_detected_chips(NULL, &n)) {
int m = 0;
for (const sensors_feature* feature = sym_sensors_get_features(chip, &m); feature; feature = sym_sensors_get_features(chip, &m)) {
if (feature->type != SENSORS_FEATURE_TEMP)
continue;

if (!feature->name || !String_startsWith(feature->name, "temp"))
continue;

char *label = sym_sensors_get_label(chip, feature);
if (label) {
if (String_startsWith(label, "Tccd")) {
ccds++;
}
free(label);
}
}
}

return ccds;
}

void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, unsigned int activeCPUs) {
assert(existingCPUs > 0 && existingCPUs < 16384);

Expand All @@ -168,6 +204,8 @@ void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, uns
unsigned int coreTempCount = 0;
int topPriority = 99;

int ccdID = 0;

int n = 0;
for (const sensors_chip_name* chip = sym_sensors_get_detected_chips(NULL, &n); chip; chip = sym_sensors_get_detected_chips(NULL, &n)) {
const int priority = tempDriverPriority(chip);
Expand All @@ -185,6 +223,8 @@ void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, uns

topPriority = priority;

int physicalID = -1;

int m = 0;
for (const sensors_feature* feature = sym_sensors_get_features(chip, &m); feature; feature = sym_sensors_get_features(chip, &m)) {
if (feature->type != SENSORS_FEATURE_TEMP)
Expand All @@ -200,9 +240,6 @@ void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, uns
/* Feature name IDs start at 1, adjust to start at 0 to match data indices */
tempID--;

if (tempID > existingCPUs)
continue;

const sensors_subfeature* subFeature = sym_sensors_get_subfeature(chip, feature, SENSORS_SUBFEATURE_TEMP_INPUT);
if (!subFeature)
continue;
Expand Down Expand Up @@ -241,6 +278,45 @@ void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, uns
}
}

char *label = sym_sensors_get_label(chip, feature);
if (label) {
bool skip = true;
/* Intel coretemp names, labels mention package and phyiscal id */
if (String_startsWith(label, "Package id ")) {
physicalID = strtoul(label + strlen("Package id "), NULL, 10);
} else if (String_startsWith(label, "Physical id ")) {
physicalID = strtoul(label + strlen("Physical id "), NULL, 10);
} else if (String_startsWith(label, "Core ")) {
int coreID = strtoul(label + strlen("Core "), NULL, 10);
for (size_t i = 1; i < existingCPUs + 1; i++) {
if (cpus[i].physicalID == physicalID && cpus[i].coreID == coreID) {
data[i] = temp;
coreTempCount++;
}
}
}

/* AMD k10temp/zenpower names, only CCD is known */
else if (String_startsWith(label, "Tccd")) {
for (size_t i = 1; i <= existingCPUs; i++) {
if (cpus[i].ccdID == ccdID) {
data[i] = temp;
coreTempCount++;
}
}
ccdID++;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use the number from the label here, e.g. Tccd5? The features might not be sorted or even monotonically increasing by 1.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need a total count of CCD however, too...

} else {
skip = false;
}

free(label);
if (skip)
continue;
}

if (tempID > existingCPUs)
continue;

/* If already set, e.g. Ryzen reporting platform temperature for each die, use the bigger one */
if (isNaN(data[tempID])) {
data[tempID] = temp;
Expand Down
1 change: 1 addition & 0 deletions linux/LibSensors.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ int LibSensors_init(void);
void LibSensors_cleanup(void);
int LibSensors_reload(void);

int LibSensors_countCCDs(void);
void LibSensors_getCPUTemperatures(CPUData* cpus, unsigned int existingCPUs, unsigned int activeCPUs);

#endif /* HEADER_LibSensors */
107 changes: 106 additions & 1 deletion linux/LinuxMachine.c
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,100 @@ static void scanCPUFrequencyFromCPUinfo(LinuxMachine* this) {
}
}

#ifdef HAVE_SENSORS_SENSORS_H
static void LinuxMachine_fetchCPUTopologyFromCPUinfo(LinuxMachine* this) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does similar to scanCPUFrequencyFromCPUinfo() scan /proc/cpuinfo.
Parsing the frequency here should not make a performance difference.
Then scanCPUFrequencyFromSysCPUFreq() can be changed to write its values first to a local array and on success override the frequencies from that array.
Thus /proc/cpuinfo is not scanned multiple times.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then we need to move the ifdef HAVE_SENSORS_SENSORS_H into that function, which makes it harder to read I think.

const Machine* super = &this->super;

FILE* file = fopen(PROCCPUINFOFILE, "r");
if (file == NULL)
return;

int cpuid = -1;
int coreid = -1;
int physicalid = -1;

int max_physicalid = -1;
int max_coreid = -1;

while (!feof(file)) {
char *buffer = String_readLine(file);
if (!buffer)
break;

if (buffer[0] == '\0') { /* empty line after each cpu */
if (cpuid >= 0 && (unsigned int)cpuid < super->existingCPUs) {
CPUData* cpuData = &(this->cpuData[cpuid + 1]);
cpuData->coreID = coreid;
cpuData->physicalID = physicalid;

if (coreid > max_coreid)
max_coreid = coreid;
if (physicalid > max_physicalid)
max_physicalid = physicalid;

cpuid = -1;
coreid = -1;
physicalid = -1;
}
} else if (String_startsWith(buffer, "processor")) {
sscanf(buffer, "processor : %d", &cpuid);
} else if (String_startsWith(buffer, "physical id")) {
sscanf(buffer, "physical id : %d", &physicalid);
} else if (String_startsWith(buffer, "core id")) {
sscanf(buffer, "core id : %d", &coreid);
}

free(buffer);
}

this->maxPhysicalID = max_physicalid;
this->maxCoreID = max_coreid;

fclose(file);
}

static void LinuxMachine_assignCCDs(LinuxMachine* this, int ccds) {
/* For AMD k10temp/zenpower, temperatures are provided for CCDs only,
which is an aggregate of multiple cores.
There's no obvious mapping between hwmon sensors and sockets and CCDs.
Assume both are iterated in order.
Hypothesis: Each CCD has same size N = #Cores/#CCD
and is assigned N coreID in sequence.
Also assume all CPUs have same number of CCDs. */

const Machine* super = &this->super;
CPUData *cpus = this->cpuData;

if (ccds == 0) {
for (size_t i = 0; i < super->existingCPUs + 1; i++) {
cpus[i].ccdID = -1;
}
return;
}

int coresPerCCD = super->existingCPUs / ccds;
cgzones marked this conversation as resolved.
Show resolved Hide resolved

int ccd = 0;
int nc = coresPerCCD;
for (int p = 0; p <= (int)this->maxPhysicalID; p++) {
for (int c = 0; c <= (int)this->maxCoreID; c++) {
for (size_t i = 1; i <= super->existingCPUs; i++) {
if (cpus[i].physicalID != p || cpus[i].coreID != c)
continue;

cpus[i].ccdID = ccd;

if (--nc == 0) {
nc = coresPerCCD;
ccd++;
}
}
}
}
}

#endif

static void LinuxMachine_scanCPUFrequency(LinuxMachine* this) {
const Machine* super = &this->super;

Expand All @@ -638,7 +732,11 @@ void Machine_scan(Machine* super) {
LinuxMachine_scanCPUTime(this);

const Settings* settings = super->settings;
if (settings->showCPUFrequency)
if (settings->showCPUFrequency
#ifdef HAVE_SENSORS_SENSORS_H
|| settings->showCPUTemperature
#endif
)
LinuxMachine_scanCPUFrequency(this);

#ifdef HAVE_SENSORS_SENSORS_H
Expand Down Expand Up @@ -687,6 +785,13 @@ Machine* Machine_new(UsersTable* usersTable, uid_t userId) {
// Initialize CPU count
LinuxMachine_updateCPUcount(this);

#ifdef HAVE_SENSORS_SENSORS_H
// Fetch CPU topology
LinuxMachine_fetchCPUTopologyFromCPUinfo(this);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe LinuxMachine_fetchCPUTopologyFromCPUinfo() can return whether the topology changed and then LibSensors_countCCDs() and LinuxMachine_assignCCDs() get only called on change?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know what this means, this code runs only once already?

int ccds = LibSensors_countCCDs();
LinuxMachine_assignCCDs(this, ccds);
#endif

return super;
}

Expand Down
9 changes: 9 additions & 0 deletions linux/LinuxMachine.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ typedef struct CPUData_ {

#ifdef HAVE_SENSORS_SENSORS_H
double temperature;

int physicalID; /* different for each CPU socket */
int coreID; /* same for hyperthreading */
int ccdID; /* same for each AMD chiplet */
#endif

bool online;
Expand All @@ -74,6 +78,11 @@ typedef struct LinuxMachine_ {

CPUData* cpuData;

#ifdef HAVE_SENSORS_SENSORS_H
int maxPhysicalID;
int maxCoreID;
#endif

memory_t totalHugePageMem;
memory_t usedHugePageMem[HTOP_HUGEPAGE_COUNT];

Expand Down