From b61779c667843b04e97b7daeff64f0f5347fb81e Mon Sep 17 00:00:00 2001 From: Kirill Date: Mon, 18 Nov 2024 09:41:22 +0000 Subject: [PATCH 01/43] feat: init implementation for metrics collection --- plugins/gcp/fix_plugin_gcp/__init__.py | 7 ++- plugins/gcp/fix_plugin_gcp/collector.py | 61 +++++++++++++++++++ plugins/gcp/fix_plugin_gcp/config.py | 6 +- plugins/gcp/fix_plugin_gcp/resources/base.py | 32 ++++++++++ .../fix_plugin_gcp/resources/monitoring.py | 0 5 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 plugins/gcp/fix_plugin_gcp/resources/monitoring.py diff --git a/plugins/gcp/fix_plugin_gcp/__init__.py b/plugins/gcp/fix_plugin_gcp/__init__.py index 4be20f3dcd..538bdf09cb 100644 --- a/plugins/gcp/fix_plugin_gcp/__init__.py +++ b/plugins/gcp/fix_plugin_gcp/__init__.py @@ -10,6 +10,7 @@ from fixlib.core.actions import CoreFeedback from fixlib.graph import Graph, MaxNodesExceeded from fixlib.logger import log, setup_logger +from fixlib.types import Json from .collector import GcpProjectCollector from .config import GcpConfig from .resources.base import GcpProject @@ -77,10 +78,11 @@ def collect(self) -> None: project_id, feedback, cloud, + self.task_data or {}, max_resources_per_account=self.max_resources_per_account, **collect_args, ) - for project_id in credentials.keys() + for project_id in credentials ] for future in futures.as_completed(wait_for): project_graph = future.result() @@ -98,6 +100,7 @@ def collect_project( project_id: str, core_feedback: CoreFeedback, cloud: Cloud, + task_data: Json, args: Optional[Namespace] = None, running_config: Optional[RunningConfig] = None, credentials: Optional[Dict[str, Any]] = None, @@ -130,7 +133,7 @@ def collect_project( try: core_feedback.progress_done(project_id, 0, 1) - gpc = GcpProjectCollector(Config.gcp, cloud, project, core_feedback, max_resources_per_account) + gpc = GcpProjectCollector(Config.gcp, cloud, project, core_feedback, task_data, max_resources_per_account) try: gpc.collect() except MaxNodesExceeded as ex: diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index cdd59e4711..29d93e9c5b 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone import logging from concurrent.futures import ThreadPoolExecutor from typing import Type, List, Any, Optional @@ -20,6 +21,8 @@ from fixlib.baseresources import Cloud from fixlib.core.actions import CoreFeedback, ErrorAccumulator from fixlib.graph import Graph +from fixlib.json import value_in_path +from fixlib.types import Json log = logging.getLogger("fix.plugins.gcp") all_resources: List[Type[GcpResource]] = ( @@ -58,6 +61,7 @@ def __init__( cloud: Cloud, project: GcpProject, core_feedback: CoreFeedback, + task_data: Json, max_resources_per_account: Optional[int] = None, ) -> None: self.config = config @@ -67,6 +71,7 @@ def __init__( self.error_accumulator = ErrorAccumulator() self.graph = Graph(root=self.project, max_nodes=max_resources_per_account) self.credentials = Credentials.get(self.project.id) + self.task_data = task_data def collect(self) -> None: with ThreadPoolExecutor( @@ -77,7 +82,20 @@ def collect(self) -> None: # It should only be used in scenarios, where it is safe to do so. # This executor is shared between all regions. 
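An aside on the `task_data` that patch 01 threads through to `GcpProjectCollector`: a plausible payload shape, inferred from how `get_last_run` reads it in the hunk just below (the `step`/`timing` keys and the epoch-seconds value are assumptions from that read path, not a documented contract):

    # Hypothetical task_data, shaped so that
    # value_in_path(td, ["timing", td.get("step", ""), "started_at"])
    # resolves to the start time of the previous collect run:
    task_data = {
        "step": "collect",
        "timing": {
            "collect": {"started_at": 1731922882},  # epoch seconds, converted
        },                                          # via datetime.fromtimestamp()
    }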
shared_queue = ExecutorQueue(executor, self.project.safe_name) + + def get_last_run() -> Optional[datetime]: + td = self.task_data + if not td: + return None + timestamp = value_in_path(td, ["timing", td.get("step", ""), "started_at"]) + + if timestamp is None: + return None + + return datetime.fromtimestamp(timestamp, timezone.utc) + project_global_region = GcpRegion.fallback_global_region(self.project) + last_run = get_last_run() global_builder = GraphBuilder( self.graph, self.cloud, @@ -87,6 +105,8 @@ def collect(self) -> None: self.core_feedback, self.error_accumulator, project_global_region, + config=self.config, + last_run_started_at=last_run, ) global_builder.add_node(project_global_region, {}) @@ -113,6 +133,13 @@ def collect(self) -> None: self.error_accumulator.report_all(global_builder.core_feedback) + if global_builder.config.collect_usage_metrics: + try: + log.info(f"[GCP:{self.project.id}] Collect usage metrics.") + self.collect_usage_metrics(global_builder) + except Exception as e: + log.warning(f"Failed to collect usage metrics in project {self.project.id}: {e}") + shared_queue.wait_for_submitted_work() log.info(f"[GCP:{self.project.id}] Connect resources and create edges.") # connect nodes for node, data in list(self.graph.nodes(data=True)): @@ -131,6 +158,40 @@ def collect(self) -> None: self.core_feedback.progress_done(self.project.id, 1, 1, context=[self.cloud.id]) log.info(f"[GCP:{self.project.id}] Collecting resources done.") + def collect_usage_metrics(self, builder: GraphBuilder) -> None: + metrics_queries = defaultdict(list) + two_hours = timedelta(hours=2) + thirty_minutes = timedelta(minutes=30) + lookup_map = {} + for resource in builder.graph.nodes: + if not isinstance(resource, AwsResource): + continue + # region can be overridden in the query: s3 is global, but need to be queried per region + if region := cast(AwsRegion, resource.region()): + lookup_map[resource.id] = resource + resource_queries: List[cloudwatch.AwsCloudwatchQuery] = resource.collect_usage_metrics(builder) + for query in resource_queries: + query_region = query.region or region + start = query.start_delta or builder.metrics_delta + if query.period and query.period < thirty_minutes: + start = min(start, two_hours) + metrics_queries[(query_region, start)].append(query) + for (region, start), queries in metrics_queries.items(): + + def collect_and_set_metrics( + start: timedelta, region: AwsRegion, queries: List[cloudwatch.AwsCloudwatchQuery] + ) -> None: + start_at = builder.created_at - start + try: + result = cloudwatch.AwsCloudwatchMetricData.query_for_multiple( + builder.for_region(region), start_at, builder.created_at, queries + ) + cloudwatch.update_resource_metrics(lookup_map, result) + except Exception as e: + log.warning(f"Error occurred in region {region}: {e}") + + builder.submit_work("cloudwatch", collect_and_set_metrics, start, region, queries) + def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: remove_nodes = set() diff --git a/plugins/gcp/fix_plugin_gcp/config.py b/plugins/gcp/fix_plugin_gcp/config.py index 28667303b6..afb0463998 100644 --- a/plugins/gcp/fix_plugin_gcp/config.py +++ b/plugins/gcp/fix_plugin_gcp/config.py @@ -1,6 +1,6 @@ from fixlib.proc import num_default_threads from attrs import define, field -from typing import List, ClassVar +from typing import List, ClassVar, Optional @define @@ -31,6 +31,10 @@ class GcpConfig: "If false, the error is logged and the resource is 
skipped." }, ) + collect_usage_metrics: Optional[bool] = field( + default=True, + metadata={"description": "Collect resource usage metrics via GCP Monitoring, enabled by default"}, + ) def should_collect(self, name: str) -> bool: if self.collect: diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index da56cbc517..e053db5736 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -1,5 +1,6 @@ from __future__ import annotations +from datetime import datetime, timedelta import json import logging from concurrent.futures import Future @@ -8,6 +9,7 @@ from typing import Callable, List, ClassVar, Optional, TypeVar, Type, Any, Dict, Set, Tuple from attr import define, field +from fix_plugin_gcp.config import GcpConfig from google.auth.credentials import Credentials as GoogleAuthCredentials from googleapiclient.errors import HttpError @@ -32,6 +34,8 @@ from fixlib.types import Json from fixinventorydata.cloud import regions as cloud_region_data +from fixlib.utils import utc + log = logging.getLogger("fix.plugins.gcp") @@ -81,7 +85,9 @@ def __init__( core_feedback: CoreFeedback, error_accumulator: ErrorAccumulator, fallback_global_region: GcpRegion, + config: GcpConfig, region: Optional[GcpRegion] = None, + last_run_started_at: Optional[datetime] = None, graph_nodes_access: Optional[Lock] = None, graph_edges_access: Optional[Lock] = None, ) -> None: @@ -95,12 +101,38 @@ def __init__( self.core_feedback = core_feedback self.error_accumulator = error_accumulator self.fallback_global_region = fallback_global_region + self.config = config self.region_by_name: Dict[str, GcpRegion] = {} self.region_by_zone_name: Dict[str, GcpRegion] = {} self.zone_by_name: Dict[str, GcpZone] = {} + self.last_run_started_at = last_run_started_at self.graph_nodes_access = graph_nodes_access or Lock() self.graph_edges_access = graph_edges_access or Lock() + if last_run_started_at: + now = utc() + + # limit the metrics to the last hour + if now - last_run_started_at > timedelta(hours=2): + start = now - timedelta(hours=2) + else: + start = last_run_started_at + + delta = now - start + + min_delta = max(delta, timedelta(seconds=60)) + # in case the last collection happened too quickly, raise the metrics timedelta to 60s, + if min_delta != delta: + start = now - min_delta + delta = min_delta + else: + now = utc() + delta = timedelta(hours=1) + start = now - delta + + self.metrics_start = start + self.metrics_delta = delta + def submit_work(self, fn: Callable[..., T], *args: Any, **kwargs: Any) -> Future[T]: """ Use this method for work that can be done in parallel. 
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py new file mode 100644 index 0000000000..e69de29bb2 From d57d41dc371452006abf80b39d991fdefb186f6b Mon Sep 17 00:00:00 2001 From: Kirill Date: Tue, 19 Nov 2024 17:01:07 +0000 Subject: [PATCH 02/43] feat: finished fetching implementation --- plugins/gcp/fix_plugin_gcp/collector.py | 24 +- plugins/gcp/fix_plugin_gcp/resources/base.py | 9 +- .../fix_plugin_gcp/resources/monitoring.py | 210 ++++++++++++++++++ plugins/gcp/fix_plugin_gcp/utils.py | 22 +- 4 files changed, 250 insertions(+), 15 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 29d93e9c5b..5b1c81e809 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,7 +1,8 @@ -from datetime import datetime, timezone +from collections import defaultdict +from datetime import datetime, timedelta, timezone import logging from concurrent.futures import ThreadPoolExecutor -from typing import Type, List, Any, Optional +from typing import Type, List, Any, Optional, cast from fix_plugin_gcp.config import GcpConfig from fix_plugin_gcp.gcp_client import GcpApiSpec @@ -15,6 +16,7 @@ firestore, filestore, cloudfunctions, + monitoring, ) from fix_plugin_gcp.resources.base import GcpResource, GcpProject, ExecutorQueue, GraphBuilder, GcpRegion, GcpZone from fix_plugin_gcp.utils import Credentials @@ -164,33 +166,33 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: thirty_minutes = timedelta(minutes=30) lookup_map = {} for resource in builder.graph.nodes: - if not isinstance(resource, AwsResource): + if not isinstance(resource, GcpResource): continue # region can be overridden in the query: s3 is global, but need to be queried per region - if region := cast(AwsRegion, resource.region()): + if region := cast(GcpRegion, resource.region()): lookup_map[resource.id] = resource - resource_queries: List[cloudwatch.AwsCloudwatchQuery] = resource.collect_usage_metrics(builder) + resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) for query in resource_queries: query_region = query.region or region - start = query.start_delta or builder.metrics_delta + start = builder.metrics_delta if query.period and query.period < thirty_minutes: start = min(start, two_hours) metrics_queries[(query_region, start)].append(query) for (region, start), queries in metrics_queries.items(): def collect_and_set_metrics( - start: timedelta, region: AwsRegion, queries: List[cloudwatch.AwsCloudwatchQuery] + start: timedelta, region: GcpRegion, queries: List[monitoring.GcpMonitoringQuery] ) -> None: start_at = builder.created_at - start try: - result = cloudwatch.AwsCloudwatchMetricData.query_for_multiple( - builder.for_region(region), start_at, builder.created_at, queries + result = monitoring.GcpMonitoringMetricData.query_for( + builder.for_region(region), queries, start_at, builder.created_at ) - cloudwatch.update_resource_metrics(lookup_map, result) + monitoring.update_resource_metrics(lookup_map, result) except Exception as e: log.warning(f"Error occurred in region {region}: {e}") - builder.submit_work("cloudwatch", collect_and_set_metrics, start, region, queries) + builder.submit_work(collect_and_set_metrics, start, region, queries) def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git 
a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index e053db5736..a47f7a146b 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -102,17 +102,18 @@ def __init__( self.error_accumulator = error_accumulator self.fallback_global_region = fallback_global_region self.config = config + self.created_at = utc() + self.last_run_started_at = last_run_started_at self.region_by_name: Dict[str, GcpRegion] = {} self.region_by_zone_name: Dict[str, GcpRegion] = {} self.zone_by_name: Dict[str, GcpZone] = {} - self.last_run_started_at = last_run_started_at self.graph_nodes_access = graph_nodes_access or Lock() self.graph_edges_access = graph_edges_access or Lock() if last_run_started_at: now = utc() - # limit the metrics to the last hour + # limit the metrics to the last 2 hours if now - last_run_started_at > timedelta(hours=2): start = now - timedelta(hours=2) else: @@ -406,6 +407,10 @@ def post_process_instance(self, builder: GraphBuilder, source: Json) -> None: """ pass + def collect_usage_metrics(self, builder: GraphBuilder) -> List: # type: ignore + # Default behavior: do nothing + return [] + @classmethod def collect_resources(cls: Type[GcpResource], builder: GraphBuilder, **kwargs: Any) -> List[GcpResource]: # Default behavior: in case the class has an ApiSpec, call the api and call collect. diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index e69de29bb2..abc8f24f2d 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -0,0 +1,210 @@ +from functools import cached_property, lru_cache +import logging +import re +from datetime import datetime, timedelta +from typing import Callable, ClassVar, Dict, List, Optional, Type, Tuple, TypeVar, Any, Union +from concurrent.futures import as_completed +from json import loads as json_loads + +from attr import define, field, frozen + +from fix_plugin_gcp.gcp_client import GcpClient +from fix_plugin_gcp.resources.base import GcpApiSpec, GcpResource, GraphBuilder, GcpRegion +from fix_plugin_gcp.utils import MetricNormalization +from fixlib.baseresources import MetricName, MetricUnit, ModelReference, BaseResource, StatName +from fixlib.durations import duration_str +from fixlib.graph import Graph +from fixlib.json import from_json, sort_json +from fixlib.json_bender import S, K, Bend, Bender, ForallBend, bend, F, SecondsFromEpochToDatetime +from fixlib.types import Json +from fixlib.utils import utc_str + +service_name = "monitoring" +log = logging.getLogger("fix.plugins.gcp") + + +@define(hash=True, frozen=True) +class GcpMonitoringQuery: + metric_name: Union[str, MetricName] # final name of the metric + query_name: str # name of the metric (e.g., GCP metric type) + resource_name: str # name of resource + period: timedelta # period of the metric + ref_id: str # reference ID for the resource (e.g., instance ID) + metric_id: str # unique metric identifier (metric_name + instance_id) + stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN + normalization: Optional[MetricNormalization] = None # normalization info + region: Optional[GcpRegion] = None + + @staticmethod + def create( + *, + query_name: str, + period: timedelta, + ref_id: str, + resource_name: str, + metric_name: Union[str, MetricName], + stat: str, + normalization: Optional[MetricNormalization] = None, + region: Optional[GcpRegion] = None, + ) -> 
"GcpMonitoringQuery": + # Metric ID generation: metric name + resource ID + metric_id = f"{metric_name}/{ref_id}" + + return GcpMonitoringQuery( + metric_name=metric_name, + query_name=query_name, + period=period, + ref_id=ref_id, + resource_name=resource_name, + metric_id=metric_id, + stat=stat, + region=region, + normalization=normalization, + ) + + +@define(eq=False, slots=False) +class GcpMonitoringMetricDataPoint: + kind: ClassVar[str] = "gcp_monitoring_metric_data_point" + mapping: ClassVar[Dict[str, Bender]] = { + "start_time": S("interval", "startTime"), + "end_time": S("interval", "endTime"), + "value": S("value", "doubleValue"), + } + start_time: Optional[datetime] = field(default=None) + end_time: Optional[datetime] = field(default=None) + value: Optional[float] = field(default=None) + + +@define(eq=False, slots=False) +class GcpMonitoringMetricData: + kind: ClassVar[str] = "gcp_monitoring_metric_data" + mapping: ClassVar[Dict[str, Bender]] = { + "id": S("metric", "type") + K("/") + S("metric", "labels", "instance_name"), + "metric_points": S("points", default=[]) >> Bend(GcpMonitoringMetricDataPoint.mapping), + "metric_kind": S("metricKind"), + "value_type": S("valueType"), + } + id: Optional[str] = field(default=None) + metric_points: List[GcpMonitoringMetricDataPoint] = field(factory=list) + metric_kind: Optional[str] = field(default=None) + value_type: Optional[str] = field(default=None) + + @classmethod + def called_collect_apis(cls) -> List[GcpApiSpec]: + api_spec = GcpApiSpec( + service="monitoring", + version="v3", + accessors=["projects", "timeSeries"], + action="list", + request_parameter={ + "name": "projects/{project}", + }, + request_parameter_in={"project"}, + response_path="timeSeries", + ) + return [api_spec] + + @staticmethod + def query_for( + builder: GraphBuilder, + queries: List[GcpMonitoringQuery], + start_time: datetime, + end_time: datetime, + ) -> "Dict[GcpMonitoringQuery, GcpMonitoringMetricData]": + lookup = {q.metric_id: q for q in queries} + result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} + futures = [] + + api_spec = GcpApiSpec( + service="monitoring", + version="v3", + accessors=["projects", "timeSeries"], + action="list", + request_parameter={ + "name": "projects/{project}", + "aggregation_crossSeriesReducer": "REDUCE_NONE", + "aggregation_groupByFields": "[]", + "interval_endTime": utc_str(end_time), + "interval_startTime": utc_str(start_time), + "view": "FULL", + # set parametes below dynamically + # "aggregation_alignmentPeriod": None, + # "aggregation_perSeriesAligner": None, + # "filter": None, + }, + request_parameter_in={"project"}, + response_path="timeSeries", + ) + + for query in queries: + api_spec.request_parameter["filter"] = ( + f"metric.type = {query.query_name} AND metric.labels.instance_name = {query.resource_name}" + ) + api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" + api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat + future = builder.submit_work( + GcpMonitoringMetricData._query_for_chunk, + builder, + api_spec, + ) + futures.append(future) + # Retrieve results from submitted queries and populate the result dictionary + for future in as_completed(futures): + try: + metric_query_result = future.result() + for metric, metric_id in metric_query_result: + if metric is not None and metric_id is not None: + result[lookup[metric_id]] = metric + except Exception as e: + log.warning(f"An error occurred while processing a metric query: {e}") + 
raise e + return result + + @staticmethod + def _query_for_chunk( + builder: GraphBuilder, + api_spec: GcpApiSpec, + ) -> "List[Tuple[GcpMonitoringMetricData, str]]": + query_result = [] + try: + part = builder.client.list(api_spec) + for single in part: + metric = from_json(bend(GcpMonitoringMetricData.mapping, single), GcpMonitoringMetricData) + if metric.id: + query_result.append((metric, metric.id)) + return query_result + except Exception as e: + raise e + + +V = TypeVar("V", bound=BaseResource) + + +def update_resource_metrics( + resources_map: Dict[str, V], + cloudwatch_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], +) -> None: + for query, metric in cloudwatch_result.items(): + resource = resources_map.get(query.ref_id) + if resource is None: + continue + if len(metric.metric_values) == 0: + continue + normalizer = query.normalization + if not normalizer: + continue + + for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values): + try: + metric_name = query.metric_name + if not metric_name: + continue + name = metric_name + "_" + normalizer.unit + value = normalizer.normalize_value(metric_value) + stat_name = maybe_stat_name or normalizer.get_stat_value(query.stat) + if stat_name: + resource._resource_usage[name][str(stat_name)] = value + except KeyError as e: + log.warning(f"An error occured while setting metric values: {e}") + raise diff --git a/plugins/gcp/fix_plugin_gcp/utils.py b/plugins/gcp/fix_plugin_gcp/utils.py index f4fa418796..f4f3567893 100644 --- a/plugins/gcp/fix_plugin_gcp/utils.py +++ b/plugins/gcp/fix_plugin_gcp/utils.py @@ -1,8 +1,9 @@ import os import socket from datetime import datetime -from typing import Iterable, List, Union, Callable, Any, Dict, Optional +from typing import Iterable, TypeVar, List, Union, Callable, Any, Dict, Optional +from attrs import frozen from google.oauth2 import service_account from googleapiclient import discovery from googleapiclient.discovery_cache.base import Cache as GoogleApiClientCache @@ -11,7 +12,7 @@ from tenacity import Retrying, stop_after_attempt, retry_if_exception_type import fixlib.logger -from fixlib.baseresources import BaseResource +from fixlib.baseresources import BaseResource, MetricName, MetricUnit, StatName from fixlib.config import Config from fixlib.core.actions import CoreFeedback from fixlib.graph import Graph @@ -19,6 +20,7 @@ log = fixlib.logger.getLogger("fix." 
+ __name__) fixlib.logger.getLogger("googleapiclient").setLevel(fixlib.logger.ERROR) +T = TypeVar("T") SCOPES = ["https://www.googleapis.com/auth/cloud-platform"] @@ -246,3 +248,19 @@ def gcp_resource(resource: BaseResource, graph: Graph = None): service = gcp_service(resource, graph) gr = getattr(service, resource._client_method) return gr() + + +def identity(x: T) -> T: + return x + + +@frozen(kw_only=True) +class MetricNormalization: + metric_name: MetricName + unit: MetricUnit + stat_map: Dict[str, StatName] = { + "minimum": StatName.min, + "average": StatName.avg, + "maximum": StatName.max, + } + normalize_value: Callable[[float], float] = identity From 387de89346cbab1314ee42abdb619667c72f843c Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 09:35:35 +0000 Subject: [PATCH 03/43] finished metric setting implementation --- .../fix_plugin_gcp/resources/monitoring.py | 101 +++++++++++------- plugins/gcp/fix_plugin_gcp/utils.py | 18 +--- 2 files changed, 63 insertions(+), 56 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index abc8f24f2d..6cc248d389 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -10,7 +10,6 @@ from fix_plugin_gcp.gcp_client import GcpClient from fix_plugin_gcp.resources.base import GcpApiSpec, GcpResource, GraphBuilder, GcpRegion -from fix_plugin_gcp.utils import MetricNormalization from fixlib.baseresources import MetricName, MetricUnit, ModelReference, BaseResource, StatName from fixlib.durations import duration_str from fixlib.graph import Graph @@ -21,6 +20,38 @@ service_name = "monitoring" log = logging.getLogger("fix.plugins.gcp") +T = TypeVar("T") + + +def identity(x: T) -> T: + return x + + +@frozen(kw_only=True) +class MetricNormalization: + unit: MetricUnit + # Use Tuple instead of Dict for stat_map because it should be immutable + stat_map: Tuple[Tuple[str, StatName], Tuple[str, StatName], Tuple[str, StatName]] = ( + ("ALIGN_MIN", StatName.min), + ("ALIGN_MEAN", StatName.avg), + ("ALIGN_MAX", StatName.max), + ) + normalize_value: Callable[[float], float] = identity + + def get_stat_value(self, key: str) -> Optional[StatName]: + """ + Get the value from stat_map based on the given key. + + Args: + key: The key to search for in the stat_map. + + Returns: + The corresponding value from stat_map. 
+ """ + for stat_key, value in self.stat_map: + if stat_key == key: + return value + return None @define(hash=True, frozen=True) @@ -32,6 +63,7 @@ class GcpMonitoringQuery: ref_id: str # reference ID for the resource (e.g., instance ID) metric_id: str # unique metric identifier (metric_name + instance_id) stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN + label_type: str normalization: Optional[MetricNormalization] = None # normalization info region: Optional[GcpRegion] = None @@ -44,6 +76,7 @@ def create( resource_name: str, metric_name: Union[str, MetricName], stat: str, + label_type: str, normalization: Optional[MetricNormalization] = None, region: Optional[GcpRegion] = None, ) -> "GcpMonitoringQuery": @@ -57,38 +90,26 @@ def create( ref_id=ref_id, resource_name=resource_name, metric_id=metric_id, + label_type=label_type, stat=stat, region=region, normalization=normalization, ) -@define(eq=False, slots=False) -class GcpMonitoringMetricDataPoint: - kind: ClassVar[str] = "gcp_monitoring_metric_data_point" - mapping: ClassVar[Dict[str, Bender]] = { - "start_time": S("interval", "startTime"), - "end_time": S("interval", "endTime"), - "value": S("value", "doubleValue"), - } - start_time: Optional[datetime] = field(default=None) - end_time: Optional[datetime] = field(default=None) - value: Optional[float] = field(default=None) - - @define(eq=False, slots=False) class GcpMonitoringMetricData: kind: ClassVar[str] = "gcp_monitoring_metric_data" mapping: ClassVar[Dict[str, Bender]] = { - "id": S("metric", "type") + K("/") + S("metric", "labels", "instance_name"), - "metric_points": S("points", default=[]) >> Bend(GcpMonitoringMetricDataPoint.mapping), + "metric_values": S("points", default=[]) >> ForallBend(S("value", "doubleValue")), "metric_kind": S("metricKind"), "value_type": S("valueType"), + "metric_type": S("metric", "type"), } - id: Optional[str] = field(default=None) - metric_points: List[GcpMonitoringMetricDataPoint] = field(factory=list) + metric_values: List[float] = field(factory=list) metric_kind: Optional[str] = field(default=None) value_type: Optional[str] = field(default=None) + metric_type: Optional[str] = field(default=None) @classmethod def called_collect_apis(cls) -> List[GcpApiSpec]: @@ -139,7 +160,7 @@ def query_for( for query in queries: api_spec.request_parameter["filter"] = ( - f"metric.type = {query.query_name} AND metric.labels.instance_name = {query.resource_name}" + f"metric.type = {query.query_name} AND metric.labels.{query.label_type} = {query.resource_name}" ) api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat @@ -147,13 +168,14 @@ def query_for( GcpMonitoringMetricData._query_for_chunk, builder, api_spec, + query, ) futures.append(future) # Retrieve results from submitted queries and populate the result dictionary for future in as_completed(futures): try: - metric_query_result = future.result() - for metric, metric_id in metric_query_result: + metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = future.result() + for metric_id, metric in metric_query_result: if metric is not None and metric_id is not None: result[lookup[metric_id]] = metric except Exception as e: @@ -165,14 +187,14 @@ def query_for( def _query_for_chunk( builder: GraphBuilder, api_spec: GcpApiSpec, - ) -> "List[Tuple[GcpMonitoringMetricData, str]]": + query: GcpMonitoringQuery, + ) -> "List[Tuple[str, GcpMonitoringMetricData]]": query_result = 
[] try: part = builder.client.list(api_spec) for single in part: metric = from_json(bend(GcpMonitoringMetricData.mapping, single), GcpMonitoringMetricData) - if metric.id: - query_result.append((metric, metric.id)) + query_result.append((query.metric_id, metric)) return query_result except Exception as e: raise e @@ -183,9 +205,9 @@ def _query_for_chunk( def update_resource_metrics( resources_map: Dict[str, V], - cloudwatch_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], + monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], ) -> None: - for query, metric in cloudwatch_result.items(): + for query, metric in monitoring_metric_result.items(): resource = resources_map.get(query.ref_id) if resource is None: continue @@ -195,16 +217,17 @@ def update_resource_metrics( if not normalizer: continue - for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values): - try: - metric_name = query.metric_name - if not metric_name: - continue - name = metric_name + "_" + normalizer.unit - value = normalizer.normalize_value(metric_value) - stat_name = maybe_stat_name or normalizer.get_stat_value(query.stat) - if stat_name: - resource._resource_usage[name][str(stat_name)] = value - except KeyError as e: - log.warning(f"An error occured while setting metric values: {e}") - raise + most_recent_value = metric.metric_values[0] + + try: + metric_name = query.metric_name + if not metric_name: + continue + name = metric_name + "_" + normalizer.unit + value = normalizer.normalize_value(most_recent_value) + stat_name = normalizer.get_stat_value(query.stat) + if stat_name: + resource._resource_usage[name][str(stat_name)] = value + except KeyError as e: + log.warning(f"An error occured while setting metric values: {e}") + raise diff --git a/plugins/gcp/fix_plugin_gcp/utils.py b/plugins/gcp/fix_plugin_gcp/utils.py index f4f3567893..6def26401a 100644 --- a/plugins/gcp/fix_plugin_gcp/utils.py +++ b/plugins/gcp/fix_plugin_gcp/utils.py @@ -1,7 +1,7 @@ import os import socket from datetime import datetime -from typing import Iterable, TypeVar, List, Union, Callable, Any, Dict, Optional +from typing import Iterable, Tuple, TypeVar, List, Union, Callable, Any, Dict, Optional from attrs import frozen from google.oauth2 import service_account @@ -248,19 +248,3 @@ def gcp_resource(resource: BaseResource, graph: Graph = None): service = gcp_service(resource, graph) gr = getattr(service, resource._client_method) return gr() - - -def identity(x: T) -> T: - return x - - -@frozen(kw_only=True) -class MetricNormalization: - metric_name: MetricName - unit: MetricUnit - stat_map: Dict[str, StatName] = { - "minimum": StatName.min, - "average": StatName.avg, - "maximum": StatName.max, - } - normalize_value: Callable[[float], float] = identity From bb95a8ffe5faf04583ccc68ecffe67d0dd958d0e Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 12:58:30 +0000 Subject: [PATCH 04/43] feat: added metric collection to the vm instances --- fixlib/fixlib/baseresources.py | 3 +- .../gcp/fix_plugin_gcp/resources/compute.py | 82 +++++++++++++++++++ .../fix_plugin_gcp/resources/monitoring.py | 75 +++++++++++++++-- 3 files changed, 152 insertions(+), 8 deletions(-) diff --git a/fixlib/fixlib/baseresources.py b/fixlib/fixlib/baseresources.py index 0032907e1d..d22f707862 100644 --- a/fixlib/fixlib/baseresources.py +++ b/fixlib/fixlib/baseresources.py @@ -116,8 +116,7 @@ def get(self) -> Dict[str, Any]: return changes -# todo: replace to StrEnum once resoto is on 3.11 -class MetricName(str, 
Enum): +class MetricName(StrEnum): def __str__(self) -> str: return self.value diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 5f5cd4ae3b..6972289d7d 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -9,6 +9,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec, InternalZoneProp from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder from fix_plugin_gcp.resources.billing import GcpSku +from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, STAT_LIST, normalizer_factory from fixlib.baseresources import ( BaseAutoScalingGroup, BaseBucket, @@ -24,6 +25,7 @@ BaseSubnet, BaseTunnel, BaseVolumeType, + MetricName, ModelReference, BaseVolume, VolumeStatus, @@ -3553,6 +3555,86 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None: self, reverse=True, delete_same_as_default=True, clazz=GcpSubnetwork, link=nic.subnetwork ) + def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: + queries: List[GcpMonitoringQuery] = [] + queries = [] + delta = builder.metrics_delta + queries.extend( + [ + GcpMonitoringQuery.create( + query_name="compute.googleapis.com/instance/cpu/utilization", + period=delta, + ref_id=self.id, + resource_name=self.id, + metric_name=MetricName.CpuUtilization, + normalization=normalizer_factory.percent, + stat=stat, + label_name="instance_name", + ) + for stat in STAT_LIST + ] + ) + queries.extend( + [ + GcpMonitoringQuery.create( + query_name=name, + period=delta, + ref_id=self.id, + resource_name=self.id, + metric_name=metric_name, + normalization=normalizer_factory.count, + stat=stat, + label_name="instance_name", + ) + for stat in STAT_LIST + for name, metric_name in [ + ("compute.googleapis.com/instance/network/received_bytes_count ", MetricName.NetworkIn), + ("compute.googleapis.com/instance/network/sent_bytes_count", MetricName.NetworkOut), + ] + ] + ) + + queries.extend( + [ + GcpMonitoringQuery.create( + query_name=name, + period=delta, + ref_id=self.id, + resource_name=self.id, + metric_name=metric_name, + normalization=normalizer_factory.count, + stat=stat, + label_name="instance_name", + ) + for stat in STAT_LIST + for name, metric_name in [ + ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead), + ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite), + ] + ] + ) + + queries.extend( + [ + GcpMonitoringQuery.create( + query_name=name, + period=delta, + ref_id=self.id, + resource_name=self.id, + metric_name=metric_name, + normalization=normalizer_factory.count, + stat=stat, + label_name="instance_name", + ) + for stat in STAT_LIST + for name, metric_name in [ + ("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead), + ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite), + ] + ] + ) + return queries + @classmethod def collect(cls: Type[GcpResource], raw: List[Json], builder: GraphBuilder) -> List[GcpResource]: # Additional behavior: iterate over list of collected GcpInstances and for each: diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 6cc248d389..ac44417914 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -23,6 +23,9 @@ T = TypeVar("T") +STAT_LIST: List[str] = ["ALIGN_MIN", "ALIGN_MEAN", 
"ALIGN_MAX"] + + def identity(x: T) -> T: return x @@ -63,7 +66,7 @@ class GcpMonitoringQuery: ref_id: str # reference ID for the resource (e.g., instance ID) metric_id: str # unique metric identifier (metric_name + instance_id) stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN - label_type: str + label_name: str normalization: Optional[MetricNormalization] = None # normalization info region: Optional[GcpRegion] = None @@ -76,12 +79,12 @@ def create( resource_name: str, metric_name: Union[str, MetricName], stat: str, - label_type: str, + label_name: str, normalization: Optional[MetricNormalization] = None, region: Optional[GcpRegion] = None, ) -> "GcpMonitoringQuery": - # Metric ID generation: metric name + resource ID - metric_id = f"{metric_name}/{ref_id}" + # Metric ID generation: metric query name + resource ID + metric_id = f"{query_name}/{ref_id}" return GcpMonitoringQuery( metric_name=metric_name, @@ -90,7 +93,7 @@ def create( ref_id=ref_id, resource_name=resource_name, metric_id=metric_id, - label_type=label_type, + label_name=label_name, stat=stat, region=region, normalization=normalization, @@ -133,6 +136,12 @@ def query_for( start_time: datetime, end_time: datetime, ) -> "Dict[GcpMonitoringQuery, GcpMonitoringMetricData]": + if builder.region: + log.info( + f"[{builder.region.safe_name}|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics." + ) + else: + log.info(f"[global|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics.") lookup = {q.metric_id: q for q in queries} result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} futures = [] @@ -160,7 +169,7 @@ def query_for( for query in queries: api_spec.request_parameter["filter"] = ( - f"metric.type = {query.query_name} AND metric.labels.{query.label_type} = {query.resource_name}" + f"metric.type = {query.query_name} AND metric.labels.{query.label_name} = {query.resource_name}" ) api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat @@ -231,3 +240,57 @@ def update_resource_metrics( except KeyError as e: log.warning(f"An error occured while setting metric values: {e}") raise + + +class NormalizerFactory: + @cached_property + def count(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Count, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def bytes(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Bytes, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def bytes_per_second(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.BytesPerSecond, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def iops(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.IOPS, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def seconds(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Seconds, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def milliseconds(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Milliseconds, + normalize_value=lambda x: round(x, ndigits=4), + ) + + @cached_property + def percent(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Percent, + normalize_value=lambda x: round(x, ndigits=4), + ) + + 
+normalizer_factory = NormalizerFactory() From d4f5eccb755e5cf81b522cbfd5713d398d09a9fd Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 14:34:25 +0000 Subject: [PATCH 05/43] feat: avoid race conditions --- .../fix_plugin_azure/resource/metrics.py | 6 +++-- plugins/gcp/fix_plugin_gcp/resources/base.py | 6 +++-- .../fix_plugin_gcp/resources/monitoring.py | 22 +++++++++++-------- plugins/gcp/test/conftest.py | 10 ++++++++- plugins/gcp/test/test_collector.py | 5 ++++- 5 files changed, 34 insertions(+), 15 deletions(-) diff --git a/plugins/azure/fix_plugin_azure/resource/metrics.py b/plugins/azure/fix_plugin_azure/resource/metrics.py index c476dd26a0..e22b243f28 100644 --- a/plugins/azure/fix_plugin_azure/resource/metrics.py +++ b/plugins/azure/fix_plugin_azure/resource/metrics.py @@ -1,3 +1,4 @@ +from copy import deepcopy from datetime import datetime, timedelta from concurrent.futures import as_completed import logging @@ -271,12 +272,13 @@ def _query_for_single( interval: str, ) -> "Tuple[Optional[AzureMetricData], Optional[str]]": try: + local_api_spec = deepcopy(api_spec) # Set the path for the API call based on the instance ID of the query - api_spec.path = f"{query.instance_id}/providers/Microsoft.Insights/metrics" + local_api_spec.path = f"{query.instance_id}/providers/Microsoft.Insights/metrics" # Retrieve metric data from the API aggregation = ",".join(query.aggregation) part = builder.client.list( - api_spec, + local_api_spec, metricnames=query.metric_name, metricNamespace=query.metric_namespace, timespan=timespan, diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index a47f7a146b..670e221296 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -308,7 +308,9 @@ def for_region(self, region: GcpRegion) -> GraphBuilder: self.core_feedback, self.error_accumulator, self.fallback_global_region, + self.config, region, + self.last_run_started_at, self.graph_nodes_access, self.graph_edges_access, ) @@ -415,9 +417,9 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List: # type: ignore def collect_resources(cls: Type[GcpResource], builder: GraphBuilder, **kwargs: Any) -> List[GcpResource]: # Default behavior: in case the class has an ApiSpec, call the api and call collect. 
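The `deepcopy` calls patch 05 introduces exist because every worker used to mutate one shared spec object before issuing its request. A minimal, self-contained illustration of that hazard and the per-task-copy fix (toy names, not the plugin's real classes):

    from copy import deepcopy
    from concurrent.futures import ThreadPoolExecutor

    shared_spec = {"filter": None}

    def run_query(metric: str) -> str:
        local = deepcopy(shared_spec)  # per-task copy; writing to shared_spec
        local["filter"] = metric       # directly would let concurrent tasks
        return local["filter"]         # overwrite each other's filter mid-flight

    with ThreadPoolExecutor(max_workers=4) as pool:
        assert sorted(pool.map(run_query, ["a", "b", "c"])) == ["a", "b", "c"]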
if kwargs: - log.info(f"[GCP:{builder.project.id}] Collecting {cls.kind} with ({kwargs})") + log.info(f"[GCP:{builder.project.id}] collecting {cls.kind} with ({kwargs})") else: - log.info(f"[GCP:{builder.project.id}] Collecting {cls.kind}") + log.info(f"[GCP:{builder.project.id}] collecting {cls.kind}") if spec := cls.api_spec: expected_errors = GcpExpectedErrorCodes | (spec.expected_errors or set()) with GcpErrorHandler( diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index ac44417914..755fbb8c62 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,3 +1,4 @@ +from copy import deepcopy from functools import cached_property, lru_cache import logging import re @@ -104,12 +105,13 @@ def create( class GcpMonitoringMetricData: kind: ClassVar[str] = "gcp_monitoring_metric_data" mapping: ClassVar[Dict[str, Bender]] = { - "metric_values": S("points", default=[]) >> ForallBend(S("value", "doubleValue")), + "metric_values": S("points") + >> ForallBend(S("value", "doubleValue").or_else(S("value", "int64Value", default=0.0))), "metric_kind": S("metricKind"), "value_type": S("valueType"), "metric_type": S("metric", "type"), } - metric_values: List[float] = field(factory=list) + metric_values: Optional[List[float]] = field(factory=list) metric_kind: Optional[str] = field(default=None) value_type: Optional[str] = field(default=None) metric_type: Optional[str] = field(default=None) @@ -168,11 +170,6 @@ def query_for( ) for query in queries: - api_spec.request_parameter["filter"] = ( - f"metric.type = {query.query_name} AND metric.labels.{query.label_name} = {query.resource_name}" - ) - api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" - api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat future = builder.submit_work( GcpMonitoringMetricData._query_for_chunk, builder, @@ -199,8 +196,15 @@ def _query_for_chunk( query: GcpMonitoringQuery, ) -> "List[Tuple[str, GcpMonitoringMetricData]]": query_result = [] + local_api_spec = deepcopy(api_spec) + local_api_spec.request_parameters["filter"] = ( + f'metric.type = "{query.query_name}" AND metric.labels.{query.label_name} = "{query.resource_name}"' + ) + local_api_spec.request_parameters["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" + local_api_spec.request_parameters["aggregation_perSeriesAligner"] = query.stat + try: - part = builder.client.list(api_spec) + part = builder.client.list(local_api_spec) for single in part: metric = from_json(bend(GcpMonitoringMetricData.mapping, single), GcpMonitoringMetricData) query_result.append((query.metric_id, metric)) @@ -220,7 +224,7 @@ def update_resource_metrics( resource = resources_map.get(query.ref_id) if resource is None: continue - if len(metric.metric_values) == 0: + if not metric.metric_values or len(metric.metric_values) == 0: continue normalizer = query.normalization if not normalizer: diff --git a/plugins/gcp/test/conftest.py b/plugins/gcp/test/conftest.py index ec6b291b83..050ec77963 100644 --- a/plugins/gcp/test/conftest.py +++ b/plugins/gcp/test/conftest.py @@ -32,7 +32,15 @@ def random_builder() -> Iterator[GraphBuilder]: project_global_region = GcpRegion.fallback_global_region(project) credentials = AnonymousCredentials() # type: ignore builder = GraphBuilder( - Graph(), Cloud(id="gcp"), project, credentials, queue, feedback, accumulator, project_global_region + Graph(), + 
Cloud(id="gcp"), + project, + credentials, + queue, + feedback, + accumulator, + project_global_region, + GcpConfig(), ) builder.add_node(project_global_region, {}) # add predefined regions and zones diff --git a/plugins/gcp/test/test_collector.py b/plugins/gcp/test/test_collector.py index 2f6524a96e..3dfb5e5c80 100644 --- a/plugins/gcp/test/test_collector.py +++ b/plugins/gcp/test/test_collector.py @@ -24,6 +24,7 @@ def collector_with_graph(graph: Graph) -> GcpProjectCollector: cloud=Cloud(id="gcp"), project=GcpProject(id="test"), core_feedback=CoreFeedback("test", "test", "test", Queue()), + task_data={}, ) collector.graph = graph return collector @@ -32,7 +33,9 @@ def collector_with_graph(graph: Graph) -> GcpProjectCollector: def test_project_collection(random_builder: GraphBuilder) -> None: # create the collector from the builder values config: GcpConfig = current_config().gcp - project = GcpProjectCollector(config, random_builder.cloud, random_builder.project, random_builder.core_feedback) + project = GcpProjectCollector( + config, random_builder.cloud, random_builder.project, random_builder.core_feedback, {} + ) # use the graph provided by the random builder - it already has regions and zones # the random builder will not create new regions or zones during the test project.graph = random_builder.graph From 828ee2400b9bbe2ed46cede58cdc988a7dd817b7 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 16:06:05 +0000 Subject: [PATCH 06/43] feat: finish implementing metrics collection --- plugins/aws/fix_plugin_aws/collector.py | 3 ++- plugins/gcp/fix_plugin_gcp/collector.py | 3 ++- .../fix_plugin_gcp/resources/aiplatform.py | 1 - plugins/gcp/fix_plugin_gcp/resources/base.py | 4 --- .../gcp/fix_plugin_gcp/resources/compute.py | 4 ++- .../fix_plugin_gcp/resources/monitoring.py | 27 ++++++++----------- 6 files changed, 18 insertions(+), 24 deletions(-) diff --git a/plugins/aws/fix_plugin_aws/collector.py b/plugins/aws/fix_plugin_aws/collector.py index 01c27acd92..5b792c009c 100644 --- a/plugins/aws/fix_plugin_aws/collector.py +++ b/plugins/aws/fix_plugin_aws/collector.py @@ -334,8 +334,9 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: continue # region can be overridden in the query: s3 is global, but need to be queried per region if region := cast(AwsRegion, resource.region()): - lookup_map[resource.id] = resource resource_queries: List[cloudwatch.AwsCloudwatchQuery] = resource.collect_usage_metrics(builder) + if resource_queries: + lookup_map[resource.id] = resource for query in resource_queries: query_region = query.region or region start = query.start_delta or builder.metrics_delta diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 5b1c81e809..476c2b982b 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -170,8 +170,9 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: continue # region can be overridden in the query: s3 is global, but need to be queried per region if region := cast(GcpRegion, resource.region()): - lookup_map[resource.id] = resource resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) + if resource_queries: + lookup_map[resource.id] = resource for query in resource_queries: query_region = query.region or region start = builder.metrics_delta diff --git a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py index e9577c3425..e9c16e052f 
100644 --- a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py +++ b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py @@ -36,7 +36,6 @@ def collect_resources(cls, builder: GraphBuilder, **kwargs: Any) -> List[GcpReso # Default behavior: in case the class has an ApiSpec, call the api and call collect. if issubclass(cls, GcpResource): region_name = "global" if not builder.region else builder.region.safe_name - log.info(f"[GCP:{builder.project.id}:{region_name}] Collecting {cls.kind}") if spec := cls.api_spec: expected_errors = GcpExpectedErrorCodes | (spec.expected_errors or set()) | {"HttpError:none:none"} with GcpErrorHandler( diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index 670e221296..125abff939 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -416,10 +416,6 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List: # type: ignore @classmethod def collect_resources(cls: Type[GcpResource], builder: GraphBuilder, **kwargs: Any) -> List[GcpResource]: # Default behavior: in case the class has an ApiSpec, call the api and call collect. - if kwargs: - log.info(f"[GCP:{builder.project.id}] collecting {cls.kind} with ({kwargs})") - else: - log.info(f"[GCP:{builder.project.id}] collecting {cls.kind}") if spec := cls.api_spec: expected_errors = GcpExpectedErrorCodes | (spec.expected_errors or set()) with GcpErrorHandler( diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 6972289d7d..d5df20cd8d 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -3556,6 +3556,8 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None: ) def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: + if self.instance_status != InstanceStatus.RUNNING: + return [] queries: List[GcpMonitoringQuery] = [] queries = [] delta = builder.metrics_delta @@ -3588,7 +3590,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer ) for stat in STAT_LIST for name, metric_name in [ - ("compute.googleapis.com/instance/network/received_bytes_count ", MetricName.NetworkIn), + ("compute.googleapis.com/instance/network/received_bytes_count", MetricName.NetworkIn), ("compute.googleapis.com/instance/network/sent_bytes_count", MetricName.NetworkOut), ] ] diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 755fbb8c62..48d45dab72 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,22 +1,17 @@ from copy import deepcopy -from functools import cached_property, lru_cache +from functools import cached_property import logging -import re from datetime import datetime, timedelta -from typing import Callable, ClassVar, Dict, List, Optional, Type, Tuple, TypeVar, Any, Union +from typing import Callable, ClassVar, Dict, List, Optional, Tuple, TypeVar, Union from concurrent.futures import as_completed -from json import loads as json_loads from attr import define, field, frozen -from fix_plugin_gcp.gcp_client import GcpClient -from fix_plugin_gcp.resources.base import GcpApiSpec, GcpResource, GraphBuilder, GcpRegion -from fixlib.baseresources import MetricName, MetricUnit, ModelReference, BaseResource, StatName +from fix_plugin_gcp.resources.base import GcpApiSpec, GraphBuilder, 
GcpRegion +from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName from fixlib.durations import duration_str -from fixlib.graph import Graph -from fixlib.json import from_json, sort_json -from fixlib.json_bender import S, K, Bend, Bender, ForallBend, bend, F, SecondsFromEpochToDatetime -from fixlib.types import Json +from fixlib.json import from_json +from fixlib.json_bender import S, Bender, ForallBend, bend from fixlib.utils import utc_str service_name = "monitoring" @@ -84,8 +79,8 @@ def create( normalization: Optional[MetricNormalization] = None, region: Optional[GcpRegion] = None, ) -> "GcpMonitoringQuery": - # Metric ID generation: metric query name + resource ID - metric_id = f"{query_name}/{ref_id}" + # Metric ID generation: metric query name + resource ID + stat + metric_id = f"{query_name}/{ref_id}/{stat}" return GcpMonitoringQuery( metric_name=metric_name, @@ -197,11 +192,11 @@ def _query_for_chunk( ) -> "List[Tuple[str, GcpMonitoringMetricData]]": query_result = [] local_api_spec = deepcopy(api_spec) - local_api_spec.request_parameters["filter"] = ( + local_api_spec.request_parameter["filter"] = ( f'metric.type = "{query.query_name}" AND metric.labels.{query.label_name} = "{query.resource_name}"' ) - local_api_spec.request_parameters["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" - local_api_spec.request_parameters["aggregation_perSeriesAligner"] = query.stat + local_api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" + local_api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat try: part = builder.client.list(local_api_spec) From 17ebd5ee9acfb677a70ce91f8e3b75903175de16 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 17:29:25 +0000 Subject: [PATCH 07/43] feat: added tests --- plugins/gcp/test/test_compute.py | 48 +++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py index d36beb554d..279777e88f 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -1,9 +1,18 @@ +from concurrent.futures import ThreadPoolExecutor +from datetime import timedelta import json import os + +from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion from fix_plugin_gcp.resources.compute import * from fix_plugin_gcp.resources.billing import GcpSku +from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, GcpMonitoringMetricData, update_resource_metrics +from fixlib.threading import ExecutorQueue +from fixlib.baseresources import InstanceStatus + +from google.auth.credentials import AnonymousCredentials + from .random_client import roundtrip, connect_resource, FixturedClient -from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion def test_gcp_accelerator_type(random_builder: GraphBuilder) -> None: @@ -168,6 +177,43 @@ def test_gcp_instance_custom_machine_type(random_builder: GraphBuilder) -> None: assert only_machine_type._region +def test_gcp_instance_usage_metrics(random_builder: GraphBuilder) -> None: + gcp_instance = roundtrip(GcpInstance, random_builder) + gcp_instance.instance_status = InstanceStatus.RUNNING + + random_builder.region = GcpRegion(id="us-east1", name="us-east1") + queries = gcp_instance.collect_usage_metrics(random_builder) + lookup_map = {} + lookup_map[gcp_instance.id] = gcp_instance + + # simulates the `collect_usage_metrics` method found in `GcpAccountCollector`. 
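For one CPU query, the quoted filter and aggregation parameters introduced in patch 06 render to values like these (the instance name here is illustrative; in the plugin the resource id is interpolated):

    filter_expr = (
        'metric.type = "compute.googleapis.com/instance/cpu/utilization" '
        'AND metric.labels.instance_name = "my-instance"'
    )
    params = {
        "filter": filter_expr,
        "aggregation_alignmentPeriod": "3600s",  # int(period.total_seconds())
        "aggregation_perSeriesAligner": "ALIGN_MEAN",
    }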
+ def collect_and_set_metrics(start_at: datetime, region: GcpRegion, queries: List[GcpMonitoringQuery]) -> None: + with ThreadPoolExecutor(max_workers=1) as executor: + queue = ExecutorQueue(executor, tasks_per_key=lambda _: 1, name="test") + g_builder = GraphBuilder( + random_builder.graph, + random_builder.cloud, + random_builder.project, + AnonymousCredentials(), + queue, + random_builder.core_feedback, + random_builder.error_accumulator, + GcpRegion(id="global", name="global"), + random_builder.config, + last_run_started_at=random_builder.last_run_started_at, + ) + result = GcpMonitoringMetricData.query_for(g_builder, queries, start_at, start_at + timedelta(hours=2)) + update_resource_metrics(lookup_map, result) + + start = datetime(2020, 5, 30, 15, 45, 30) + + collect_and_set_metrics(start, GcpRegion(id="us-east-1", name="us-east-1"), queries) + + assert gcp_instance._resource_usage["cpu_utilization_percent"]["avg"] > 0.0 + assert gcp_instance._resource_usage["network_in_count"]["avg"] > 0.0 + assert gcp_instance._resource_usage["disk_read_count"]["avg"] > 0.0 + + def test_machine_type_ondemand_cost(random_builder: GraphBuilder) -> None: # Cross-checking with pricing calculated on https://gcpinstances.doit-intl.com/ known_prices_linux_ondemand_hourly = [ From 19d7cf6e5d8374b17eaf208681e724723af853d2 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 20 Nov 2024 17:31:32 +0000 Subject: [PATCH 08/43] chore: removed comment --- plugins/gcp/fix_plugin_gcp/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 476c2b982b..4474efe7f1 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -168,7 +168,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: for resource in builder.graph.nodes: if not isinstance(resource, GcpResource): continue - # region can be overridden in the query: s3 is global, but need to be queried per region + # region can be overridden in the query if region := cast(GcpRegion, resource.region()): resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) if resource_queries: From 17996886a48bb2f9cf4797dbaf4745857c10f9ee Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 21 Nov 2024 10:08:39 +0000 Subject: [PATCH 09/43] feat: fix tests --- plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 3 ++- plugins/gcp/fix_plugin_gcp/utils.py | 5 ++--- plugins/gcp/test/test_compute.py | 5 +++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 48d45dab72..0120400481 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -7,7 +7,8 @@ from attr import define, field, frozen -from fix_plugin_gcp.resources.base import GcpApiSpec, GraphBuilder, GcpRegion +from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion +from fix_plugin_gcp.gcp_client import GcpApiSpec from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName from fixlib.durations import duration_str from fixlib.json import from_json diff --git a/plugins/gcp/fix_plugin_gcp/utils.py b/plugins/gcp/fix_plugin_gcp/utils.py index 6def26401a..9f9c6063f8 100644 --- a/plugins/gcp/fix_plugin_gcp/utils.py +++ b/plugins/gcp/fix_plugin_gcp/utils.py @@ -1,9 +1,8 @@ import os import socket from datetime import datetime 
From 17996886a48bb2f9cf4797dbaf4745857c10f9ee Mon Sep 17 00:00:00 2001
From: Kirill
Date: Thu, 21 Nov 2024 10:08:39 +0000
Subject: [PATCH 09/43] feat: fix tests

---
 plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 3 ++-
 plugins/gcp/fix_plugin_gcp/utils.py                | 5 ++---
 plugins/gcp/test/test_compute.py                   | 5 +++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index 48d45dab72..0120400481 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -7,7 +7,8 @@

 from attr import define, field, frozen

-from fix_plugin_gcp.resources.base import GcpApiSpec, GraphBuilder, GcpRegion
+from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion
+from fix_plugin_gcp.gcp_client import GcpApiSpec
 from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName
 from fixlib.durations import duration_str
 from fixlib.json import from_json
diff --git a/plugins/gcp/fix_plugin_gcp/utils.py b/plugins/gcp/fix_plugin_gcp/utils.py
index 6def26401a..9f9c6063f8 100644
--- a/plugins/gcp/fix_plugin_gcp/utils.py
+++ b/plugins/gcp/fix_plugin_gcp/utils.py
@@ -1,9 +1,8 @@
 import os
 import socket
 from datetime import datetime
-from typing import Iterable, Tuple, TypeVar, List, Union, Callable, Any, Dict, Optional
+from typing import Iterable, TypeVar, List, Union, Callable, Any, Dict, Optional

-from attrs import frozen
 from google.oauth2 import service_account
 from googleapiclient import discovery
 from googleapiclient.discovery_cache.base import Cache as GoogleApiClientCache
@@ -12,7 +11,7 @@
 from tenacity import Retrying, stop_after_attempt, retry_if_exception_type

 import fixlib.logger
-from fixlib.baseresources import BaseResource, MetricName, MetricUnit, StatName
+from fixlib.baseresources import BaseResource
 from fixlib.config import Config
 from fixlib.core.actions import CoreFeedback
 from fixlib.graph import Graph
diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py
index 279777e88f..1da0b78367 100644
--- a/plugins/gcp/test/test_compute.py
+++ b/plugins/gcp/test/test_compute.py
@@ -1,7 +1,8 @@
 from concurrent.futures import ThreadPoolExecutor
-from datetime import timedelta
+from datetime import timedelta, datetime
 import json
 import os
+from typing import List

 from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion
 from fix_plugin_gcp.resources.compute import *
@@ -194,7 +195,7 @@ def collect_and_set_metrics(start_at: datetime, region: GcpRegion, queries: List
                 random_builder.graph,
                 random_builder.cloud,
                 random_builder.project,
-                AnonymousCredentials(),
+                AnonymousCredentials(),  # type: ignore
                 queue,
                 random_builder.core_feedback,
                 random_builder.error_accumulator,

From f2f3c927353c1d03cebca42406453988b1dadba6 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Thu, 21 Nov 2024 10:20:29 +0000
Subject: [PATCH 10/43] fix syntax test

---
 plugins/gcp/fix_plugin_gcp/resources/aiplatform.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
index d3b1d87c3e..924d86a5f0 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
@@ -50,7 +50,6 @@ class AIPlatformRegionFilter:
     def collect_resources(cls, builder: GraphBuilder, **kwargs: Any) -> List[GcpResource]:
         # Default behavior: in case the class has an ApiSpec, call the api and call collect.
         if issubclass(cls, GcpResource):
-            region_name = "global" if not builder.region else builder.region.safe_name
             if spec := cls.api_spec:
                 expected_errors = GcpExpectedErrorCodes | (spec.expected_errors or set()) | {"HttpError:none:none"}
                 with GcpErrorHandler(
From 46b7259edac96926c14fb60b6242d3ef64cd55a2 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Thu, 21 Nov 2024 11:40:45 +0000
Subject: [PATCH 11/43] feat: added new test

---
 plugins/gcp/test/test_monitoring.py | 33 +++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 plugins/gcp/test/test_monitoring.py

diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py
new file mode 100644
index 0000000000..ee92c3eaaa
--- /dev/null
+++ b/plugins/gcp/test/test_monitoring.py
@@ -0,0 +1,33 @@
+from datetime import timedelta, datetime, timezone
+
+from fix_plugin_gcp.resources.base import GraphBuilder
+from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, GcpMonitoringMetricData, normalizer_factory
+from fixlib.baseresources import MetricName
+
+
+def test_metric(random_builder: GraphBuilder) -> None:
+    now = datetime(2020, 3, 1, tzinfo=timezone.utc)
+    earlier = now - timedelta(days=60)
+    read = GcpMonitoringQuery.create(
+        query_name="compute.googleapis.com/instance/disk/read_ops_count",
+        period=timedelta(hours=1),
+        ref_id="random_instance",
+        resource_name="random_instance",
+        metric_name=MetricName.DiskRead,
+        normalization=normalizer_factory.count,
+        stat="ALIGN_MIN",
+        label_name="instance_name",
+    )
+    write = GcpMonitoringQuery.create(
+        query_name="compute.googleapis.com/instance/disk/write_ops_count",
+        period=timedelta(hours=1),
+        ref_id="random_instance",
+        resource_name="random_instance",
+        metric_name=MetricName.DiskWrite,
+        normalization=normalizer_factory.count,
+        stat="ALIGN_MIN",
+        label_name="instance_name",
+    )
+    result = GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now)
+    assert all(value > 0 for value in result[read].metric_values), "Not all values are greater than 0 for 'read'."
+    assert all(value > 0 for value in result[write].metric_values), "Not all values are greater than 0 for 'write'."
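With the label scheme in place at this point of the series, the `read` query above should expand to a filter like

    metric.type = "compute.googleapis.com/instance/disk/read_ops_count" AND metric.labels.instance_name = "random_instance"

queried with aggregation_alignmentPeriod=3600s and aggregation_perSeriesAligner=ALIGN_MIN; the `write` query differs only in the metric type.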
From 48c713138b687c21475bf592b279d79389a35f8f Mon Sep 17 00:00:00 2001
From: Kirill
Date: Thu, 21 Nov 2024 14:09:16 +0000
Subject: [PATCH 12/43] added region to the logging

---
 plugins/gcp/fix_plugin_gcp/resources/aiplatform.py | 4 +++-
 plugins/gcp/fix_plugin_gcp/resources/base.py       | 4 +++-
 plugins/gcp/fix_plugin_gcp/resources/filestore.py  | 4 +++-
 plugins/gcp/fix_plugin_gcp/resources/firestore.py  | 4 +++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
index 924d86a5f0..c8cd57dd4a 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/aiplatform.py
@@ -64,7 +64,9 @@ def collect_resources(cls, builder: GraphBuilder, **kwargs: Any) -> List[GcpReso
                     if builder.region:
                         items = builder.client.list(spec, **kwargs)
                         collected_resources = cls.collect(items, builder)
-                        log.info(f"[GCP:{builder.project.id}] finished collecting: {cls.kind}")
+                        log.info(
+                            f"[GCP:{builder.project.id}:{builder.region.safe_name}] finished collecting: {cls.kind}"
+                        )
                         return collected_resources
         return []

diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py
index 125abff939..005d7c9bd7 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/base.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/base.py
@@ -428,7 +428,9 @@ def collect_resources(cls: Type[GcpResource], builder: GraphBuilder, **kwargs: A
         ):
             items = builder.client.list(spec, **kwargs)
             resources = cls.collect(items, builder)
-            log.info(f"[GCP:{builder.project.id}] finished collecting: {cls.kind}")
+            log.info(
+                f"[GCP:{builder.project.id}:{builder.region.safe_name if builder.region else 'global'}] finished collecting: {cls.kind}"
+            )
             return resources
         return []

diff --git a/plugins/gcp/fix_plugin_gcp/resources/filestore.py b/plugins/gcp/fix_plugin_gcp/resources/filestore.py
index 7f7714fa15..309e037bc5 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/filestore.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/filestore.py
@@ -289,7 +289,9 @@ def collect_snapshots() -> None:
             snapshots = GcpFilestoreInstanceSnapshot.collect(items, graph_builder)
             for snapshot in snapshots:
                 graph_builder.add_edge(self, node=snapshot)
-            log.info(f"[GCP:{graph_builder.project.id}] finished collecting: {GcpFilestoreInstanceSnapshot.kind}")
+            log.info(
+                f"[GCP:{graph_builder.project.id}:{graph_builder.region.safe_name if graph_builder.region else 'global'}] finished collecting: {GcpFilestoreInstanceSnapshot.kind}"
+            )

         graph_builder.submit_work(collect_snapshots)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/firestore.py b/plugins/gcp/fix_plugin_gcp/resources/firestore.py
index 5909408ce2..a9c3ac916c 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/firestore.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/firestore.py
@@ -148,7 +148,9 @@ def collect_documents() -> None:
             documents = GcpFirestoreDocument.collect(items, graph_builder)
             for document in documents:
                 graph_builder.add_edge(self, node=document)
-            log.info(f"[GCP:{graph_builder.project.id}] finished collecting: {GcpFirestoreDocument.kind}")
+            log.info(
+                f"[GCP:{graph_builder.project.id}:{graph_builder.region.safe_name if graph_builder.region else 'global'}] finished collecting: {GcpFirestoreDocument.kind}"
+            )

         graph_builder.submit_work(collect_documents)

From aecc81636a6cb5796ac3a4fe7137276223201b42 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Thu, 21 Nov 2024 14:19:42 +0000
Subject: [PATCH 13/43] fix mypy

---
 plugins/gcp/test/test_monitoring.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py
index ee92c3eaaa..86513cbc31 100644
--- a/plugins/gcp/test/test_monitoring.py
+++ b/plugins/gcp/test/test_monitoring.py
@@ -29,5 +29,5 @@ def test_metric(random_builder: GraphBuilder) -> None:
         label_name="instance_name",
     )
     result = GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now)
-    assert all(value > 0 for value in result[read].metric_values), "Not all values are greater than 0 for 'read'."
-    assert all(value > 0 for value in result[write].metric_values), "Not all values are greater than 0 for 'write'."
+    assert all(value > 0 for value in result[read].metric_values or [])
+    assert all(value > 0 for value in result[write].metric_values or [])
From 7179a4e5929e68555ba3c19c3800800fcf47236b Mon Sep 17 00:00:00 2001
From: Kirill
Date: Fri, 22 Nov 2024 14:51:49 +0000
Subject: [PATCH 14/43] feat: added collection for volumes

---
 .../gcp/fix_plugin_gcp/resources/compute.py        | 61 +++++++++++++++++++
 .../fix_plugin_gcp/resources/monitoring.py         |  4 +-
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py
index ee4656d2a2..f0c05cc691 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/compute.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py
@@ -1205,6 +1205,67 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None:
             builder.dependant_node(self, clazz=GcpInstance, link=user, reverse=True, delete_same_as_default=False)
         builder.add_edge(self, reverse=True, clazz=GcpDiskType, link=self.volume_type)

+    def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]:
+        queries: List[GcpMonitoringQuery] = []
+        delta = builder.metrics_delta
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name="compute.googleapis.com/instance/disk/average_io_queue_depth",
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=self.id,
+                    metric_name=MetricName.VolumeQueueLength,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    label_name="device_name",
+                )
+                for stat in STAT_LIST
+            ]
+        )
+
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name=name,
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=self.id,
+                    metric_name=metric_name,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    label_name="device_name",
+                )
+                for stat in STAT_LIST
+                for name, metric_name in [
+                    ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead),
+                    ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite),
+                ]
+            ]
+        )
+
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name=name,
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=self.id,
+                    metric_name=metric_name,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    label_name="device_name",
+                )
+                for stat in STAT_LIST
+                for name, metric_name in [
+                    ("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead),
+                    ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite),
+                ]
+            ]
+        )
+        return queries
+

 @define(eq=False, slots=False)
 class GcpExternalVpnGatewayInterface:
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index 0120400481..beacd8f848 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -226,14 +226,14 @@ def update_resource_metrics(
         if not normalizer:
             continue

-        most_recent_value = metric.metric_values[0]
+        average_value = sum(metric.metric_values) / len(metric.metric_values)

         try:
             metric_name = query.metric_name
             if not metric_name:
                 continue
             name = metric_name + "_" + normalizer.unit
-            value = normalizer.normalize_value(most_recent_value)
+            value = normalizer.normalize_value(average_value)
             stat_name = normalizer.get_stat_value(query.stat)
             if stat_name:
                 resource._resource_usage[name][str(stat_name)] = value
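The monitoring change above replaces the most recent data point with the mean over the whole window. A minimal illustration with three aligned points:

    metric_values = [8.0, 2.0, 5.0]
    metric_values[0]                         # 8.0 -> what was reported before
    sum(metric_values) / len(metric_values)  # 5.0 -> what is reported after this patch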
From 8bc5c09a0e2dcfb39648fabc949ebc546e7766a7 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Fri, 22 Nov 2024 16:03:26 +0000
Subject: [PATCH 15/43] feat: added collection for sql instance

---
 fixlib/fixlib/baseresources.py                     |  2 +
 .../fix_plugin_gcp/resources/monitoring.py         |  5 +-
 .../gcp/fix_plugin_gcp/resources/sqladmin.py       | 66 ++++++++++++++++++-
 3 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/fixlib/fixlib/baseresources.py b/fixlib/fixlib/baseresources.py
index e6f42e1713..f82b66b036 100644
--- a/fixlib/fixlib/baseresources.py
+++ b/fixlib/fixlib/baseresources.py
@@ -194,6 +194,8 @@ def __str__(self) -> str:
     DiskQueueDepth = "disk_queue_depth"
     NetworkReceiveThroughput = "network_receive_throughput"
     NetworkTransmitThroughput = "network_transmit_throughput"
+    NetworkBytesSent = "network_bytes_sent"
+    NetworkBytesReceived = "network_bytes_received"

     # serverless
     Invocations = "invocations"
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index beacd8f848..ced12c8459 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -64,6 +64,7 @@ class GcpMonitoringQuery:
     metric_id: str  # unique metric identifier (metric_name + instance_id)
     stat: str  # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN
     label_name: str
+    metric_label_query: bool  # filter on `metric` labels by default; set False to filter on `resource` labels
     normalization: Optional[MetricNormalization] = None  # normalization info
     region: Optional[GcpRegion] = None

@@ -77,6 +78,7 @@ def create(
         metric_name: Union[str, MetricName],
         stat: str,
         label_name: str,
+        metric_label_query: bool = True,
         normalization: Optional[MetricNormalization] = None,
         region: Optional[GcpRegion] = None,
     ) -> "GcpMonitoringQuery":
@@ -91,6 +93,7 @@ def create(
             resource_name=resource_name,
             metric_id=metric_id,
             label_name=label_name,
+            metric_label_query=metric_label_query,
             stat=stat,
             region=region,
             normalization=normalization,
@@ -194,7 +197,7 @@ def _query_for_chunk(
         query_result = []
         local_api_spec = deepcopy(api_spec)
         local_api_spec.request_parameter["filter"] = (
-            f'metric.type = "{query.query_name}" AND metric.labels.{query.label_name} = "{query.resource_name}"'
+            f'metric.type = "{query.query_name}" AND {"metric" if query.metric_label_query else "resource"}.labels.{query.label_name} = "{query.resource_name}"'
         )
         local_api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s"
         local_api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat
diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
index 8786452307..1c9f9f21e4 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
@@ -7,7 +7,8 @@
 from fix_plugin_gcp.gcp_client import GcpApiSpec
 from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder
 from fix_plugin_gcp.resources.compute import GcpSslCertificate
-from fixlib.baseresources import BaseDatabase, DatabaseInstanceStatus, ModelReference
+from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, normalizer_factory, STAT_LIST
+from fixlib.baseresources import BaseDatabase, DatabaseInstanceStatus, MetricName, ModelReference
 from fixlib.json_bender import F, Bender, S, Bend, ForallBend, K, MapEnum, AsInt
 from fixlib.types import Json

@@ -766,6 +767,69 @@ def collect_sql_resources(spec: GcpApiSpec, clazz: Type[GcpResource]) -> None:

         graph_builder.submit_work(collect_sql_resources, spec, cls)

+    def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]:
+        queries: List[GcpMonitoringQuery] = []
+        delta = builder.metrics_delta
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name="cloudsql.googleapis.com/database/cpu/utilization",
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=f"{builder.project.id}:{self.id}",
+                    metric_name=MetricName.CpuUtilization,
+                    normalization=normalizer_factory.percent,
+                    stat=stat,
+                    label_name="database_id",
+                    metric_label_query=False,
+                )
+                for stat in STAT_LIST
+            ]
+        )
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name=name,
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=f"{builder.project.id}:{self.id}",
+                    metric_name=metric_name,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    label_name="database_id",
+                    metric_label_query=False,
+                )
+                for stat in STAT_LIST
+                for name, metric_name in [
+                    ("cloudsql.googleapis.com/database/network/connections", MetricName.DatabaseConnections),
+                    ("cloudsql.googleapis.com/database/network/sent_bytes_count", MetricName.NetworkBytesSent),
+                    ("cloudsql.googleapis.com/database/network/received_bytes_count", MetricName.NetworkBytesReceived),
+                ]
+            ]
+        )
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name=name,
+                    period=delta,
+                    ref_id=self.id,
+                    resource_name=f"{builder.project.id}:{self.id}",
+                    metric_name=metric_name,
+                    normalization=normalizer_factory.iops,
+                    stat=stat,
+                    label_name="database_id",
+                    metric_label_query=False,
+                )
+                for stat in STAT_LIST
+                for name, metric_name in [
+                    ("cloudsql.googleapis.com/database/disk/read_ops_count", MetricName.DiskRead),
+                    ("cloudsql.googleapis.com/database/disk/write_ops_count", MetricName.DiskWrite),
+                ]
+            ]
+        )
+        return queries
+
     @classmethod
     def called_collect_apis(cls) -> List[GcpApiSpec]:
         return [
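Cloud SQL time series identify the instance via resource labels rather than metric labels, which is what `metric_label_query=False` selects. The CPU query above should therefore yield a filter roughly like (project and instance names illustrative):

    metric.type = "cloudsql.googleapis.com/database/cpu/utilization" AND resource.labels.database_id = "my-project:my-instance"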
From 85d618354d17b1c74f351fcfce52ddcda0d1fbd5 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Fri, 22 Nov 2024 17:30:42 +0000
Subject: [PATCH 16/43] feat: make unique queries if resources have same name

---
 plugins/gcp/fix_plugin_gcp/collector.py            |  3 ++-
 plugins/gcp/fix_plugin_gcp/resources/compute.py    | 14 +++++++-------
 plugins/gcp/fix_plugin_gcp/resources/monitoring.py |  4 +++-
 plugins/gcp/fix_plugin_gcp/resources/sqladmin.py   |  6 +++---
 4 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py
index 4474efe7f1..51205538e8 100644
--- a/plugins/gcp/fix_plugin_gcp/collector.py
+++ b/plugins/gcp/fix_plugin_gcp/collector.py
@@ -172,7 +172,8 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None:
             if region := cast(GcpRegion, resource.region()):
                 resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder)
                 if resource_queries:
-                    lookup_map[resource.id] = resource
+                    # set unique GcpMonitoringQuery.ref_id
+                    lookup_map[f"{resource.kind}/{resource.id}/{region.id}"] = resource
                     for query in resource_queries:
                         query_region = query.region or region
                         start = builder.metrics_delta
diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py
index f0c05cc691..542d58438b 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/compute.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py
@@ -1214,7 +1214,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name="compute.googleapis.com/instance/disk/average_io_queue_depth",
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=MetricName.VolumeQueueLength,
                     normalization=normalizer_factory.count,
@@ -1230,7 +1230,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
@@ -1250,7 +1250,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
@@ -3627,7 +3627,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name="compute.googleapis.com/instance/cpu/utilization",
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=MetricName.CpuUtilization,
                     normalization=normalizer_factory.percent,
@@ -3642,7 +3642,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
@@ -3662,7 +3662,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
@@ -3682,7 +3682,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index ced12c8459..bdff3777f6 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -60,7 +60,8 @@ class GcpMonitoringQuery:
     query_name: str  # name of the metric (e.g., GCP metric type)
     resource_name: str  # name of resource
     period: timedelta  # period of the metric
-    ref_id: str  # reference ID for the resource (e.g., instance ID)
+    ref_id: str  # A unique identifier for the resource, formatted as `{resource_kind}/{resource_id}/{resource_region}`.
+    # Example: "gcp_instance/12345/us-central1". This is used to uniquely reference resources across kinds and regions.
     metric_id: str  # unique metric identifier (metric_name + instance_id)
     stat: str  # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN
     label_name: str
@@ -219,6 +220,7 @@ def update_resource_metrics(
     resources_map: Dict[str, V],
    monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData],
 ) -> None:
+    a = None
     for query, metric in monitoring_metric_result.items():
         resource = resources_map.get(query.ref_id)
         if resource is None:
diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
index 1c9f9f21e4..6a3772a73c 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
@@ -776,7 +776,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name="cloudsql.googleapis.com/database/cpu/utilization",
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=MetricName.CpuUtilization,
                     normalization=normalizer_factory.percent,
@@ -792,7 +792,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
@@ -813,7 +813,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 GcpMonitoringQuery.create(
                     query_name=name,
                     period=delta,
-                    ref_id=self.id,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
                     resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=metric_name,
                     normalization=normalizer_factory.iops,
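With the composite key, resources that share a bare id but differ in kind or region no longer overwrite each other in `lookup_map`. Sketch with hypothetical ids:

    lookup_map["gcp_instance/db-1/us-central1"] = vm                 # a VM named db-1
    lookup_map["gcp_sql_database_instance/db-1/us-central1"] = sql   # a Cloud SQL instance named db-1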
resource_name=f"{builder.project.id}:{self.id}", + metric_name=metric_name, + normalization=normalizer_factory.iops, + stat=stat, + label_name="database_id", + metric_lable_query=False, + ) + for stat in STAT_LIST + for name, metric_name in [ + ("cloudsql.googleapis.com/database/disk/read_ops_count", MetricName.DiskRead), + ("cloudsql.googleapis.com/database/disk/write_ops_count", MetricName.DiskWrite), + ] + ] + ) + return queries + @classmethod def called_collect_apis(cls) -> List[GcpApiSpec]: return [ From 85d618354d17b1c74f351fcfce52ddcda0d1fbd5 Mon Sep 17 00:00:00 2001 From: Kirill Date: Fri, 22 Nov 2024 17:30:42 +0000 Subject: [PATCH 16/43] feat: make unique queries if resources have same name --- plugins/gcp/fix_plugin_gcp/collector.py | 3 ++- plugins/gcp/fix_plugin_gcp/resources/compute.py | 14 +++++++------- plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 4 +++- plugins/gcp/fix_plugin_gcp/resources/sqladmin.py | 6 +++--- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 4474efe7f1..51205538e8 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -172,7 +172,8 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: if region := cast(GcpRegion, resource.region()): resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) if resource_queries: - lookup_map[resource.id] = resource + # set unique GcpMonitoringQuery.ref_id + lookup_map[f"{resource.kind}/{resource.id}/{region.id}"] = resource for query in resource_queries: query_region = query.region or region start = builder.metrics_delta diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index f0c05cc691..542d58438b 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -1214,7 +1214,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer GcpMonitoringQuery.create( query_name="compute.googleapis.com/instance/disk/average_io_queue_depth", period=delta, - ref_id=self.id, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", resource_name=self.id, metric_name=MetricName.VolumeQueueLength, normalization=normalizer_factory.count, @@ -1230,7 +1230,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer GcpMonitoringQuery.create( query_name=name, period=delta, - ref_id=self.id, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", resource_name=self.id, metric_name=metric_name, normalization=normalizer_factory.count, @@ -1250,7 +1250,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer GcpMonitoringQuery.create( query_name=name, period=delta, - ref_id=self.id, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", resource_name=self.id, metric_name=metric_name, normalization=normalizer_factory.count, @@ -3627,7 +3627,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer GcpMonitoringQuery.create( query_name="compute.googleapis.com/instance/cpu/utilization", period=delta, - ref_id=self.id, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", resource_name=self.id, metric_name=MetricName.CpuUtilization, normalization=normalizer_factory.percent, @@ -3642,7 +3642,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer GcpMonitoringQuery.create( query_name=name, period=delta, 
From 240fd911f56e84b300d2be7893845b7a5239893f Mon Sep 17 00:00:00 2001
From: Kirill
Date: Mon, 25 Nov 2024 12:39:33 +0000
Subject: [PATCH 19/43] feat: reimplemented collection and make filter
 dynamically

---
 plugins/gcp/fix_plugin_gcp/collector.py            | 14 ++---
 plugins/gcp/fix_plugin_gcp/resources/base.py       | 15 ++++-
 .../resources/cloudfunctions.py                    | 57 ++++++++++++++++++-
 .../gcp/fix_plugin_gcp/resources/compute.py        | 28 ++++-----
 .../fix_plugin_gcp/resources/monitoring.py         | 44 +++++++++-----
 .../gcp/fix_plugin_gcp/resources/sqladmin.py       | 24 +++++---
 6 files changed, 133 insertions(+), 49 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py
index 51205538e8..300bcdbe15 100644
--- a/plugins/gcp/fix_plugin_gcp/collector.py
+++ b/plugins/gcp/fix_plugin_gcp/collector.py
@@ -29,13 +29,13 @@
 log = logging.getLogger("fix.plugins.gcp")
 all_resources: List[Type[GcpResource]] = (
     compute.resources
-    + container.resources
-    + billing.resources
-    + sqladmin.resources
-    + storage.resources
-    + aiplatform.resources
-    + firestore.resources
-    + filestore.resources
+    # + container.resources
+    # + billing.resources
+    # + sqladmin.resources
+    # + storage.resources
+    # + aiplatform.resources
+    # + firestore.resources
+    # + filestore.resources
     + cloudfunctions.resources
 )
diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py
index 005d7c9bd7..096d02ebe7 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/base.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/base.py
@@ -9,10 +9,10 @@
 from typing import Callable, List, ClassVar, Optional, TypeVar, Type, Any, Dict, Set, Tuple

 from attr import define, field
-from fix_plugin_gcp.config import GcpConfig
 from google.auth.credentials import Credentials as GoogleAuthCredentials
 from googleapiclient.errors import HttpError

+from fix_plugin_gcp.config import GcpConfig
 from fix_plugin_gcp.gcp_client import GcpClient, GcpApiSpec, InternalZoneProp, RegionProp
 from fix_plugin_gcp.utils import Credentials
 from fixlib.baseresources import (
@@ -32,9 +32,8 @@
 from fixlib.json_bender import bend, Bender, S, Bend, MapDict, F
 from fixlib.threading import ExecutorQueue
 from fixlib.types import Json
-from fixinventorydata.cloud import regions as cloud_region_data
-
-from fixlib.utils import utc
+from fixlib.utils import utc
+from fixinventorydata.cloud import regions as cloud_region_data

 log = logging.getLogger("fix.plugins.gcp")
@@ -335,6 +334,16 @@ def _keys(self) -> Tuple[Any, ...]:
             return tuple(list(super()._keys()) + [self.link])
         return super()._keys()

+    @property
+    def resource_raw_name(self) -> str:
+        """
+        Extracts the last segment of the GCP resource ID.
+
+        Returns:
+            str: The last segment of the resource ID (e.g., "function-1" from "projects/{project}/locations/{location}/functions/function-1").
+        """
+        return self.id.strip().split("/")[-1]
+
     def delete(self, graph: Graph) -> bool:
         if not self.api_spec:
             return False
diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
index 0fd5f113e2..07979e99cd 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
@@ -4,8 +4,9 @@
 from attr import define, field

 from fix_plugin_gcp.gcp_client import GcpApiSpec
-from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus
-from fixlib.baseresources import BaseServerlessFunction
+from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder
+from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, normalizer_factory, STAT_LIST
+from fixlib.baseresources import BaseServerlessFunction, MetricName
 from fixlib.json_bender import Bender, S, Bend, ForallBend

@@ -300,5 +301,57 @@ class GcpCloudFunction(GcpResource, BaseServerlessFunction):
     upgrade_info: Optional[GcpUpgradeInfo] = field(default=None)
     url: Optional[str] = field(default=None)

+    def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]:
+        queries: List[GcpMonitoringQuery] = []
+        delta = builder.metrics_delta
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name=name,
+                    period=delta,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
+                    metric_name=metric_name,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    project_id=builder.project.id,
+                    metric_filters={
+                        **(
+                            {"metric.labels.status": "error"}
+                            if metric_name == MetricName.Errors
+                            else {"metric.labels.status": "ok"}
+                        ),
+                        "resource.labels.function_name": self.resource_raw_name,
+                        "resource.labels.region": self.region().id,
+                        "resource.type": "cloud_function",
+                    },
+                )
+                for stat in STAT_LIST
+                for name, metric_name in [
+                    ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Invocations),
+                    ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Errors),
+                ]
+            ]
+        )
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name="cloudfunctions.googleapis.com/function/execution_times",
+                    period=delta,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
+                    metric_name=MetricName.Duration,
+                    normalization=normalizer_factory.milliseconds,
+                    stat="ALIGN_NONE",
+                    project_id=builder.project.id,
+                    metric_filters={
+                        "resource.labels.function_name": self.resource_raw_name,
+                        "resource.labels.region": self.region().id,
+                        "resource.type": "cloud_function",
+                    },
+                )
+            ]
+        )
+        return queries
+

 resources: List[Type[GcpResource]] = [GcpCloudFunction]
diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py
index 542d58438b..610f03195f 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/compute.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py
@@ -1215,11 +1215,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name="compute.googleapis.com/instance/disk/average_io_queue_depth",
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=MetricName.VolumeQueueLength,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="device_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
             ]
@@ -1231,11 +1231,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="device_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
@@ -1251,11 +1251,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="device_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.device_name": self.id},
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
@@ -3628,11 +3628,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name="compute.googleapis.com/instance/cpu/utilization",
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=MetricName.CpuUtilization,
                     normalization=normalizer_factory.percent,
                     stat=stat,
-                    label_name="instance_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
             ]
@@ -3643,11 +3643,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="instance_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
@@ -3663,11 +3663,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="instance_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
@@ -3683,11 +3683,11 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=self.id,
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="instance_name",
+                    project_id=builder.project.id,
+                    metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id},
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index 58e6034a66..b428272235 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -58,16 +58,18 @@ class GcpMonitoringQuery:
     metric_name: Union[str, MetricName]  # final name of the metric
     query_name: str  # name of the metric (e.g., GCP metric type)
-    resource_name: str  # name of resource
+    # resource_name: str # name of resource
     period: timedelta  # period of the metric
     ref_id: str  # A unique identifier for the resource, formatted as `{resource_kind}/{resource_id}/{resource_region}`.
     # Example: "gcp_instance/12345/us-central1". This is used to uniquely reference resources across kinds and regions.
     metric_id: str  # unique metric identifier (metric_name + instance_id)
     stat: str  # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN
-    label_name: str
-    metric_label_query: bool  # filter on `metric` labels by default; set False to filter on `resource` labels
+    # label_name: str
+    # metric_label_query: bool # filter on `metric` labels by default; set False to filter on `resource` labels
+    project_id: str  # GCP project name
     normalization: Optional[MetricNormalization] = None  # normalization info
     region: Optional[GcpRegion] = None
+    metric_filters: Optional[Tuple[Tuple[str, str], ...]] = None  # Immutable structure

     @staticmethod
@@ -75,29 +77,33 @@ def create(
         query_name: str,
         period: timedelta,
         ref_id: str,
-        resource_name: str,
+        # resource_name: str,
         metric_name: Union[str, MetricName],
         stat: str,
-        label_name: str,
-        metric_label_query: bool = True,
+        # label_name: str,
+        # metric_label_query: bool = True,
+        project_id: str,
         normalization: Optional[MetricNormalization] = None,
         region: Optional[GcpRegion] = None,
+        metric_filters: Optional[Dict[str, str]] = None,
     ) -> "GcpMonitoringQuery":
         # Metric ID generation: metric query name + resource ID + stat
         metric_id = f"{query_name}/{ref_id}/{stat}"
+        immutable_filters = tuple(metric_filters.items()) if metric_filters else None
         return GcpMonitoringQuery(
             metric_name=metric_name,
             query_name=query_name,
             period=period,
             ref_id=ref_id,
-            resource_name=resource_name,
+            # resource_name=resource_name,
             metric_id=metric_id,
-            label_name=label_name,
-            metric_label_query=metric_label_query,
+            # label_name=label_name,
+            # metric_label_query=metric_label_query,
             stat=stat,
             region=region,
             normalization=normalization,
+            project_id=project_id,
+            metric_filters=immutable_filters,
         )

@@ -160,7 +166,7 @@ def query_for(
             "interval_endTime": utc_str(end_time),
             "interval_startTime": utc_str(start_time),
             "view": "FULL",
-            # set parametes below dynamically
+            # Below parameters are intended to be set dynamically
             # "aggregation_alignmentPeriod": None,
             # "aggregation_perSeriesAligner": None,
             # "filter": None,
@@ -197,9 +203,19 @@ def _query_for_chunk(
     ) -> "List[Tuple[str, GcpMonitoringMetricData]]":
         query_result = []
         local_api_spec = deepcopy(api_spec)
-        local_api_spec.request_parameter["filter"] = (
-            f'metric.type = "{query.query_name}" AND {"metric" if query.metric_label_query else "resource"}.labels.{query.label_name} = "{query.resource_name}"'
-        )
+
+        # Base filter
+        filters = [
+            f'metric.type = "{query.query_name}"',
+            f'resource.labels.project_id="{query.project_id}"',
+        ]
+
+        # Add additional filters
+        if query.metric_filters:
+            filters.extend(f'{key} = "{value}"' for key, value in query.metric_filters)
+
+        # Join filters with " AND " to form the final filter string
+        local_api_spec.request_parameter["filter"] = " AND ".join(filters)
         local_api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s"
         local_api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat
diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
index 6a3772a73c..bb2caf3aed 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py
@@ -777,12 +777,14 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name="cloudsql.googleapis.com/database/cpu/utilization",
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=MetricName.CpuUtilization,
                     normalization=normalizer_factory.percent,
                     stat=stat,
-                    label_name="database_id",
-                    metric_label_query=False,
+                    project_id=builder.project.id,
+                    metric_filters={
+                        "resource.labels.database_id": f"{builder.project.id}:{self.id}",
+                        "resource.labels.region": self.region().id,
+                    },
                 )
                 for stat in STAT_LIST
             ]
@@ -793,12 +795,14 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=metric_name,
                     normalization=normalizer_factory.count,
                     stat=stat,
-                    label_name="database_id",
-                    metric_label_query=False,
+                    project_id=builder.project.id,
+                    metric_filters={
+                        "resource.labels.database_id": f"{builder.project.id}:{self.id}",
+                        "resource.labels.region": self.region().id,
+                    },
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
@@ -816,12 +820,14 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     query_name=name,
                     period=delta,
                     ref_id=f"{self.kind}/{self.id}/{self.region().id}",
-                    resource_name=f"{builder.project.id}:{self.id}",
                     metric_name=metric_name,
                     normalization=normalizer_factory.iops,
                     stat=stat,
-                    label_name="database_id",
-                    metric_label_query=False,
+                    project_id=builder.project.id,
+                    metric_filters={
+                        "resource.labels.database_id": f"{builder.project.id}:{self.id}",
+                        "resource.labels.region": self.region().id,
+                    },
                 )
                 for stat in STAT_LIST
                 for name, metric_name in [
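A sketch of what the new `metric_filters` mechanism produces for one of the Cloud Function queries above (project and function names illustrative):

    filters = [
        'metric.type = "cloudfunctions.googleapis.com/function/execution_count"',
        'resource.labels.project_id="my-project"',
        'metric.labels.status = "ok"',
        'resource.labels.function_name = "function-1"',
        'resource.labels.region = "us-east1"',
        'resource.type = "cloud_function"',
    ]
    request_filter = " AND ".join(filters)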
From ace0bc9f875b21dbf982557f54074cf268265e25 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Mon, 25 Nov 2024 13:34:48 +0000
Subject: [PATCH 20/43] feat: adjusted better

---
 plugins/gcp/fix_plugin_gcp/collector.py            |  6 ++---
 plugins/gcp/fix_plugin_gcp/resources/base.py       |  9 +++++++
 .../resources/cloudfunctions.py                    | 27 ++++++++++++++-----
 .../fix_plugin_gcp/resources/monitoring.py         | 14 +---------
 4 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py
index 300bcdbe15..050da85140 100644
--- a/plugins/gcp/fix_plugin_gcp/collector.py
+++ b/plugins/gcp/fix_plugin_gcp/collector.py
@@ -28,7 +28,7 @@
 log = logging.getLogger("fix.plugins.gcp")
 all_resources: List[Type[GcpResource]] = (
-    compute.resources
+    # compute.resources
     # + container.resources
     # + billing.resources
     # + sqladmin.resources
     # + storage.resources
     # + aiplatform.resources
     # + firestore.resources
     # + filestore.resources
-    + cloudfunctions.resources
+    cloudfunctions.resources
 )
@@ -175,7 +175,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None:
                     # set unique GcpMonitoringQuery.ref_id
                     lookup_map[f"{resource.kind}/{resource.id}/{region.id}"] = resource
                     for query in resource_queries:
-                        query_region = query.region or region
+                        query_region = region
                         start = builder.metrics_delta
diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py
index 096d02ebe7..3cbfd538a4 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/base.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/base.py
@@ -203,6 +203,15 @@ def _standard_edges(self, node: GcpResourceType, source: Optional[Json] = None)
         if node._region:
             self.add_edge(node, node=node._region, reverse=True)
             return True
+        if "locations" in node.id:
+            parts = node.id.split("/")
+            loc_index = parts.index("locations")
+            if loc_index + 1 < len(parts):
+                location_name = parts[loc_index + 1]
+                if region := self.region_by_name.get(location_name):
+                    node._region = region
+                    self.add_edge(node, node=region, reverse=True)
+                    return True

         if source is not None:
             if InternalZoneProp in source:
diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
index 07979e99cd..0ddb47ddf8 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py
@@ -316,11 +316,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                     stat=stat,
                     project_id=builder.project.id,
                     metric_filters={
-                        **(
-                            {"metric.labels.status": "error"}
-                            if metric_name == MetricName.Errors
-                            else {"metric.labels.status": "ok"}
-                        ),
+                        "metric.labels.status": "ok",
                         "resource.labels.function_name": self.resource_raw_name,
                         "resource.labels.region": self.region().id,
                         "resource.type": "cloud_function",
                     },
@@ -329,10 +325,29 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer
                 for stat in STAT_LIST
                 for name, metric_name in [
                     ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Invocations),
-                    ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Errors),
                 ]
             ]
         )
+        queries.extend(
+            [
+                GcpMonitoringQuery.create(
+                    query_name="cloudfunctions.googleapis.com/function/execution_count",
+                    period=delta,
+                    ref_id=f"{self.kind}/{self.id}/{self.region().id}",
+                    metric_name=MetricName.Errors,
+                    normalization=normalizer_factory.count,
+                    stat=stat,
+                    project_id=builder.project.id,
+                    metric_filters={
+                        "metric.labels.status": "error",
+                        "resource.labels.function_name": self.resource_raw_name,
+                        "resource.labels.region": self.region().id,
+                        "resource.type": "cloud_function",
+                    },
+                )
+                for stat in STAT_LIST
+            ]
+        )
         queries.extend(
             [
                 GcpMonitoringQuery.create(
                     query_name="cloudfunctions.googleapis.com/function/execution_times",
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index b428272235..84be792495 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -58,17 +58,13 @@ class GcpMonitoringQuery:
     metric_name: Union[str, MetricName]  # final name of the metric
     query_name: str  # name of the metric (e.g., GCP metric type)
-    # resource_name: str # name of resource
     period: timedelta  # period of the metric
     ref_id: str  # A unique identifier for the resource, formatted as `{resource_kind}/{resource_id}/{resource_region}`.
     # Example: "gcp_instance/12345/us-central1". This is used to uniquely reference resources across kinds and regions.
     metric_id: str  # unique metric identifier (metric_name + instance_id)
     stat: str  # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN
-    # label_name: str
-    # metric_label_query: bool # filter on `metric` labels by default; set False to filter on `resource` labels
     project_id: str  # GCP project name
     normalization: Optional[MetricNormalization] = None  # normalization info
-    region: Optional[GcpRegion] = None
     metric_filters: Optional[Tuple[Tuple[str, str], ...]] = None  # Immutable structure

     @staticmethod
@@ -77,15 +73,11 @@ def create(
         query_name: str,
         period: timedelta,
         ref_id: str,
-        # resource_name: str,
         metric_name: Union[str, MetricName],
         stat: str,
-        # label_name: str,
-        # metric_label_query: bool = True,
         project_id: str,
+        metric_filters: Dict[str, str],
         normalization: Optional[MetricNormalization] = None,
-        region: Optional[GcpRegion] = None,
-        metric_filters: Optional[Dict[str, str]] = None,
     ) -> "GcpMonitoringQuery":
         # Metric ID generation: metric query name + resource ID + stat
         metric_id = f"{query_name}/{ref_id}/{stat}"
         immutable_filters = tuple(metric_filters.items()) if metric_filters else None
         return GcpMonitoringQuery(
             metric_name=metric_name,
             query_name=query_name,
             period=period,
             ref_id=ref_id,
-            # resource_name=resource_name,
             metric_id=metric_id,
-            # label_name=label_name,
-            # metric_label_query=metric_label_query,
             stat=stat,
-            region=region,
             normalization=normalization,
             project_id=project_id,
             metric_filters=immutable_filters,
         )
From 95271328eed71d62e8d6feb7334c71328eebeab4 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Mon, 25 Nov 2024 14:05:21 +0000
Subject: [PATCH 21/43] feat: make queries more unique

---
 plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index 84be792495..a0a7d885dc 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -79,9 +79,10 @@ def create(
         metric_filters: Dict[str, str],
         normalization: Optional[MetricNormalization] = None,
     ) -> "GcpMonitoringQuery":
-        # Metric ID generation: metric query name + resource ID + stat
-        metric_id = f"{query_name}/{ref_id}/{stat}"
-        immutable_filters = tuple(metric_filters.items()) if metric_filters else None
+        sorted_filters = sorted(metric_filters.items())
+        filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted_filters)
+        metric_id = f"{query_name}/{ref_id}/{stat}{filter_suffix}"
+        immutable_filters = tuple(sorted_filters)
         return GcpMonitoringQuery(
             metric_name=metric_name,
             query_name=query_name,
@@ -242,9 +243,8 @@ def update_resource_metrics(
                 continue
             name = metric_name + "_" + normalizer.unit
             value = normalizer.normalize_value(average_value)
-            stat_name = normalizer.get_stat_value(query.stat)
-            if stat_name:
-                resource._resource_usage[name][str(stat_name)] = value
+            stat_name = normalizer.get_stat_value(query.stat) if normalizer.get_stat_value(query.stat) else "avg"
+            resource._resource_usage[name][str(stat_name)] = value
         except KeyError as e:
             log.warning(f"An error occurred while setting metric values: {e}")
             raise
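After this change, two queries that differ only in a filter value get distinct metric ids, e.g. (illustrative):

    compute.googleapis.com/instance/disk/read_ops_count/gcp_instance/i-1/us-east1/ALIGN_MEAN/metric.labels.device_name=disk-a
    compute.googleapis.com/instance/disk/read_ops_count/gcp_instance/i-1/us-east1/ALIGN_MEAN/metric.labels.device_name=disk-b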
From fce7179d8e00b4df39fcb54fa158f097db81b964 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Mon, 25 Nov 2024 16:12:12 +0000
Subject: [PATCH 22/43] feat: make safety check for region

---
 plugins/gcp/fix_plugin_gcp/resources/base.py       | 19 ++++++++++++-----
 .../fix_plugin_gcp/resources/monitoring.py         |  2 +-
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py
index 3cbfd538a4..41382ca8ed 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/base.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/base.py
@@ -203,11 +203,20 @@ def _standard_edges(self, node: GcpResourceType, source: Optional[Json] = None)
         if node._region:
             self.add_edge(node, node=node._region, reverse=True)
             return True
-        if "locations" in node.id:
-            parts = node.id.split("/")
-            loc_index = parts.index("locations")
-            if loc_index + 1 < len(parts):
-                location_name = parts[loc_index + 1]
+
+        parts = node.id.split("/")
+        if len(parts) > 3 and parts[0] == "projects":
+            location_types = ["locations", "zones", "regions"]
+            if parts[2] in location_types:
+                location_name = parts[3]
+                # Check for zone first
+                if zone := self.zone_by_name.get(location_name):
+                    node._zone = zone
+                    node._region = self.region_by_zone_name.get(zone.id)
+                    self.add_edge(node, node=zone, reverse=True)
+                    return True
+
+                # Then check for region
                 if region := self.region_by_name.get(location_name):
                     node._region = region
                     self.add_edge(node, node=region, reverse=True)
                     return True
diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index a0a7d885dc..41afb7d36c 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -7,7 +7,7 @@

 from attr import define, field, frozen

-from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion
+from fix_plugin_gcp.resources.base import GraphBuilder
 from fix_plugin_gcp.gcp_client import GcpApiSpec
 from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName
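The stricter check only parses ids of the shape projects/&lt;project&gt;/(locations|zones|regions)/&lt;name&gt;/...; a few hypothetical ids and their outcomes:

    projects/proj/zones/us-east1-b/disks/disk-1      -> zone us-east1-b, region derived from the zone
    projects/proj/locations/us-east1/functions/fn-1  -> region us-east1
    some/other/id                                    -> no match; falls back to the source-based zone/region lookup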
From 8aebcb7eb14a18a150346610ea34bdac66f7bc16 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Tue, 26 Nov 2024 10:48:50 +0000
Subject: [PATCH 23/43] fix tests

---
 plugins/gcp/fix_plugin_gcp/collector.py | 18 +++++++++---------
 plugins/gcp/test/test_monitoring.py     | 12 ++++++------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py
index 050da85140..2635b32100 100644
--- a/plugins/gcp/fix_plugin_gcp/collector.py
+++ b/plugins/gcp/fix_plugin_gcp/collector.py
@@ -28,15 +28,15 @@
 log = logging.getLogger("fix.plugins.gcp")
 all_resources: List[Type[GcpResource]] = (
-    # compute.resources
-    # + container.resources
-    # + billing.resources
-    # + sqladmin.resources
-    # + storage.resources
-    # + aiplatform.resources
-    # + firestore.resources
-    # + filestore.resources
-    cloudfunctions.resources
+    compute.resources
+    + container.resources
+    + billing.resources
+    + sqladmin.resources
+    + storage.resources
+    + aiplatform.resources
+    + firestore.resources
+    + filestore.resources
+    + cloudfunctions.resources
 )
diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py
index 86513cbc31..7f7c8c052a 100644
--- a/plugins/gcp/test/test_monitoring.py
+++ b/plugins/gcp/test/test_monitoring.py
@@ -11,22 +11,22 @@ def test_metric(random_builder: GraphBuilder) -> None:
     read = GcpMonitoringQuery.create(
         query_name="compute.googleapis.com/instance/disk/read_ops_count",
         period=timedelta(hours=1),
-        ref_id="random_instance",
-        resource_name="random_instance",
+        ref_id="gcp_instance/random_instance/global",
         metric_name=MetricName.DiskRead,
         normalization=normalizer_factory.count,
         stat="ALIGN_MIN",
-        label_name="instance_name",
+        project_id=random_builder.project.id,
+        metric_filters={"metric.labels.instance_name": "random_instance", "resource.labels.zone": "global"},
     )
     write = GcpMonitoringQuery.create(
         query_name="compute.googleapis.com/instance/disk/write_ops_count",
         period=timedelta(hours=1),
-        ref_id="random_instance",
-        resource_name="random_instance",
+        ref_id="gcp_instance/random_instance/global",
         metric_name=MetricName.DiskWrite,
         normalization=normalizer_factory.count,
         stat="ALIGN_MIN",
-        label_name="instance_name",
+        project_id=random_builder.project.id,
+        metric_filters={"metric.labels.instance_name": "random_instance", "resource.labels.zone": "global"},
     )
     result = GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now)
     assert all(value > 0 for value in result[read].metric_values or [])
     assert all(value > 0 for value in result[write].metric_values or [])
From 6c1e5e04 Mon Sep 17 00:00:00 2001
From: Kirill
Date: Tue, 26 Nov 2024 12:30:04 +0000
Subject: [PATCH 24/43] feat: compute stats if we do not use align in gcp

---
 .../fix_plugin_gcp/resources/monitoring.py | 53 ++++++++++++-------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
index 41afb7d36c..7053ccba43 100644
--- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
+++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py
@@ -12,7 +12,7 @@
 from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName
 from fixlib.durations import duration_str
 from fixlib.json import from_json
-from fixlib.json_bender import S, Bender, ForallBend, bend
+from fixlib.json_bender import S, Bender, ForallBend, bend, K
 from fixlib.utils import utc_str

 service_name = "monitoring"
@@ -27,6 +27,10 @@ def identity(x: T) -> T:
     return x


+def compute_stats(values: List[float]) -> List[Tuple[float, Optional[StatName]]]:
+    return [(sum(values) / len(values), None)]
+
+
 @frozen(kw_only=True)
 class MetricNormalization:
     unit: MetricUnit
@@ -38,6 +42,8 @@ class MetricNormalization:
     )
     normalize_value: Callable[[float], float] = identity

+    compute_stats: Callable[[List[float]], List[Tuple[float, Optional[StatName]]]] = compute_stats
+
     def get_stat_value(self, key: str) -> Optional[StatName]:
         """
         Get the value from stat_map based on the given key.
@@ -61,7 +67,7 @@ class GcpMonitoringQuery:
     period: timedelta  # period of the metric
     ref_id: str  # A unique identifier for the resource, formatted as `{resource_kind}/{resource_id}/{resource_region}`.
     # Example: "gcp_instance/12345/us-central1". This is used to uniquely reference resources across kinds and regions.
-    metric_id: str  # unique metric identifier (metric_name + instance_id)
+    metric_id: str  # unique metric identifier
     stat: str  # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN
     project_id: str  # GCP project name
     normalization: Optional[MetricNormalization] = None  # normalization info
@@ -101,12 +107,14 @@ class GcpMonitoringMetricData:
     kind: ClassVar[str] = "gcp_monitoring_metric_data"
     mapping: ClassVar[Dict[str, Bender]] = {
         "metric_values": S("points")
-        >> ForallBend(S("value", "doubleValue").or_else(S("value", "int64Value", default=0.0))),
+        >> ForallBend(
+            S("value", "doubleValue").or_else(S("value", "int64Value")).or_else(S("value", "distributionValue", "mean"))
+        ).or_else(K([])),
         "metric_kind": S("metricKind"),
         "value_type": S("valueType"),
         "metric_type": S("metric", "type"),
     }
-    metric_values: Optional[List[float]] = field(factory=list)
+    metric_values: List[float] = field(factory=list)
     metric_kind: Optional[str] = field(default=None)
     value_type: Optional[str] = field(default=None)
     metric_type: Optional[str] = field(default=None)
@@ -229,25 +237,25 @@ def update_resource_metrics(
         resource = resources_map.get(query.ref_id)
         if resource is None:
             continue
-        if not metric.metric_values or len(metric.metric_values) == 0:
+        if len(metric.metric_values) == 0:
             continue
         normalizer = query.normalization
         if not normalizer:
             continue

-        average_value = sum(metric.metric_values) / len(metric.metric_values)
-
-        try:
-            metric_name = query.metric_name
-            if not metric_name:
-                continue
-            name = metric_name + "_" + normalizer.unit
-            value = normalizer.normalize_value(average_value)
-            stat_name = normalizer.get_stat_value(query.stat) if normalizer.get_stat_value(query.stat) else "avg"
-            resource._resource_usage[name][str(stat_name)] = value
-        except KeyError as e:
-            log.warning(f"An error occurred while setting metric values: {e}")
-            raise
+        for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values):
+            try:
+                metric_name = query.metric_name
+                if not metric_name:
+                    continue
+                name = metric_name + "_" + normalizer.unit
+                value = normalizer.normalize_value(metric_value)
+                stat_name = maybe_stat_name or normalizer.get_stat_value(query.stat)
+                if stat_name:
+                    resource._resource_usage[name][str(stat_name)] = value
+            except KeyError as e:
+                log.warning(f"An error occurred while setting metric values: {e}")
+                raise


 class NormalizerFactory:
@@ -290,6 +298,7 @@ def seconds(self) -> MetricNormalization:
     def milliseconds(self) -> MetricNormalization:
         return MetricNormalization(
             unit=MetricUnit.Milliseconds,
+            compute_stats=calculate_min_max_avg,
             normalize_value=lambda x: round(x, ndigits=4),
         )

@@ -301,4 +310,12 @@ def percent(self) -> MetricNormalization:
             normalize_value=lambda x: round(x, ndigits=4),
         )


+def calculate_min_max_avg(values: List[float]) -> List[Tuple[float, Optional[StatName]]]:
+    return [
+        (min(values), StatName.min),
+        (max(values), StatName.max),
+        (sum(values) / len(values), StatName.avg),
+    ]
+
+
 normalizer_factory = NormalizerFactory()
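A worked example of the two stat strategies introduced here, for four data points:

    values = [1.0, 3.0, 5.0, 7.0]
    compute_stats(values)          # [(4.0, None)] -- mean only; the stat name falls back to the query's aligner
    calculate_min_max_avg(values)  # [(1.0, StatName.min), (7.0, StatName.max), (4.0, StatName.avg)]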
b/plugins/gcp/fix_plugin_gcp/collector.py index 2635b32100..7e03b1c58e 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -139,9 +139,9 @@ def get_last_run() -> Optional[datetime]: try: log.info(f"[GCP:{self.project.id}] Collect usage metrics.") self.collect_usage_metrics(global_builder) + global_builder.executor.wait_for_submitted_work() except Exception as e: log.warning(f"Failed to collect usage metrics in project {self.project.id}: {e}") - shared_queue.wait_for_submitted_work() log.info(f"[GCP:{self.project.id}] Connect resources and create edges.") # connect nodes for node, data in list(self.graph.nodes(data=True)): @@ -161,10 +161,6 @@ def get_last_run() -> Optional[datetime]: log.info(f"[GCP:{self.project.id}] Collecting resources done.") def collect_usage_metrics(self, builder: GraphBuilder) -> None: - metrics_queries = defaultdict(list) - two_hours = timedelta(hours=2) - thirty_minutes = timedelta(minutes=30) - lookup_map = {} for resource in builder.graph.nodes: if not isinstance(resource, GcpResource): continue @@ -172,29 +168,24 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> None: if region := cast(GcpRegion, resource.region()): resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) if resource_queries: - # set unique GcpMonitoringQuery.ref_id - lookup_map[f"{resource.kind}/{resource.id}/{region.id}"] = resource - for query in resource_queries: - query_region = region start = builder.metrics_delta - if query.period and query.period < thirty_minutes: - start = min(start, two_hours) - metrics_queries[(query_region, start)].append(query) - for (region, start), queries in metrics_queries.items(): - - def collect_and_set_metrics( - start: timedelta, region: GcpRegion, queries: List[monitoring.GcpMonitoringQuery] - ) -> None: - start_at = builder.created_at - start - try: - result = monitoring.GcpMonitoringMetricData.query_for( - builder.for_region(region), queries, start_at, builder.created_at - ) - monitoring.update_resource_metrics(lookup_map, result) - except Exception as e: - log.warning(f"Error occurred in region {region}: {e}") - builder.submit_work(collect_and_set_metrics, start, region, queries) + def collect_and_set_metrics( + start: timedelta, + region: GcpRegion, + queries: List[monitoring.GcpMonitoringQuery], + resource: GcpResource, + ) -> None: + start_at = builder.created_at - start + try: + result = monitoring.GcpMonitoringMetricData.query_for( + builder.for_region(region), queries, start_at, builder.created_at + ) + monitoring.update_resource_metrics(resource, result) + except Exception as e: + log.warning(f"Error occurred in region {region}: {e}") + + builder.submit_work(collect_and_set_metrics, start, region, resource_queries, resource) def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git a/plugins/gcp/fix_plugin_gcp/config.py b/plugins/gcp/fix_plugin_gcp/config.py index afb0463998..e7373ccc75 100644 --- a/plugins/gcp/fix_plugin_gcp/config.py +++ b/plugins/gcp/fix_plugin_gcp/config.py @@ -1,4 +1,3 @@ -from fixlib.proc import num_default_threads from attrs import define, field from typing import List, ClassVar, Optional @@ -17,7 +16,7 @@ class GcpConfig: metadata={"description": "GCP services to exclude (default: none)"}, ) project_pool_size: int = field( - factory=num_default_threads, + default=64, metadata={"description": "GCP project thread/process pool 
size"}, ) fork_process: bool = field( diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 7053ccba43..e38f869569 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -7,7 +7,7 @@ from attr import define, field, frozen -from fix_plugin_gcp.resources.base import GraphBuilder +from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource from fix_plugin_gcp.gcp_client import GcpApiSpec from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName from fixlib.durations import duration_str @@ -230,13 +230,10 @@ def _query_for_chunk( def update_resource_metrics( - resources_map: Dict[str, V], + resource: GcpResource, monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], ) -> None: for query, metric in monitoring_metric_result.items(): - resource = resources_map.get(query.ref_id) - if resource is None: - continue if len(metric.metric_values) == 0: continue normalizer = query.normalization diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py index c306f81198..ce18bb3ada 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -184,11 +184,9 @@ def test_gcp_instance_usage_metrics(random_builder: GraphBuilder) -> None: random_builder.region = GcpRegion(id="us-east1", name="us-east1") queries = gcp_instance.collect_usage_metrics(random_builder) - lookup_map = {} - lookup_map[f"{gcp_instance.kind}/{gcp_instance.id}/{gcp_instance.region().id}"] = gcp_instance # simulates the `collect_usage_metrics` method found in `GcpAccountCollector`. - def collect_and_set_metrics(start_at: datetime, region: GcpRegion, queries: List[GcpMonitoringQuery]) -> None: + def collect_and_set_metrics(start_at: datetime, _: GcpRegion, queries: List[GcpMonitoringQuery]) -> None: with ThreadPoolExecutor(max_workers=1) as executor: queue = ExecutorQueue(executor, tasks_per_key=lambda _: 1, name="test") g_builder = GraphBuilder( @@ -204,7 +202,7 @@ def collect_and_set_metrics(start_at: datetime, region: GcpRegion, queries: List last_run_started_at=random_builder.last_run_started_at, ) result = GcpMonitoringMetricData.query_for(g_builder, queries, start_at, start_at + timedelta(hours=2)) - update_resource_metrics(lookup_map, result) + update_resource_metrics(gcp_instance, result) start = datetime(2020, 5, 30, 15, 45, 30) diff --git a/plugins/gcp/test/test_config.py b/plugins/gcp/test/test_config.py index 5fc9cae0ba..7ebac1252e 100644 --- a/plugins/gcp/test/test_config.py +++ b/plugins/gcp/test/test_config.py @@ -11,5 +11,5 @@ def test_args() -> None: assert len(Config.gcp.project) == 0 assert len(Config.gcp.collect) == 0 assert len(Config.gcp.no_collect) == 0 - assert Config.gcp.project_pool_size == num_default_threads() + assert Config.gcp.project_pool_size == 64 assert Config.gcp.fork_process is True From 7a21031e5fb137082be4333d0dd492efa74119c0 Mon Sep 17 00:00:00 2001 From: Kirill Date: Tue, 26 Nov 2024 18:09:23 +0000 Subject: [PATCH 26/43] fixed tests and move wait for submitted work in aws --- plugins/aws/fix_plugin_aws/collector.py | 2 +- plugins/gcp/fix_plugin_gcp/collector.py | 1 - plugins/gcp/test/test_config.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/plugins/aws/fix_plugin_aws/collector.py b/plugins/aws/fix_plugin_aws/collector.py index 5b792c009c..8ffabc2a10 100644 --- a/plugins/aws/fix_plugin_aws/collector.py +++ 
b/plugins/aws/fix_plugin_aws/collector.py @@ -242,11 +242,11 @@ def get_last_run() -> Optional[datetime]: try: log.info(f"[Aws:{self.account.id}] Collect usage metrics.") self.collect_usage_metrics(global_builder) + shared_queue.wait_for_submitted_work() except Exception as e: log.warning( f"Failed to collect usage metrics on account {self.account.id} in region {global_builder.region.id}: {e}" ) - shared_queue.wait_for_submitted_work() # call all registered after collect hooks for after_collect in global_builder.after_collect_actions: diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 7e03b1c58e..e2459ab8c9 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,4 +1,3 @@ -from collections import defaultdict from datetime import datetime, timedelta, timezone import logging from concurrent.futures import ThreadPoolExecutor diff --git a/plugins/gcp/test/test_config.py b/plugins/gcp/test/test_config.py index 7ebac1252e..1ce445aead 100644 --- a/plugins/gcp/test/test_config.py +++ b/plugins/gcp/test/test_config.py @@ -1,4 +1,3 @@ -from fixlib.proc import num_default_threads from fixlib.config import Config from fix_plugin_gcp import GCPCollectorPlugin From 6d50044dc3ddbd45f98fe0f8c0595cc5d76ecb8f Mon Sep 17 00:00:00 2001 From: Matthias Veit Date: Wed, 27 Nov 2024 09:34:32 +0100 Subject: [PATCH 27/43] simplify and cleanup --- plugins/gcp/fix_plugin_gcp/collector.py | 39 +++---- plugins/gcp/fix_plugin_gcp/resources/base.py | 63 +++++++++-- .../resources/cloudfunctions.py | 5 +- .../gcp/fix_plugin_gcp/resources/compute.py | 4 +- .../fix_plugin_gcp/resources/monitoring.py | 100 ++---------------- .../gcp/fix_plugin_gcp/resources/sqladmin.py | 4 +- plugins/gcp/test/test_compute.py | 4 +- plugins/gcp/test/test_monitoring.py | 4 +- 8 files changed, 91 insertions(+), 132 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index e2459ab8c9..a67020a634 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,6 +1,6 @@ -from datetime import datetime, timedelta, timezone import logging from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timezone from typing import Type, List, Any, Optional, cast from fix_plugin_gcp.config import GcpConfig @@ -161,30 +161,19 @@ def get_last_run() -> Optional[datetime]: def collect_usage_metrics(self, builder: GraphBuilder) -> None: for resource in builder.graph.nodes: - if not isinstance(resource, GcpResource): - continue - # region can be overridden in the query - if region := cast(GcpRegion, resource.region()): - resource_queries: List[monitoring.GcpMonitoringQuery] = resource.collect_usage_metrics(builder) - if resource_queries: - start = builder.metrics_delta - - def collect_and_set_metrics( - start: timedelta, - region: GcpRegion, - queries: List[monitoring.GcpMonitoringQuery], - resource: GcpResource, - ) -> None: - start_at = builder.created_at - start - try: - result = monitoring.GcpMonitoringMetricData.query_for( - builder.for_region(region), queries, start_at, builder.created_at - ) - monitoring.update_resource_metrics(resource, result) - except Exception as e: - log.warning(f"Error occurred in region {region}: {e}") - - builder.submit_work(collect_and_set_metrics, start, region, resource_queries, resource) + if isinstance(resource, GcpResource) and (mq := resource.collect_usage_metrics(builder)): + + def collect_and_set_metrics() -> 
None: + start_at = builder.created_at - builder.metrics_delta + region = cast(GcpRegion, resource.region()) + try: + rb = builder.for_region(region) + result = monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) + monitoring.update_resource_metrics(resource, result) + except Exception as e: + log.warning(f"Error occurred in region {region}: {e}") + + builder.submit_work(collect_and_set_metrics) def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index 41382ca8ed..366a13add1 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -9,6 +9,8 @@ from typing import Callable, List, ClassVar, Optional, TypeVar, Type, Any, Dict, Set, Tuple from attr import define, field +from attrs import frozen +from frozendict import frozendict from google.auth.credentials import Credentials as GoogleAuthCredentials from googleapiclient.errors import HttpError @@ -24,6 +26,9 @@ BaseZone, ModelReference, PhantomBaseResource, + MetricName, + MetricUnit, + StatName, ) from fixlib.config import Config from fixlib.core.actions import CoreFeedback, ErrorAccumulator @@ -204,22 +209,21 @@ def _standard_edges(self, node: GcpResourceType, source: Optional[Json] = None) self.add_edge(node, node=node._region, reverse=True) return True - parts = node.id.split("/") + parts = node.id.split("/", maxsplit=4) if len(parts) > 3 and parts[0] == "projects": - location_types = ["locations", "zones", "regions"] - if parts[2] in location_types: + if parts[2] in ["locations", "zones", "regions"]: location_name = parts[3] # Check for zone first if zone := self.zone_by_name.get(location_name): node._zone = zone node._region = self.region_by_zone_name.get(zone.id) - self.add_edge(node, node=zone, reverse=True) + self.add_edge(zone, node=node) return True # Then check for region if region := self.region_by_name.get(location_name): node._region = region - self.add_edge(node, node=region, reverse=True) + self.add_edge(region, node=node) return True if source is not None: @@ -333,6 +337,53 @@ def for_region(self, region: GcpRegion) -> GraphBuilder: ) +@frozen(kw_only=True) +class MetricNormalization: + unit: MetricUnit + stat_map: frozendict[str, StatName] = frozendict( + {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} + ) + normalize_value: Callable[[float], float] = lambda x: x + compute_stats: Callable[[List[float]], List[Tuple[float, Optional[StatName]]]] = lambda x: [(sum(x) / len(x), None)] + + +@define(hash=True, frozen=True) +class GcpMonitoringQuery: + metric_name: MetricName # final name of the metric + query_name: str # name of the metric (e.g., GCP metric type) + period: timedelta # period of the metric + metric_id: str # unique metric identifier + stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN + project_id: str # GCP project name + normalization: Optional[MetricNormalization] # normalization info + metric_filters: frozendict[str, str] # filters for the metric + + @staticmethod + def create( + *, + query_name: str, + period: timedelta, + ref_id: str, + metric_name: MetricName, + stat: str, + project_id: str, + metric_filters: Dict[str, str], + normalization: Optional[MetricNormalization] = None, + ) -> "GcpMonitoringQuery": + filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in 
sorted(metric_filters.items())) + metric_id = f"{query_name}/{ref_id}/{stat}{filter_suffix}" + return GcpMonitoringQuery( + metric_name=metric_name, + query_name=query_name, + period=period, + metric_id=metric_id, + stat=stat, + normalization=normalization, + project_id=project_id, + metric_filters=frozendict(metric_filters), + ) + + @define(eq=False, slots=False) class GcpResource(BaseResource): kind: ClassVar[str] = "gcp_resource" @@ -436,7 +487,7 @@ def post_process_instance(self, builder: GraphBuilder, source: Json) -> None: """ pass - def collect_usage_metrics(self, builder: GraphBuilder) -> List: # type: ignore + def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: # Default behavior: do nothing return [] diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index 0ddb47ddf8..9f7f25c45e 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -4,8 +4,8 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder -from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, normalizer_factory, STAT_LIST +from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_LIST from fixlib.baseresources import BaseServerlessFunction, MetricName from fixlib.json_bender import Bender, S, Bend, ForallBend @@ -303,7 +303,6 @@ class GcpCloudFunction(GcpResource, BaseServerlessFunction): def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: queries: List[GcpMonitoringQuery] = [] - queries = [] delta = builder.metrics_delta queries.extend( [ diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 610f03195f..45ad3ecb0b 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -7,9 +7,9 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec, InternalZoneProp -from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder +from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.billing import GcpSku -from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, STAT_LIST, normalizer_factory +from fix_plugin_gcp.resources.monitoring import STAT_LIST, normalizer_factory from fixlib.baseresources import ( BaseAutoScalingGroup, BaseBucket, diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index e38f869569..fe7a4be6bf 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,15 +1,15 @@ -from copy import deepcopy -from functools import cached_property import logging -from datetime import datetime, timedelta -from typing import Callable, ClassVar, Dict, List, Optional, Tuple, TypeVar, Union from concurrent.futures import as_completed +from copy import deepcopy +from datetime import datetime +from functools import cached_property +from typing import ClassVar, Dict, List, Optional, Tuple, TypeVar -from attr import define, field, frozen +from attr import define, field -from 
fix_plugin_gcp.resources.base import GraphBuilder, GcpResource from fix_plugin_gcp.gcp_client import GcpApiSpec -from fixlib.baseresources import MetricName, MetricUnit, BaseResource, StatName +from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization +from fixlib.baseresources import MetricUnit, BaseResource, StatName from fixlib.durations import duration_str from fixlib.json import from_json from fixlib.json_bender import S, Bender, ForallBend, bend, K @@ -23,85 +23,6 @@ STAT_LIST: List[str] = ["ALIGN_MIN", "ALIGN_MEAN", "ALIGN_MAX"] -def identity(x: T) -> T: - return x - - -def compute_stats(values: List[float]) -> List[Tuple[float, Optional[StatName]]]: - return [(sum(values) / len(values), None)] - - -@frozen(kw_only=True) -class MetricNormalization: - unit: MetricUnit - # Use Tuple instead of Dict for stat_map because it should be immutable - stat_map: Tuple[Tuple[str, StatName], Tuple[str, StatName], Tuple[str, StatName]] = ( - ("ALIGN_MIN", StatName.min), - ("ALIGN_MEAN", StatName.avg), - ("ALIGN_MAX", StatName.max), - ) - normalize_value: Callable[[float], float] = identity - - compute_stats: Callable[[List[float]], List[Tuple[float, Optional[StatName]]]] = compute_stats - - def get_stat_value(self, key: str) -> Optional[StatName]: - """ - Get the value from stat_map based on the given key. - - Args: - key: The key to search for in the stat_map. - - Returns: - The corresponding value from stat_map. - """ - for stat_key, value in self.stat_map: - if stat_key == key: - return value - return None - - -@define(hash=True, frozen=True) -class GcpMonitoringQuery: - metric_name: Union[str, MetricName] # final name of the metric - query_name: str # name of the metric (e.g., GCP metric type) - period: timedelta # period of the metric - ref_id: str # A unique identifier for the resource, formatted as `{resource_kind}/{resource_id}/{resource_region}`. - # Example: "gcp_instance/12345/us-central1". This is used to uniquely reference resources across kinds and regions. 
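# Illustrative sketch of the metric_id derivation used by the relocated
# GcpMonitoringQuery.create above (sorting the filters keeps the id stable
# across runs); the input values below are made up for the example:
from typing import Dict

def make_metric_id(query_name: str, ref_id: str, stat: str, metric_filters: Dict[str, str]) -> str:
    filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted(metric_filters.items()))
    return f"{query_name}/{ref_id}/{stat}{filter_suffix}"

print(make_metric_id(
    "compute.googleapis.com/instance/cpu/utilization",
    "gcp_instance/12345/us-central1",
    "ALIGN_MEAN",
    {"metric.labels.instance_name": "my-vm"},
))
# compute.googleapis.com/instance/cpu/utilization/gcp_instance/12345/us-central1/ALIGN_MEAN/metric.labels.instance_name=my-vm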
- metric_id: str # unique metric identifier - stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN - project_id: str # GCP project name - normalization: Optional[MetricNormalization] = None # normalization info - metric_filters: Optional[Tuple[Tuple[str, str], ...]] = None # Immutable structure - - @staticmethod - def create( - *, - query_name: str, - period: timedelta, - ref_id: str, - metric_name: Union[str, MetricName], - stat: str, - project_id: str, - metric_filters: Dict[str, str], - normalization: Optional[MetricNormalization] = None, - ) -> "GcpMonitoringQuery": - sorted_filters = sorted(metric_filters.items()) - filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted_filters) - metric_id = f"{query_name}/{ref_id}/{stat}{filter_suffix}" - immutable_filters = tuple(sorted_filters) - return GcpMonitoringQuery( - metric_name=metric_name, - query_name=query_name, - period=period, - ref_id=ref_id, - metric_id=metric_id, - stat=stat, - normalization=normalization, - project_id=project_id, - metric_filters=immutable_filters, - ) - - @define(eq=False, slots=False) class GcpMonitoringMetricData: kind: ClassVar[str] = "gcp_monitoring_metric_data" @@ -209,7 +130,7 @@ def _query_for_chunk( # Add additional filters if query.metric_filters: - filters.extend(f'{key} = "{value}"' for key, value in query.metric_filters) + filters.extend(f'{key} = "{value}"' for key, value in query.metric_filters.items()) # Join filters with " AND " to form the final filter string local_api_spec.request_parameter["filter"] = " AND ".join(filters) @@ -247,9 +168,8 @@ def update_resource_metrics( continue name = metric_name + "_" + normalizer.unit value = normalizer.normalize_value(metric_value) - stat_name = maybe_stat_name or normalizer.get_stat_value(query.stat) - if stat_name: - resource._resource_usage[name][str(stat_name)] = value + stat_name = maybe_stat_name or normalizer.stat_map[query.stat] + resource._resource_usage[name][str(stat_name)] = value except KeyError as e: log.warning(f"An error occured while setting metric values: {e}") raise diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py index bb2caf3aed..96fb3590e9 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py +++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py @@ -5,9 +5,9 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder +from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.compute import GcpSslCertificate -from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, normalizer_factory, STAT_LIST +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_LIST from fixlib.baseresources import BaseDatabase, DatabaseInstanceStatus, MetricName, ModelReference from fixlib.json_bender import F, Bender, S, Bend, ForallBend, K, MapEnum, AsInt from fixlib.types import Json diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py index ce18bb3ada..02129e5f0f 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -4,10 +4,10 @@ import os from typing import List -from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion +from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion, GcpMonitoringQuery from fix_plugin_gcp.resources.compute import * from 
fix_plugin_gcp.resources.billing import GcpSku -from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, GcpMonitoringMetricData, update_resource_metrics +from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData, update_resource_metrics from fixlib.threading import ExecutorQueue from fixlib.baseresources import InstanceStatus diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py index 7f7c8c052a..3b7e3eb3a3 100644 --- a/plugins/gcp/test/test_monitoring.py +++ b/plugins/gcp/test/test_monitoring.py @@ -1,7 +1,7 @@ from datetime import timedelta, datetime, timezone -from fix_plugin_gcp.resources.base import GraphBuilder -from fix_plugin_gcp.resources.monitoring import GcpMonitoringQuery, GcpMonitoringMetricData, normalizer_factory +from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery +from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData, normalizer_factory from fixlib.baseresources import MetricName From bd9774dee83f59455102b3e8d3f98662db8c769b Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 09:17:50 +0000 Subject: [PATCH 28/43] feat: resolved issues --- plugins/gcp/fix_plugin_gcp/collector.py | 20 ++++++++----------- plugins/gcp/fix_plugin_gcp/resources/base.py | 6 +++--- .../fix_plugin_gcp/resources/monitoring.py | 4 ++-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index a67020a634..fbf407dea1 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -162,18 +162,14 @@ def get_last_run() -> Optional[datetime]: def collect_usage_metrics(self, builder: GraphBuilder) -> None: for resource in builder.graph.nodes: if isinstance(resource, GcpResource) and (mq := resource.collect_usage_metrics(builder)): - - def collect_and_set_metrics() -> None: - start_at = builder.created_at - builder.metrics_delta - region = cast(GcpRegion, resource.region()) - try: - rb = builder.for_region(region) - result = monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) - monitoring.update_resource_metrics(resource, result) - except Exception as e: - log.warning(f"Error occurred in region {region}: {e}") - - builder.submit_work(collect_and_set_metrics) + start_at = builder.created_at - builder.metrics_delta + region = cast(GcpRegion, resource.region()) + try: + rb = builder.for_region(region) + result = monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) + monitoring.update_resource_metrics(resource, result) + except Exception as e: + log.warning(f"Error occurred in region {region}: {e}") def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index 366a13add1..d6833a4216 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -337,12 +337,12 @@ def for_region(self, region: GcpRegion) -> GraphBuilder: ) +STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} + + @frozen(kw_only=True) class MetricNormalization: unit: MetricUnit - stat_map: frozendict[str, StatName] = frozendict( - {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} - ) normalize_value: 
Callable[[float], float] = lambda x: x compute_stats: Callable[[List[float]], List[Tuple[float, Optional[StatName]]]] = lambda x: [(sum(x) / len(x), None)] diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index fe7a4be6bf..bb53c0a80f 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -8,7 +8,7 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization +from fix_plugin_gcp.resources.base import STAT_MAP, GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization from fixlib.baseresources import MetricUnit, BaseResource, StatName from fixlib.durations import duration_str from fixlib.json import from_json @@ -168,7 +168,7 @@ def update_resource_metrics( continue name = metric_name + "_" + normalizer.unit value = normalizer.normalize_value(metric_value) - stat_name = maybe_stat_name or normalizer.stat_map[query.stat] + stat_name = maybe_stat_name or STAT_MAP[query.stat] resource._resource_usage[name][str(stat_name)] = value except KeyError as e: log.warning(f"An error occured while setting metric values: {e}") From 69bf8b2e8ad776ff332261f7936374214faad515 Mon Sep 17 00:00:00 2001 From: Matthias Veit Date: Wed, 27 Nov 2024 11:02:31 +0100 Subject: [PATCH 29/43] move stat_map to monitoring --- plugins/gcp/fix_plugin_gcp/resources/base.py | 3 --- .../fix_plugin_gcp/resources/cloudfunctions.py | 6 +++--- plugins/gcp/fix_plugin_gcp/resources/compute.py | 16 ++++++++-------- .../gcp/fix_plugin_gcp/resources/monitoring.py | 11 ++++------- plugins/gcp/fix_plugin_gcp/resources/sqladmin.py | 8 ++++---- 5 files changed, 19 insertions(+), 25 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index d6833a4216..ed437ce60c 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -337,9 +337,6 @@ def for_region(self, region: GcpRegion) -> GraphBuilder: ) -STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} - - @frozen(kw_only=True) class MetricNormalization: unit: MetricUnit diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index 9f7f25c45e..1e83ab1c41 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -5,7 +5,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery -from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_LIST +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_MAP from fixlib.baseresources import BaseServerlessFunction, MetricName from fixlib.json_bender import Bender, S, Bend, ForallBend @@ -321,7 +321,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.type": "cloud_function", }, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Invocations), ] @@ -344,7 +344,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.type": "cloud_function", }, ) - for stat in 
STAT_LIST + for stat in STAT_MAP ] ) queries.extend( diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 45ad3ecb0b..221e0af28f 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -9,7 +9,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec, InternalZoneProp from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.billing import GcpSku -from fix_plugin_gcp.resources.monitoring import STAT_LIST, normalizer_factory +from fix_plugin_gcp.resources.monitoring import STAT_MAP, normalizer_factory from fixlib.baseresources import ( BaseAutoScalingGroup, BaseBucket, @@ -1221,7 +1221,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP ] ) @@ -1237,7 +1237,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite), @@ -1257,7 +1257,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id}, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite), @@ -3634,7 +3634,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP ] ) queries.extend( @@ -3649,7 +3649,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/network/received_bytes_count", MetricName.NetworkIn), ("compute.googleapis.com/instance/network/sent_bytes_count", MetricName.NetworkOut), @@ -3669,7 +3669,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite), @@ -3689,7 +3689,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ 
("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite), diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index bb53c0a80f..532fa05f14 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -8,7 +8,7 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import STAT_MAP, GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization +from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization from fixlib.baseresources import MetricUnit, BaseResource, StatName from fixlib.durations import duration_str from fixlib.json import from_json @@ -18,9 +18,9 @@ service_name = "monitoring" log = logging.getLogger("fix.plugins.gcp") T = TypeVar("T") +V = TypeVar("V", bound=BaseResource) - -STAT_LIST: List[str] = ["ALIGN_MIN", "ALIGN_MEAN", "ALIGN_MAX"] +STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} @define(eq=False, slots=False) @@ -147,9 +147,6 @@ def _query_for_chunk( raise e -V = TypeVar("V", bound=BaseResource) - - def update_resource_metrics( resource: GcpResource, monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], @@ -171,7 +168,7 @@ def update_resource_metrics( stat_name = maybe_stat_name or STAT_MAP[query.stat] resource._resource_usage[name][str(stat_name)] = value except KeyError as e: - log.warning(f"An error occured while setting metric values: {e}") + log.warning(f"An error occurred while setting metric values: {e}") raise diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py index 96fb3590e9..921bb37b5c 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py +++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py @@ -7,7 +7,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.compute import GcpSslCertificate -from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_LIST +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_MAP from fixlib.baseresources import BaseDatabase, DatabaseInstanceStatus, MetricName, ModelReference from fixlib.json_bender import F, Bender, S, Bend, ForallBend, K, MapEnum, AsInt from fixlib.types import Json @@ -786,7 +786,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_LIST + for stat in STAT_MAP ] ) queries.extend( @@ -804,7 +804,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ ("cloudsql.googleapis.com/database/network/connections", MetricName.DatabaseConnections), ("cloudsql.googleapis.com/database/network/sent_bytes_count", MetricName.NetworkBytesSent), @@ -827,7 +827,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_LIST + for stat in STAT_MAP for name, metric_name in [ 
("cloudsql.googleapis.com/database/disk/read_ops_count", MetricName.DiskRead), ("cloudsql.googleapis.com/database/disk/write_ops_count", MetricName.DiskWrite), From 0a7c326e9587b84b711acf0bc728158e0d8b21c3 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 10:01:12 +0000 Subject: [PATCH 30/43] fix: adjusted collection of cloudfunction metrics --- plugins/gcp/fix_plugin_gcp/resources/base.py | 3 +++ .../gcp/fix_plugin_gcp/resources/cloudfunctions.py | 1 + plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 12 ++++++------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index ed437ce60c..8dc3067a87 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -354,6 +354,7 @@ class GcpMonitoringQuery: project_id: str # GCP project name normalization: Optional[MetricNormalization] # normalization info metric_filters: frozendict[str, str] # filters for the metric + cross_series_reducer: str # default REDUCE_NONE @staticmethod def create( @@ -366,6 +367,7 @@ def create( project_id: str, metric_filters: Dict[str, str], normalization: Optional[MetricNormalization] = None, + cross_series_reducer: str = "REDUCE_NONE", ) -> "GcpMonitoringQuery": filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted(metric_filters.items())) metric_id = f"{query_name}/{ref_id}/{stat}{filter_suffix}" @@ -378,6 +380,7 @@ def create( normalization=normalization, project_id=project_id, metric_filters=frozendict(metric_filters), + cross_series_reducer=cross_series_reducer, ) diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index 1e83ab1c41..80d5170b9a 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -362,6 +362,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, "resource.type": "cloud_function", }, + cross_series_reducer="REDUCE_PERCENTILE_50", ) ] ) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 532fa05f14..b2fe5189f1 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -28,9 +28,7 @@ class GcpMonitoringMetricData: kind: ClassVar[str] = "gcp_monitoring_metric_data" mapping: ClassVar[Dict[str, Bender]] = { "metric_values": S("points") - >> ForallBend( - S("value", "doubleValue").or_else(S("value", "int64Value")).or_else(S("value", "distributionValue", "mean")) - ).or_else(K([])), + >> ForallBend(S("value", "doubleValue").or_else(S("value", "int64Value"))).or_else(K([])), "metric_kind": S("metricKind"), "value_type": S("valueType"), "metric_type": S("metric", "type"), @@ -79,14 +77,14 @@ def query_for( action="list", request_parameter={ "name": "projects/{project}", - "aggregation_crossSeriesReducer": "REDUCE_NONE", - "aggregation_groupByFields": "[]", + "aggregation_groupByFields": "", "interval_endTime": utc_str(end_time), "interval_startTime": utc_str(start_time), "view": "FULL", # Below parameters are intended to be set dynamically # "aggregation_alignmentPeriod": None, # "aggregation_perSeriesAligner": None, + # "aggregation_crossSeriesReducer": None, # "filter": None, }, request_parameter_in={"project"}, @@ -134,6 +132,7 @@ def _query_for_chunk( # Join 
filters with " AND " to form the final filter string local_api_spec.request_parameter["filter"] = " AND ".join(filters) + local_api_spec.request_parameter["aggregation_crossSeriesReducer"] = f"{query.cross_series_reducer}" local_api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" local_api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat @@ -213,7 +212,8 @@ def milliseconds(self) -> MetricNormalization: return MetricNormalization( unit=MetricUnit.Milliseconds, compute_stats=calculate_min_max_avg, - normalize_value=lambda x: round(x, ndigits=4), + # convert nanoseconds to milliseconds + normalize_value=lambda x: round(x / 1_000_000, ndigits=4), ) @cached_property From 21d575152c3b4704ccbfac75d9e56ba52feb2452 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 10:07:01 +0000 Subject: [PATCH 31/43] chore: removed unused --- plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index b2fe5189f1..c3d1b32cb8 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -9,7 +9,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization -from fixlib.baseresources import MetricUnit, BaseResource, StatName +from fixlib.baseresources import MetricUnit, StatName from fixlib.durations import duration_str from fixlib.json import from_json from fixlib.json_bender import S, Bender, ForallBend, bend, K @@ -18,7 +18,6 @@ service_name = "monitoring" log = logging.getLogger("fix.plugins.gcp") T = TypeVar("T") -V = TypeVar("V", bound=BaseResource) STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} From 6095f8e6bb6906003c74cbdba58f49f50e498512 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 12:04:51 +0000 Subject: [PATCH 32/43] feat: created tests for new implementation --- plugins/gcp/fix_plugin_gcp/collector.py | 31 ++++++--- plugins/gcp/fix_plugin_gcp/resources/base.py | 2 + .../fix_plugin_gcp/resources/monitoring.py | 39 ++++++----- plugins/gcp/test/test_compute.py | 68 +++++++++++-------- plugins/gcp/test/test_monitoring.py | 11 ++- 5 files changed, 96 insertions(+), 55 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index fbf407dea1..128052e45e 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,7 +1,7 @@ import logging -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone -from typing import Type, List, Any, Optional, cast +from typing import Tuple, Type, List, Any, Optional, cast, Dict from fix_plugin_gcp.config import GcpConfig from fix_plugin_gcp.gcp_client import GcpApiSpec @@ -160,16 +160,31 @@ def get_last_run() -> Optional[datetime]: log.info(f"[GCP:{self.project.id}] Collecting resources done.") def collect_usage_metrics(self, builder: GraphBuilder) -> None: + metric_data_futures = [] + mq_lookup = {} + resources_map = {} + result: Dict[monitoring.GcpMonitoringQuery, monitoring.GcpMonitoringMetricData] = {} for resource in builder.graph.nodes: if isinstance(resource, GcpResource) and (mq := 
resource.collect_usage_metrics(builder)): + mq_lookup.update({q.metric_id: q for q in mq}) start_at = builder.created_at - builder.metrics_delta region = cast(GcpRegion, resource.region()) - try: - rb = builder.for_region(region) - result = monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) - monitoring.update_resource_metrics(resource, result) - except Exception as e: - log.warning(f"Error occurred in region {region}: {e}") + rb = builder.for_region(region) + resources_map[f"{resource.kind}/{resource.id}/{resource.region().id}"] = resource + metric_data_futures.extend( + monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) + ) + + for metric_data in as_completed(metric_data_futures): + try: + metric_query_result: List[Tuple[str, monitoring.GcpMonitoringMetricData]] = metric_data.result() + for metric_id, metric in metric_query_result: + if metric is not None and metric_id is not None: + result[mq_lookup[metric_id]] = metric + except Exception as e: + log.warning(f"An error occurred while processing a metric query: {e}") + + monitoring.update_resource_metrics(resources_map, result) def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index 8dc3067a87..c1f3b53cf0 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -349,6 +349,7 @@ class GcpMonitoringQuery: metric_name: MetricName # final name of the metric query_name: str # name of the metric (e.g., GCP metric type) period: timedelta # period of the metric + ref_id: str # unique id of the resource metric_id: str # unique metric identifier stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN project_id: str # GCP project name @@ -375,6 +376,7 @@ def create( metric_name=metric_name, query_name=query_name, period=period, + ref_id=ref_id, metric_id=metric_id, stat=stat, normalization=normalization, diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index c3d1b32cb8..28eb55fd04 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,5 +1,5 @@ import logging -from concurrent.futures import as_completed +from concurrent.futures import Future from copy import deepcopy from datetime import datetime from functools import cached_property @@ -8,8 +8,8 @@ from attr import define, field from fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import GraphBuilder, GcpResource, GcpMonitoringQuery, MetricNormalization -from fixlib.baseresources import MetricUnit, StatName +from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery, MetricNormalization +from fixlib.baseresources import MetricUnit, StatName, BaseResource from fixlib.durations import duration_str from fixlib.json import from_json from fixlib.json_bender import S, Bender, ForallBend, bend, K @@ -18,6 +18,7 @@ service_name = "monitoring" log = logging.getLogger("fix.plugins.gcp") T = TypeVar("T") +V = TypeVar("V", bound=BaseResource) STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} @@ -58,15 +59,15 @@ def query_for( queries: List[GcpMonitoringQuery], start_time: datetime, end_time: datetime, - ) -> 
"Dict[GcpMonitoringQuery, GcpMonitoringMetricData]": + ) -> List[Future[List[Tuple[str, "GcpMonitoringMetricData"]]]]: if builder.region: log.info( f"[{builder.region.safe_name}|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics." ) else: log.info(f"[global|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics.") - lookup = {q.metric_id: q for q in queries} - result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} + # lookup = {q.metric_id: q for q in queries} + # result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} futures = [] api_spec = GcpApiSpec( @@ -76,7 +77,6 @@ def query_for( action="list", request_parameter={ "name": "projects/{project}", - "aggregation_groupByFields": "", "interval_endTime": utc_str(end_time), "interval_startTime": utc_str(start_time), "view": "FULL", @@ -99,16 +99,16 @@ def query_for( ) futures.append(future) # Retrieve results from submitted queries and populate the result dictionary - for future in as_completed(futures): - try: - metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = future.result() - for metric_id, metric in metric_query_result: - if metric is not None and metric_id is not None: - result[lookup[metric_id]] = metric - except Exception as e: - log.warning(f"An error occurred while processing a metric query: {e}") - raise e - return result + # for future in as_completed(futures): + # try: + # metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = future.result() + # for metric_id, metric in metric_query_result: + # if metric is not None and metric_id is not None: + # result[lookup[metric_id]] = metric + # except Exception as e: + # log.warning(f"An error occurred while processing a metric query: {e}") + # raise e + return futures @staticmethod def _query_for_chunk( @@ -146,10 +146,13 @@ def _query_for_chunk( def update_resource_metrics( - resource: GcpResource, + resources_map: Dict[str, V], monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], ) -> None: for query, metric in monitoring_metric_result.items(): + resource = resources_map.get(query.ref_id) + if resource is None: + continue if len(metric.metric_values) == 0: continue normalizer = query.normalization diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py index 02129e5f0f..7a8fbb0271 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -1,4 +1,4 @@ -from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import timedelta, datetime import json import os @@ -183,34 +183,46 @@ def test_gcp_instance_usage_metrics(random_builder: GraphBuilder) -> None: gcp_instance.instance_status = InstanceStatus.RUNNING random_builder.region = GcpRegion(id="us-east1", name="us-east1") - queries = gcp_instance.collect_usage_metrics(random_builder) + random_builder.created_at = datetime(2020, 5, 30, 15, 45, 30) + random_builder.metrics_delta = timedelta(hours=1) - # simulates the `collect_usage_metrics` method found in `GcpAccountCollector`. 
- def collect_and_set_metrics(start_at: datetime, _: GcpRegion, queries: List[GcpMonitoringQuery]) -> None: - with ThreadPoolExecutor(max_workers=1) as executor: - queue = ExecutorQueue(executor, tasks_per_key=lambda _: 1, name="test") - g_builder = GraphBuilder( - random_builder.graph, - random_builder.cloud, - random_builder.project, - AnonymousCredentials(), # type: ignore - queue, - random_builder.core_feedback, - random_builder.error_accumulator, - GcpRegion(id="global", name="global"), - random_builder.config, - last_run_started_at=random_builder.last_run_started_at, - ) - result = GcpMonitoringMetricData.query_for(g_builder, queries, start_at, start_at + timedelta(hours=2)) - update_resource_metrics(gcp_instance, result) - - start = datetime(2020, 5, 30, 15, 45, 30) - - collect_and_set_metrics(start, GcpRegion(id="us-east-1", name="us-east-1"), queries) - - assert gcp_instance._resource_usage["cpu_utilization_percent"]["avg"] > 0.0 - assert gcp_instance._resource_usage["network_in_count"]["avg"] > 0.0 - assert gcp_instance._resource_usage["disk_read_count"]["avg"] > 0.0 + queries = gcp_instance.collect_usage_metrics(random_builder) + mq_lookup = {q.metric_id: q for q in queries} + resources_map = {f"{gcp_instance.kind}/{gcp_instance.id}/{gcp_instance.region().id}": gcp_instance} + with ThreadPoolExecutor(max_workers=1) as executor: + queue = ExecutorQueue(executor, tasks_per_key=lambda _: 10, name="test") + g_builder = GraphBuilder( + random_builder.graph, + random_builder.cloud, + random_builder.project, + AnonymousCredentials(), # type: ignore + queue, + random_builder.core_feedback, + random_builder.error_accumulator, + GcpRegion(id="global", name="global"), + random_builder.config, + last_run_started_at=random_builder.last_run_started_at, + ) + metric_data_futures = GcpMonitoringMetricData.query_for( + g_builder, queries, random_builder.created_at, random_builder.created_at + random_builder.metrics_delta + ) + + result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} + + for metric_data in as_completed(metric_data_futures): + try: + metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = metric_data.result() + for metric_id, metric in metric_query_result: + if metric is not None and metric_id is not None: + result[mq_lookup[metric_id]] = metric + except Exception as e: + log.warning(f"An error occurred while processing a metric query: {e}") + + update_resource_metrics(resources_map, result) + + assert gcp_instance._resource_usage["cpu_utilization_percent"]["avg"] > 0.0 + assert gcp_instance._resource_usage["network_in_count"]["avg"] > 0.0 + assert gcp_instance._resource_usage["disk_read_count"]["avg"] > 0.0 def test_machine_type_ondemand_cost(random_builder: GraphBuilder) -> None: diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py index 3b7e3eb3a3..2e58583961 100644 --- a/plugins/gcp/test/test_monitoring.py +++ b/plugins/gcp/test/test_monitoring.py @@ -1,4 +1,6 @@ +from concurrent.futures import as_completed from datetime import timedelta, datetime, timezone +from typing import List, Tuple, Dict from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData, normalizer_factory @@ -28,6 +30,13 @@ def test_metric(random_builder: GraphBuilder) -> None: project_id=random_builder.project.id, metric_filters={"metric.labels.instance_name": "random_instance", "resource.labels.zone": "global"}, ) - result = 
GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now) + queries = GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now) + mq_lookup = {q.metric_id: q for q in [read, write]} + result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} + for metric_data in as_completed(queries): + metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = metric_data.result() + for metric_id, metric in metric_query_result: + if metric is not None and metric_id is not None: + result[mq_lookup[metric_id]] = metric assert all(value > 0 for value in result[read].metric_values or []) assert all(value > 0 for value in result[write].metric_values or []) From 68a006a6fc1863b8504d53e44f5cce3768b244d4 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 12:07:57 +0000 Subject: [PATCH 33/43] chore: removed comments --- plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 28eb55fd04..9fd9404e34 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -66,8 +66,7 @@ def query_for( ) else: log.info(f"[global|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics.") - # lookup = {q.metric_id: q for q in queries} - # result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} + futures = [] api_spec = GcpApiSpec( @@ -98,16 +97,7 @@ def query_for( query, ) futures.append(future) - # Retrieve results from submitted queries and populate the result dictionary - # for future in as_completed(futures): - # try: - # metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = future.result() - # for metric_id, metric in metric_query_result: - # if metric is not None and metric_id is not None: - # result[lookup[metric_id]] = metric - # except Exception as e: - # log.warning(f"An error occurred while processing a metric query: {e}") - # raise e + return futures @staticmethod From 508a01e46784a70f6927be32124ee4c937877610 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 12:24:01 +0000 Subject: [PATCH 34/43] fix: fixed cloudfunctions metric fetching --- plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py | 2 +- plugins/gcp/test/test_compute.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index 80d5170b9a..189805f0f2 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -355,7 +355,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=MetricName.Duration, normalization=normalizer_factory.milliseconds, - stat="ALIGN_NONE", + stat="ALIGN_DELTA", project_id=builder.project.id, metric_filters={ "resource.labels.function_name": self.resource_raw_name, diff --git a/plugins/gcp/test/test_compute.py b/plugins/gcp/test/test_compute.py index 7a8fbb0271..8ace60e558 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -2,7 +2,7 @@ from datetime import timedelta, datetime import json import os -from typing import List +from typing import List, Dict, Tuple from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion, GcpMonitoringQuery 
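# Illustrative sketch: the cloud function duration metric above is fetched with
# ALIGN_DELTA and normalized by normalizer_factory.milliseconds, which (as
# introduced earlier in this series) converts nanoseconds to milliseconds:
def normalize_ms(nanos: float) -> float:
    return round(nanos / 1_000_000, ndigits=4)

print(normalize_ms(2_500_000))  # 2.5 milliseconds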
from fix_plugin_gcp.resources.compute import * From 364f8cd3da8d004e92d03d5662513912d81f6510 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 13:44:38 +0000 Subject: [PATCH 35/43] feat: finished new implementation and adjusted tests --- plugins/gcp/fix_plugin_gcp/collector.py | 21 +------ plugins/gcp/fix_plugin_gcp/resources/base.py | 4 +- .../resources/cloudfunctions.py | 7 +-- .../gcp/fix_plugin_gcp/resources/compute.py | 10 +-- .../fix_plugin_gcp/resources/monitoring.py | 62 ++++++++----------- plugins/gcp/test/test_compute.py | 36 ++++------- plugins/gcp/test/test_monitoring.py | 23 +++---- 7 files changed, 57 insertions(+), 106 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 128052e45e..8d19e65e9d 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -160,31 +160,12 @@ def get_last_run() -> Optional[datetime]: log.info(f"[GCP:{self.project.id}] Collecting resources done.") def collect_usage_metrics(self, builder: GraphBuilder) -> None: - metric_data_futures = [] - mq_lookup = {} - resources_map = {} - result: Dict[monitoring.GcpMonitoringQuery, monitoring.GcpMonitoringMetricData] = {} for resource in builder.graph.nodes: if isinstance(resource, GcpResource) and (mq := resource.collect_usage_metrics(builder)): - mq_lookup.update({q.metric_id: q for q in mq}) start_at = builder.created_at - builder.metrics_delta region = cast(GcpRegion, resource.region()) rb = builder.for_region(region) - resources_map[f"{resource.kind}/{resource.id}/{resource.region().id}"] = resource - metric_data_futures.extend( - monitoring.GcpMonitoringMetricData.query_for(rb, mq, start_at, builder.created_at) - ) - - for metric_data in as_completed(metric_data_futures): - try: - metric_query_result: List[Tuple[str, monitoring.GcpMonitoringMetricData]] = metric_data.result() - for metric_id, metric in metric_query_result: - if metric is not None and metric_id is not None: - result[mq_lookup[metric_id]] = metric - except Exception as e: - log.warning(f"An error occurred while processing a metric query: {e}") - - monitoring.update_resource_metrics(resources_map, result) + monitoring.GcpMonitoringMetricData.query_for(rb, resource, mq, start_at, builder.created_at) def remove_unconnected_nodes(self, builder: GraphBuilder) -> None: def rm_leaf_nodes(clazz: Any, ignore_kinds: Optional[Type[Any]] = None) -> None: diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index c1f3b53cf0..a882eceaf6 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -353,7 +353,7 @@ class GcpMonitoringQuery: metric_id: str # unique metric identifier stat: str # aggregation type, supports ALIGN_MEAN, ALIGN_MAX, ALIGN_MIN project_id: str # GCP project name - normalization: Optional[MetricNormalization] # normalization info + normalization: MetricNormalization # normalization info metric_filters: frozendict[str, str] # filters for the metric cross_series_reducer: str # default REDUCE_NONE @@ -367,7 +367,7 @@ def create( stat: str, project_id: str, metric_filters: Dict[str, str], - normalization: Optional[MetricNormalization] = None, + normalization: MetricNormalization, cross_series_reducer: str = "REDUCE_NONE", ) -> "GcpMonitoringQuery": filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted(metric_filters.items())) diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py 
b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index 189805f0f2..ec1b87b582 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -307,10 +307,10 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer queries.extend( [ GcpMonitoringQuery.create( - query_name=name, + query_name="cloudfunctions.googleapis.com/function/execution_count", period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", - metric_name=metric_name, + metric_name=MetricName.Invocations, normalization=normalizer_factory.count, stat=stat, project_id=builder.project.id, @@ -322,9 +322,6 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer }, ) for stat in STAT_MAP - for name, metric_name in [ - ("cloudfunctions.googleapis.com/function/execution_count", MetricName.Invocations), - ] ] ) queries.extend( diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 221e0af28f..3e2da13d32 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -1232,7 +1232,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=metric_name, - normalization=normalizer_factory.count, + normalization=normalizer_factory.iops, stat=stat, project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id}, @@ -1252,7 +1252,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=metric_name, - normalization=normalizer_factory.count, + normalization=normalizer_factory.bytes, stat=stat, project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id}, @@ -3644,7 +3644,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=metric_name, - normalization=normalizer_factory.count, + normalization=normalizer_factory.bytes, stat=stat, project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, @@ -3664,7 +3664,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=metric_name, - normalization=normalizer_factory.count, + normalization=normalizer_factory.iops, stat=stat, project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, @@ -3684,7 +3684,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=metric_name, - normalization=normalizer_factory.count, + normalization=normalizer_factory.bytes, stat=stat, project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 9fd9404e34..3ec1d6a71f 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -8,7 +8,7 @@ from attr import define, field from 
fix_plugin_gcp.gcp_client import GcpApiSpec -from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery, MetricNormalization +from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery, MetricNormalization, GcpResource from fixlib.baseresources import MetricUnit, StatName, BaseResource from fixlib.durations import duration_str from fixlib.json import from_json @@ -56,10 +56,11 @@ def called_collect_apis(cls) -> List[GcpApiSpec]: @staticmethod def query_for( builder: GraphBuilder, + resource: GcpResource, queries: List[GcpMonitoringQuery], start_time: datetime, end_time: datetime, - ) -> List[Future[List[Tuple[str, "GcpMonitoringMetricData"]]]]: + ) -> None: if builder.region: log.info( f"[{builder.region.safe_name}|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics." @@ -67,8 +68,6 @@ def query_for( else: log.info(f"[global|{start_time}|{duration_str(end_time - start_time)}] Query for {len(queries)} metrics.") - futures = [] - api_spec = GcpApiSpec( service="monitoring", version="v3", @@ -90,23 +89,21 @@ def query_for( ) for query in queries: - future = builder.submit_work( + builder.submit_work( GcpMonitoringMetricData._query_for_chunk, builder, + resource, api_spec, query, ) - futures.append(future) - - return futures @staticmethod def _query_for_chunk( builder: GraphBuilder, + resource: GcpResource, api_spec: GcpApiSpec, query: GcpMonitoringQuery, - ) -> "List[Tuple[str, GcpMonitoringMetricData]]": - query_result = [] + ) -> None: local_api_spec = deepcopy(api_spec) # Base filter @@ -129,38 +126,29 @@ def _query_for_chunk( part = builder.client.list(local_api_spec) for single in part: metric = from_json(bend(GcpMonitoringMetricData.mapping, single), GcpMonitoringMetricData) - query_result.append((query.metric_id, metric)) - return query_result + update_resource_metrics(resource, query, metric) except Exception as e: - raise e + log.warning(f"An error occurred while processing a metric data: {e}") def update_resource_metrics( - resources_map: Dict[str, V], - monitoring_metric_result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData], + resource: GcpResource, + query: GcpMonitoringQuery, + metric: GcpMonitoringMetricData, ) -> None: - for query, metric in monitoring_metric_result.items(): - resource = resources_map.get(query.ref_id) - if resource is None: - continue - if len(metric.metric_values) == 0: - continue - normalizer = query.normalization - if not normalizer: - continue - - for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values): - try: - metric_name = query.metric_name - if not metric_name: - continue - name = metric_name + "_" + normalizer.unit - value = normalizer.normalize_value(metric_value) - stat_name = maybe_stat_name or STAT_MAP[query.stat] - resource._resource_usage[name][str(stat_name)] = value - except KeyError as e: - log.warning(f"An error occurred while setting metric values: {e}") - raise + normalizer = query.normalization + for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values): + try: + metric_name = query.metric_name + if not metric_name: + continue + name = metric_name + "_" + normalizer.unit + value = normalizer.normalize_value(metric_value) + stat_name = maybe_stat_name or STAT_MAP[query.stat] + resource._resource_usage[name][str(stat_name)] = value + except KeyError as e: + log.warning(f"An error occurred while setting metric values: {e}") + raise class NormalizerFactory: diff --git a/plugins/gcp/test/test_compute.py 
b/plugins/gcp/test/test_compute.py index 8ace60e558..eb1046f35e 100644 --- a/plugins/gcp/test/test_compute.py +++ b/plugins/gcp/test/test_compute.py @@ -1,13 +1,13 @@ -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor from datetime import timedelta, datetime import json import os -from typing import List, Dict, Tuple +from typing import List -from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion, GcpMonitoringQuery +from fix_plugin_gcp.resources.base import GraphBuilder, GcpRegion from fix_plugin_gcp.resources.compute import * from fix_plugin_gcp.resources.billing import GcpSku -from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData, update_resource_metrics +from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData from fixlib.threading import ExecutorQueue from fixlib.baseresources import InstanceStatus @@ -187,8 +187,6 @@ def test_gcp_instance_usage_metrics(random_builder: GraphBuilder) -> None: random_builder.metrics_delta = timedelta(hours=1) queries = gcp_instance.collect_usage_metrics(random_builder) - mq_lookup = {q.metric_id: q for q in queries} - resources_map = {f"{gcp_instance.kind}/{gcp_instance.id}/{gcp_instance.region().id}": gcp_instance} with ThreadPoolExecutor(max_workers=1) as executor: queue = ExecutorQueue(executor, tasks_per_key=lambda _: 10, name="test") g_builder = GraphBuilder( @@ -203,26 +201,18 @@ def test_gcp_instance_usage_metrics(random_builder: GraphBuilder) -> None: random_builder.config, last_run_started_at=random_builder.last_run_started_at, ) - metric_data_futures = GcpMonitoringMetricData.query_for( - g_builder, queries, random_builder.created_at, random_builder.created_at + random_builder.metrics_delta + GcpMonitoringMetricData.query_for( + g_builder, + gcp_instance, + queries, + random_builder.created_at, + random_builder.created_at + random_builder.metrics_delta, ) - - result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} - - for metric_data in as_completed(metric_data_futures): - try: - metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = metric_data.result() - for metric_id, metric in metric_query_result: - if metric is not None and metric_id is not None: - result[mq_lookup[metric_id]] = metric - except Exception as e: - log.warning(f"An error occurred while processing a metric query: {e}") - - update_resource_metrics(resources_map, result) + g_builder.executor.wait_for_submitted_work() assert gcp_instance._resource_usage["cpu_utilization_percent"]["avg"] > 0.0 - assert gcp_instance._resource_usage["network_in_count"]["avg"] > 0.0 - assert gcp_instance._resource_usage["disk_read_count"]["avg"] > 0.0 + assert gcp_instance._resource_usage["network_in_bytes"]["avg"] > 0.0 + assert gcp_instance._resource_usage["disk_read_iops"]["avg"] > 0.0 def test_machine_type_ondemand_cost(random_builder: GraphBuilder) -> None: diff --git a/plugins/gcp/test/test_monitoring.py b/plugins/gcp/test/test_monitoring.py index 2e58583961..374fe9d4c2 100644 --- a/plugins/gcp/test/test_monitoring.py +++ b/plugins/gcp/test/test_monitoring.py @@ -1,9 +1,8 @@ -from concurrent.futures import as_completed from datetime import timedelta, datetime, timezone -from typing import List, Tuple, Dict from fix_plugin_gcp.resources.base import GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.monitoring import GcpMonitoringMetricData, normalizer_factory +from fix_plugin_gcp.resources.compute import GcpInstance from fixlib.baseresources import MetricName 
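# NOTE (editor): sketch of the flow after this patch, using only names visible
# in the series; treat the exact wiring as an approximation. query_for() now
# receives the resource, returns None, and each submitted chunk writes
# normalized values straight onto the resource via update_resource_metrics():
#
#     GcpMonitoringMetricData.query_for(builder, resource, queries, start, end)
#     builder.executor.wait_for_submitted_work()
#     # resource._resource_usage now holds entries such as
#     # {"disk_read_iops": {"min": ..., "avg": ..., "max": ...}}
#
# Dropping the shared result dictionary removes the cross-thread mutable state
# that the earlier as_completed() loop had to manage.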
@@ -15,7 +14,7 @@ def test_metric(random_builder: GraphBuilder) -> None: period=timedelta(hours=1), ref_id="gcp_instance/random_instance/global", metric_name=MetricName.DiskRead, - normalization=normalizer_factory.count, + normalization=normalizer_factory.iops, stat="ALIGN_MIN", project_id=random_builder.project.id, metric_filters={"metric.labels.instance_name": "random_instance", "resource.labels.zone": "global"}, @@ -25,18 +24,14 @@ def test_metric(random_builder: GraphBuilder) -> None: period=timedelta(hours=1), ref_id="gcp_instance/random_instance/global", metric_name=MetricName.DiskWrite, - normalization=normalizer_factory.count, + normalization=normalizer_factory.iops, stat="ALIGN_MIN", project_id=random_builder.project.id, metric_filters={"metric.labels.instance_name": "random_instance", "resource.labels.zone": "global"}, ) - queries = GcpMonitoringMetricData.query_for(random_builder, [read, write], earlier, now) - mq_lookup = {q.metric_id: q for q in [read, write]} - result: Dict[GcpMonitoringQuery, GcpMonitoringMetricData] = {} - for metric_data in as_completed(queries): - metric_query_result: List[Tuple[str, GcpMonitoringMetricData]] = metric_data.result() - for metric_id, metric in metric_query_result: - if metric is not None and metric_id is not None: - result[mq_lookup[metric_id]] = metric - assert all(value > 0 for value in result[read].metric_values or []) - assert all(value > 0 for value in result[write].metric_values or []) + gcp_instance = GcpInstance(id="random_instance") + GcpMonitoringMetricData.query_for(random_builder, gcp_instance, [read, write], earlier, now) + random_builder.executor.wait_for_submitted_work() + usages = list(gcp_instance._resource_usage.keys()) + assert usages[0] == f"{read.metric_name}_{read.normalization.unit}" + assert usages[1] == f"{write.metric_name}_{write.normalization.unit}" From 8d41cf893277753c7bbf0a5e07ca5faaa29b18c5 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 13:50:24 +0000 Subject: [PATCH 36/43] fixed linter --- plugins/gcp/fix_plugin_gcp/collector.py | 4 ++-- plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 8d19e65e9d..71832f9934 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -1,7 +1,7 @@ import logging -from concurrent.futures import ThreadPoolExecutor, as_completed +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timezone -from typing import Tuple, Type, List, Any, Optional, cast, Dict +from typing import Type, List, Any, Optional, cast from fix_plugin_gcp.config import GcpConfig from fix_plugin_gcp.gcp_client import GcpApiSpec diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 3ec1d6a71f..0b4129e4a8 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -136,6 +136,8 @@ def update_resource_metrics( query: GcpMonitoringQuery, metric: GcpMonitoringMetricData, ) -> None: + if len(metric.metric_values) == 0: + return normalizer = query.normalization for metric_value, maybe_stat_name in normalizer.compute_stats(metric.metric_values): try: From e6690858d8066024c0d1637764e38b04fd2ceb16 Mon Sep 17 00:00:00 2001 From: Kirill Date: Wed, 27 Nov 2024 14:22:04 +0000 Subject: [PATCH 37/43] fixed syntax test --- 
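Editor's note, kept after the "---" marker so that git-am ignores it: patch 36
and this one are pure cleanup (imports flagged by the linter, then the now
unused Future import). The next patch generalizes stat handling by mapping GCP
per-series aligners to fixlib stat names, with percentile aligners standing in
for min/avg/max on latency-style metrics. A minimal sketch of that lookup,
assuming the map names introduced later in this series and approximating the
fallback in update_resource_metrics:

    from typing import Optional

    from fixlib.baseresources import StatName

    STANDART_STAT_MAP = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max}
    PERCENTILE_STAT_MAP = {
        "ALIGN_PERCENTILE_05": StatName.min,
        "ALIGN_PERCENTILE_50": StatName.avg,
        "ALIGN_PERCENTILE_99": StatName.max,
    }

    def stat_name_for(aligner: str) -> Optional[StatName]:
        # check the standard aligners first, then fall back to the percentile ones
        return STANDART_STAT_MAP.get(aligner) or PERCENTILE_STAT_MAP.get(aligner)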
plugins/gcp/fix_plugin_gcp/resources/monitoring.py | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 0b4129e4a8..a7b065d8ec 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,5 +1,4 @@ import logging -from concurrent.futures import Future from copy import deepcopy from datetime import datetime from functools import cached_property From 86f6435cd9272d8ff3efb6c22182eeae993517a7 Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 28 Nov 2024 13:35:18 +0000 Subject: [PATCH 38/43] feat: finished collecting metrics for lb --- plugins/gcp/fix_plugin_gcp/resources/base.py | 3 - .../resources/cloudfunctions.py | 13 ++-- .../gcp/fix_plugin_gcp/resources/compute.py | 67 +++++++++++++++---- .../fix_plugin_gcp/resources/monitoring.py | 39 +++++++---- .../gcp/fix_plugin_gcp/resources/sqladmin.py | 9 ++- 5 files changed, 92 insertions(+), 39 deletions(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index a882eceaf6..6f2508ad9a 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -355,7 +355,6 @@ class GcpMonitoringQuery: project_id: str # GCP project name normalization: MetricNormalization # normalization info metric_filters: frozendict[str, str] # filters for the metric - cross_series_reducer: str # default REDUCE_NONE @staticmethod def create( @@ -368,7 +367,6 @@ def create( project_id: str, metric_filters: Dict[str, str], normalization: MetricNormalization, - cross_series_reducer: str = "REDUCE_NONE", ) -> "GcpMonitoringQuery": filter_suffix = "/" + "/".join(f"{key}={value}" for key, value in sorted(metric_filters.items())) metric_id = f"{query_name}/{ref_id}/{stat}{filter_suffix}" @@ -382,7 +380,6 @@ def create( normalization=normalization, project_id=project_id, metric_filters=frozendict(metric_filters), - cross_series_reducer=cross_series_reducer, ) diff --git a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py index ec1b87b582..857c2d3574 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py +++ b/plugins/gcp/fix_plugin_gcp/resources/cloudfunctions.py @@ -5,7 +5,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery -from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_MAP +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STANDART_STAT_MAP, PERCENTILE_STAT_MAP from fixlib.baseresources import BaseServerlessFunction, MetricName from fixlib.json_bender import Bender, S, Bend, ForallBend @@ -321,7 +321,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.type": "cloud_function", }, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP ] ) queries.extend( @@ -341,7 +341,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.type": "cloud_function", }, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP ] ) queries.extend( @@ -351,16 +351,17 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=MetricName.Duration, - normalization=normalizer_factory.milliseconds, - stat="ALIGN_DELTA", + # 
convert nanoseconds to milliseconds + normalization=normalizer_factory.milliseconds(lambda x: round(x / 1_000_000, ndigits=4)), + stat=stat, project_id=builder.project.id, metric_filters={ "resource.labels.function_name": self.resource_raw_name, "resource.labels.region": self.region().id, "resource.type": "cloud_function", }, - cross_series_reducer="REDUCE_PERCENTILE_50", ) + for stat in PERCENTILE_STAT_MAP ] ) return queries diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 3e2da13d32..6d73195f50 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -9,7 +9,7 @@ from fix_plugin_gcp.gcp_client import GcpApiSpec, InternalZoneProp from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.billing import GcpSku -from fix_plugin_gcp.resources.monitoring import STAT_MAP, normalizer_factory +from fix_plugin_gcp.resources.monitoring import STANDART_STAT_MAP, PERCENTILE_STAT_MAP, normalizer_factory from fixlib.baseresources import ( BaseAutoScalingGroup, BaseBucket, @@ -1207,7 +1207,6 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None: def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: queries: List[GcpMonitoringQuery] = [] - queries = [] delta = builder.metrics_delta queries.extend( [ @@ -1221,7 +1220,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP ] ) @@ -1237,7 +1236,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite), @@ -1257,7 +1256,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.device_name": self.id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite), @@ -1732,9 +1731,52 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None: GcpTargetPool, ) builder.add_edge(self, clazz=target_classes, link=self.target) - self._collect_backends(builder) - def _collect_backends(self, graph_builder: GraphBuilder) -> None: + def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: + queries: List[GcpMonitoringQuery] = [] + delta = builder.metrics_delta + if not self.load_balancing_scheme: + return [] + lb_type = "external/regional" if "EXTERNAL" in self.load_balancing_scheme else "internal" + queries.extend( + [ + GcpMonitoringQuery.create( + query_name=f"loadbalancing.googleapis.com/https/{lb_type}/request_count", + period=delta, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", + metric_name=MetricName.RequestCount, + normalization=normalizer_factory.count_with_compute, + stat="ALIGN_RATE", + 
project_id=builder.project.id, + metric_filters={ + "resource.label.forwarding_rule_name": self.id, + "resource.labels.region": self.region().id, + }, + ) + ] + ) + queries.extend( + [ + GcpMonitoringQuery.create( + query_name=f"loadbalancing.googleapis.com/https/{lb_type}/backend_latencies", + period=delta, + ref_id=f"{self.kind}/{self.id}/{self.region().id}", + metric_name=MetricName.Latency, + normalization=normalizer_factory.milliseconds(), + stat=stat, + project_id=builder.project.id, + metric_filters={ + "resource.label.forwarding_rule_name": self.id, + "resource.labels.region": self.region().id, + }, + ) + for stat in PERCENTILE_STAT_MAP + ] + ) + + return queries + + def post_process(self, graph_builder: GraphBuilder, source: Json) -> None: if not self.target: return backend_services = graph_builder.nodes(clazz=GcpBackendService) @@ -1776,7 +1818,7 @@ def fetch_instances(group: str) -> None: if vm_id := item.get("instance"): self.backends.append(vm_id) except Exception as e: - log.warning(f"An error occured while setting backends property: {e}") + log.warning(f"An error occurred while setting backends property: {e}") graph_builder.submit_work(fetch_instances, backend.group) @@ -3620,7 +3662,6 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer if self.instance_status != InstanceStatus.RUNNING: return [] queries: List[GcpMonitoringQuery] = [] - queries = [] delta = builder.metrics_delta queries.extend( [ @@ -3634,7 +3675,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP ] ) queries.extend( @@ -3649,7 +3690,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/network/received_bytes_count", MetricName.NetworkIn), ("compute.googleapis.com/instance/network/sent_bytes_count", MetricName.NetworkOut), @@ -3669,7 +3710,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_ops_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_ops_count", MetricName.DiskWrite), @@ -3689,7 +3730,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer project_id=builder.project.id, metric_filters={"metric.labels.instance_name": self.id, "resource.labels.zone": self.zone().id}, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("compute.googleapis.com/instance/disk/read_bytes_count", MetricName.DiskRead), ("compute.googleapis.com/instance/disk/write_bytes_count", MetricName.DiskWrite), diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index a7b065d8ec..0392f2dd60 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -1,8 +1,8 @@ import logging from copy import deepcopy from datetime 
import datetime -from functools import cached_property -from typing import ClassVar, Dict, List, Optional, Tuple, TypeVar +from functools import cached_property, lru_cache +from typing import ClassVar, Dict, List, Optional, Tuple, TypeVar, Callable from attr import define, field @@ -19,7 +19,16 @@ T = TypeVar("T") V = TypeVar("V", bound=BaseResource) -STAT_MAP: Dict[str, StatName] = {"ALIGN_MIN": StatName.min, "ALIGN_MEAN": StatName.avg, "ALIGN_MAX": StatName.max} +STANDART_STAT_MAP: Dict[str, StatName] = { + "ALIGN_MIN": StatName.min, + "ALIGN_MEAN": StatName.avg, + "ALIGN_MAX": StatName.max, +} +PERCENTILE_STAT_MAP: Dict[str, StatName] = { + "ALIGN_PERCENTILE_05": StatName.min, + "ALIGN_PERCENTILE_50": StatName.avg, + "ALIGN_PERCENTILE_99": StatName.max, +} @define(eq=False, slots=False) @@ -76,11 +85,11 @@ def query_for( "name": "projects/{project}", "interval_endTime": utc_str(end_time), "interval_startTime": utc_str(start_time), + "aggregation_crossSeriesReducer": "REDUCE_NONE", "view": "FULL", # Below parameters are intended to be set dynamically # "aggregation_alignmentPeriod": None, # "aggregation_perSeriesAligner": None, - # "aggregation_crossSeriesReducer": None, # "filter": None, }, request_parameter_in={"project"}, @@ -117,7 +126,6 @@ def _query_for_chunk( # Join filters with " AND " to form the final filter string local_api_spec.request_parameter["filter"] = " AND ".join(filters) - local_api_spec.request_parameter["aggregation_crossSeriesReducer"] = f"{query.cross_series_reducer}" local_api_spec.request_parameter["aggregation_alignmentPeriod"] = f"{int(query.period.total_seconds())}s" local_api_spec.request_parameter["aggregation_perSeriesAligner"] = query.stat @@ -145,8 +153,9 @@ def update_resource_metrics( continue name = metric_name + "_" + normalizer.unit value = normalizer.normalize_value(metric_value) - stat_name = maybe_stat_name or STAT_MAP[query.stat] - resource._resource_usage[name][str(stat_name)] = value + stat_name = maybe_stat_name or STANDART_STAT_MAP.get(query.stat) or PERCENTILE_STAT_MAP.get(query.stat) + if stat_name: + resource._resource_usage[name][str(stat_name)] = value except KeyError as e: log.warning(f"An error occurred while setting metric values: {e}") raise @@ -160,6 +169,14 @@ def count(self) -> MetricNormalization: normalize_value=lambda x: round(x, ndigits=4), ) + @cached_property + def count_with_compute(self) -> MetricNormalization: + return MetricNormalization( + unit=MetricUnit.Count, + compute_stats=calculate_min_max_avg, + normalize_value=lambda x: round(x, ndigits=4), + ) + @cached_property def bytes(self) -> MetricNormalization: return MetricNormalization( @@ -188,13 +205,11 @@ def seconds(self) -> MetricNormalization: normalize_value=lambda x: round(x, ndigits=4), ) - @cached_property - def milliseconds(self) -> MetricNormalization: + @lru_cache(maxsize=128) + def milliseconds(self, normalize_value: Optional[Callable[[float], float]] = None) -> MetricNormalization: return MetricNormalization( unit=MetricUnit.Milliseconds, - compute_stats=calculate_min_max_avg, - # convert nanoseconds to milliseconds - normalize_value=lambda x: round(x / 1_000_000, ndigits=4), + normalize_value=normalize_value or (lambda x: round(x, ndigits=4)), ) @cached_property diff --git a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py index 921bb37b5c..2402768f9e 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py +++ b/plugins/gcp/fix_plugin_gcp/resources/sqladmin.py @@ -7,7 +7,7 @@ from 
fix_plugin_gcp.gcp_client import GcpApiSpec from fix_plugin_gcp.resources.base import GcpResource, GcpDeprecationStatus, GraphBuilder, GcpMonitoringQuery from fix_plugin_gcp.resources.compute import GcpSslCertificate -from fix_plugin_gcp.resources.monitoring import normalizer_factory, STAT_MAP +from fix_plugin_gcp.resources.monitoring import normalizer_factory, STANDART_STAT_MAP from fixlib.baseresources import BaseDatabase, DatabaseInstanceStatus, MetricName, ModelReference from fixlib.json_bender import F, Bender, S, Bend, ForallBend, K, MapEnum, AsInt from fixlib.types import Json @@ -769,7 +769,6 @@ def collect_sql_resources(spec: GcpApiSpec, clazz: Type[GcpResource]) -> None: def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: queries: List[GcpMonitoringQuery] = [] - queries = [] delta = builder.metrics_delta queries.extend( [ @@ -786,7 +785,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP ] ) queries.extend( @@ -804,7 +803,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("cloudsql.googleapis.com/database/network/connections", MetricName.DatabaseConnections), ("cloudsql.googleapis.com/database/network/sent_bytes_count", MetricName.NetworkBytesSent), @@ -827,7 +826,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer "resource.labels.region": self.region().id, }, ) - for stat in STAT_MAP + for stat in STANDART_STAT_MAP for name, metric_name in [ ("cloudsql.googleapis.com/database/disk/read_ops_count", MetricName.DiskRead), ("cloudsql.googleapis.com/database/disk/write_ops_count", MetricName.DiskWrite), From cf987fd3d7b290a584693900a77e2d49b469593a Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 28 Nov 2024 14:16:16 +0000 Subject: [PATCH 39/43] convert seconds to ms --- plugins/gcp/fix_plugin_gcp/resources/compute.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 6d73195f50..682fa62427 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -1762,7 +1762,8 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=MetricName.Latency, - normalization=normalizer_factory.milliseconds(), + # convert seconds to milliseconds + normalization=normalizer_factory.milliseconds(lambda x: round(x * 1000, ndigits=4)), stat=stat, project_id=builder.project.id, metric_filters={ From 7f5d7f7077ad539ebb39e7a2f95d1e5a844635d2 Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 28 Nov 2024 15:07:45 +0000 Subject: [PATCH 40/43] fix: take values correctly --- fixlib/fixlib/baseresources.py | 2 ++ .../gcp/fix_plugin_gcp/resources/compute.py | 31 +++++++++++++++---- .../fix_plugin_gcp/resources/monitoring.py | 22 ++----------- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/fixlib/fixlib/baseresources.py b/fixlib/fixlib/baseresources.py index f82b66b036..1eb7a53f15 100644 --- a/fixlib/fixlib/baseresources.py +++ b/fixlib/fixlib/baseresources.py @@ -149,6 +149,8 @@ def __str__(self) -> str: # load balancers RequestCount = "request" # _count will 
be added to the end because of the unit + RequestBytesCount = "request_bytes" # _count will be added to the end because of the unit + ResponseBytesCount = "response_bytes" # _count will be added to the end because of the unit ActiveConnectionCount = "active_connection" # _count will be added to the end because of the unit ALBActiveConnectionCount = "alb_active_connection" # _count will be added to the end because of the unit ConnectionAttemptCount = "connection_attempt" # _count will be added to the end because of the unit diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 682fa62427..64e8fd87f5 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -1,3 +1,4 @@ +from collections import defaultdict import logging import ipaddress from datetime import datetime @@ -1732,6 +1733,16 @@ def connect_in_graph(self, builder: GraphBuilder, source: Json) -> None: ) builder.add_edge(self, clazz=target_classes, link=self.target) + def post_process_instance(self, builder: GraphBuilder, source: Json) -> None: + # Calculate the processed bytes + total_bytes: Dict[str, float] = defaultdict(float) + for metric_name, metric_values in self._resource_usage.items(): + if metric_name.endswith("_bytes_count"): + for name, value in metric_values.items(): + total_bytes[name] += value + if total_bytes: + self._resource_usage["processed_bytes"] = dict(total_bytes) + def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: queries: List[GcpMonitoringQuery] = [] delta = builder.metrics_delta @@ -1741,18 +1752,27 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer queries.extend( [ GcpMonitoringQuery.create( - query_name=f"loadbalancing.googleapis.com/https/{lb_type}/request_count", + query_name=name, period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", - metric_name=MetricName.RequestCount, - normalization=normalizer_factory.count_with_compute, - stat="ALIGN_RATE", + metric_name=metric_name, + normalization=normalizer_factory.count, + stat=stat, project_id=builder.project.id, metric_filters={ "resource.label.forwarding_rule_name": self.id, "resource.labels.region": self.region().id, }, ) + for stat in STANDART_STAT_MAP + for name, metric_name in [ + (f"loadbalancing.googleapis.com/https/{lb_type}/request_count", MetricName.RequestCount), + (f"loadbalancing.googleapis.com/https/{lb_type}/request_bytes_count", MetricName.RequestBytesCount), + ( + f"loadbalancing.googleapis.com/https/{lb_type}/response_bytes_count", + MetricName.ResponseBytesCount, + ), + ] ] ) queries.extend( @@ -1762,8 +1782,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer period=delta, ref_id=f"{self.kind}/{self.id}/{self.region().id}", metric_name=MetricName.Latency, - # convert seconds to milliseconds - normalization=normalizer_factory.milliseconds(lambda x: round(x * 1000, ndigits=4)), + normalization=normalizer_factory.milliseconds(), stat=stat, project_id=builder.project.id, metric_filters={ diff --git a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py index 0392f2dd60..6219083e2e 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/monitoring.py +++ b/plugins/gcp/fix_plugin_gcp/resources/monitoring.py @@ -2,7 +2,7 @@ from copy import deepcopy from datetime import datetime from functools import cached_property, lru_cache -from typing import ClassVar, Dict, List, 
Optional, Tuple, TypeVar, Callable +from typing import ClassVar, Dict, List, Optional, TypeVar, Callable from attr import define, field @@ -11,7 +11,7 @@ from fixlib.baseresources import MetricUnit, StatName, BaseResource from fixlib.durations import duration_str from fixlib.json import from_json -from fixlib.json_bender import S, Bender, ForallBend, bend, K +from fixlib.json_bender import S, Bender, ForallBend, bend, K, AsFloat from fixlib.utils import utc_str service_name = "monitoring" @@ -36,7 +36,7 @@ class GcpMonitoringMetricData: kind: ClassVar[str] = "gcp_monitoring_metric_data" mapping: ClassVar[Dict[str, Bender]] = { "metric_values": S("points") - >> ForallBend(S("value", "doubleValue").or_else(S("value", "int64Value"))).or_else(K([])), + >> ForallBend((S("value", "doubleValue").or_else(S("value", "int64Value")) >> AsFloat())).or_else(K([])), "metric_kind": S("metricKind"), "value_type": S("valueType"), "metric_type": S("metric", "type"), @@ -169,14 +169,6 @@ def count(self) -> MetricNormalization: normalize_value=lambda x: round(x, ndigits=4), ) - @cached_property - def count_with_compute(self) -> MetricNormalization: - return MetricNormalization( - unit=MetricUnit.Count, - compute_stats=calculate_min_max_avg, - normalize_value=lambda x: round(x, ndigits=4), - ) - @cached_property def bytes(self) -> MetricNormalization: return MetricNormalization( @@ -220,12 +212,4 @@ def percent(self) -> MetricNormalization: ) -def calculate_min_max_avg(values: List[float]) -> List[Tuple[float, Optional[StatName]]]: - return [ - (min(values), StatName.min), - (max(values), StatName.max), - (sum(values) / len(values), StatName.avg), - ] - - normalizer_factory = NormalizerFactory() From abfcfe88f5919161e45a20d4591fab2747a4cd5d Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 28 Nov 2024 15:30:18 +0000 Subject: [PATCH 41/43] small improvement --- plugins/gcp/fix_plugin_gcp/resources/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/base.py b/plugins/gcp/fix_plugin_gcp/resources/base.py index 6f2508ad9a..2372a20e0d 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/base.py +++ b/plugins/gcp/fix_plugin_gcp/resources/base.py @@ -410,7 +410,7 @@ def resource_raw_name(self) -> str: Returns: str: The last segment of the resource ID (e.g., "function-1" from "projects/{project}/locations/{location}/functions/function-1"). 
""" - return self.id.strip().split("/")[-1] + return self.id.rsplit("/", maxsplit=1)[-1] def delete(self, graph: Graph) -> bool: if not self.api_spec: From 738d5e98062c01b7a5ebd20773008d4906f22830 Mon Sep 17 00:00:00 2001 From: Kirill Date: Fri, 29 Nov 2024 14:01:51 +0000 Subject: [PATCH 42/43] fix: moved method to make collection correctly --- plugins/gcp/fix_plugin_gcp/resources/compute.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/plugins/gcp/fix_plugin_gcp/resources/compute.py b/plugins/gcp/fix_plugin_gcp/resources/compute.py index 64e8fd87f5..62621968d3 100644 --- a/plugins/gcp/fix_plugin_gcp/resources/compute.py +++ b/plugins/gcp/fix_plugin_gcp/resources/compute.py @@ -1743,6 +1743,8 @@ def post_process_instance(self, builder: GraphBuilder, source: Json) -> None: if total_bytes: self._resource_usage["processed_bytes"] = dict(total_bytes) + self.collect_backends(builder) + def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuery]: queries: List[GcpMonitoringQuery] = [] delta = builder.metrics_delta @@ -1796,7 +1798,7 @@ def collect_usage_metrics(self, builder: GraphBuilder) -> List[GcpMonitoringQuer return queries - def post_process(self, graph_builder: GraphBuilder, source: Json) -> None: + def collect_backends(self, graph_builder: GraphBuilder) -> None: if not self.target: return backend_services = graph_builder.nodes(clazz=GcpBackendService) From ef5b88b92909d445865c48d017cc9624a3b476b5 Mon Sep 17 00:00:00 2001 From: Kirill Date: Mon, 2 Dec 2024 08:32:55 +0000 Subject: [PATCH 43/43] make sure all futures done --- plugins/gcp/fix_plugin_gcp/collector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/plugins/gcp/fix_plugin_gcp/collector.py b/plugins/gcp/fix_plugin_gcp/collector.py index 71832f9934..654432541b 100644 --- a/plugins/gcp/fix_plugin_gcp/collector.py +++ b/plugins/gcp/fix_plugin_gcp/collector.py @@ -156,6 +156,8 @@ def get_last_run() -> Optional[datetime]: if isinstance(node, GcpResource): node.post_process_instance(global_builder, data.get("source", {})) + global_builder.executor.wait_for_submitted_work() + self.core_feedback.progress_done(self.project.id, 1, 1, context=[self.cloud.id]) log.info(f"[GCP:{self.project.id}] Collecting resources done.")