diff --git a/examples/post_training_quantization/openvino/yolov8/main.py b/examples/post_training_quantization/openvino/yolov8/main.py
index e69136db796..7e2e7b93c33 100644
--- a/examples/post_training_quantization/openvino/yolov8/main.py
+++ b/examples/post_training_quantization/openvino/yolov8/main.py
@@ -25,6 +25,7 @@
 import openvino.torch  # noqa
 import torch
 from torch._export import capture_pre_autograd_graph
+from torch.export import Dim  # noqa
 from torch.fx.passes.graph_drawer import FxGraphDrawer
 from tqdm import tqdm
 from ultralytics.cfg import get_cfg
@@ -35,6 +36,7 @@
 from ultralytics.utils import DATASETS_DIR
 from ultralytics.utils import DEFAULT_CFG
 from ultralytics.utils.metrics import ConfusionMatrix
+from ultralytics.utils.torch_utils import de_parallel
 
 import nncf
 
@@ -53,13 +55,24 @@ def measure_time(model, example_inputs, num_iters=500):
     return average_time
 
 
+def validate_fx_ult_method(model: torch.nn.Module) -> Dict:
+    """
+    Uses the ultralytics .val method instead of a dataloader loop.
+    For some reason this shows better metrics on torch.compile'd models.
+    """
+    yolo = YOLO(f"{ROOT}/{MODEL_NAME}.pt")
+    yolo.model = model
+    result = yolo.val(data="coco128.yaml", batch=1, rect=False)
+    return result.results_dict
+
+
 def validate_fx(
     model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, num_samples: int = None
 ) -> Tuple[Dict, int, int]:
-    validator.seen = 0
-    validator.jdict = []
-    validator.stats = []
-    validator.confusion_matrix = ConfusionMatrix(nc=validator.nc)
+    # validator.seen = 0
+    # validator.jdict = []
+    # validator.stats = []
+    # validator.confusion_matrix = ConfusionMatrix(nc=validator.nc)
     for batch_i, batch in enumerate(data_loader):
         if num_samples is not None and batch_i == num_samples:
             break
@@ -71,7 +84,20 @@ def validate_fx(
     return stats, validator.seen, validator.nt_per_class.sum()
 
 
-def validate(
+def print_statistics_short(stats: np.ndarray) -> None:
+    mp, mr, map50, mean_ap = (
+        stats["metrics/precision(B)"],
+        stats["metrics/recall(B)"],
+        stats["metrics/mAP50(B)"],
+        stats["metrics/mAP50-95(B)"],
+    )
+    s = ("%20s" + "%12s" * 4) % ("Class", "Precision", "Recall", "mAP@.5", "mAP@.5:.95")
+    print(s)
+    pf = "%20s" + "%12.3g" * 4  # print format
+    print(pf % ("all", mp, mr, map50, mean_ap))
+
+
+def validate_ov(
     model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, num_samples: int = None
 ) -> Tuple[Dict, int, int]:
     validator.seen = 0
@@ -105,6 +131,23 @@ def print_statistics(stats: np.ndarray, total_images: int, total_objects: int) -
     print(pf % ("all", total_images, total_objects, mp, mr, map50, mean_ap))
 
 
+def prepare_validation_new(model: YOLO, data: str) -> Tuple[Validator, torch.utils.data.DataLoader]:
+    # custom = {"rect": True, "batch": 1}  # method defaults
+    # rect=False forces all input pictures to be resized to one size
+    custom = {"rect": False, "batch": 1}  # method defaults
+    args = {**model.overrides, **custom, "mode": "val"}  # highest priority args on the right
+
+    validator = model._smart_load("validator")(args=args, _callbacks=model.callbacks)
+    stride = 32  # default stride
+    validator.stride = stride  # used in get_dataloader() for padding
+    validator.data = check_det_dataset(data)
+    validator.init_metrics(de_parallel(model))
+
+    data_loader = validator.get_dataloader(validator.data.get(validator.args.split), validator.args.batch)
+
+    return validator, data_loader
+
+
 def prepare_validation(model: YOLO, args: Any) -> Tuple[Validator, torch.utils.data.DataLoader]:
     validator = model.smart_load("validator")(args)
     validator.data = check_det_dataset(args.data)
@@ -236,49 +279,65 @@ def transform_fn(x):
 
 
 TORCH_FX = True
+MODEL_NAME = "yolov8n"
 
 
 def main():
-    MODEL_NAME = "yolov8n"
     model = YOLO(f"{ROOT}/{MODEL_NAME}.pt")
-    args = get_cfg(cfg=DEFAULT_CFG)
-    args.data = "coco128.yaml"
+    # args = get_cfg(cfg=DEFAULT_CFG)
+    # args.data = "coco128.yaml"
 
     # Prepare validation dataset and helper
-    validator, data_loader = prepare_validation(model, args)
+
+    validator, data_loader = prepare_validation_new(model, "coco128.yaml")
 
     # Convert to OpenVINO model
     if TORCH_FX:
         batch = next(iter(data_loader))
         batch = validator.preprocess(batch)
 
+        fp_stats, total_images, total_objects = validate_fx(model.model, tqdm(data_loader), validator)
+        print("Floating-point Torch model validation results:")
+        print_statistics(fp_stats, total_images, total_objects)
+
+        fp32_compiled_model = torch.compile(model.model, backend="openvino")
+        fp32_stats, total_images, total_objects = validate_fx(fp32_compiled_model, tqdm(data_loader), validator)
+        print("FP32 FX model validation results:")
+        print_statistics(fp32_stats, total_images, total_objects)
+
+        # result = validate_fx_ult_method(fp32_compiled_model)
+        # print("FX FP32 model .val validation")
+        # print_statistics_short(result)
+
+        print("Start quantization...")
+        # Rebuild the model to reset the ultralytics cache
+        model = YOLO(f"{ROOT}/{MODEL_NAME}.pt")
         with torch.no_grad():
-            # fp_stats, total_images, total_object = validate(model.model, tqdm(data_loader), validator)
-            # print("Floating-point model validation results:")
-            # print_statistics(fp_stats, total_images, total_objects)
             model.model.eval()
             model.model(batch["img"])
-            exported_model = capture_pre_autograd_graph(model.model, args=(batch["img"],))
+            # dynamic_shapes = ((None, None, Dim("H", min=1, max=29802), Dim("W", min=1, max=29802)),)
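+            # In the torch.export dynamic_shapes spec, a None entry keeps that dimension
+            # static, while a torch.export.Dim (as in the commented-out variant above)
+            # marks it symbolic with optional min/max bounds. With all entries None, the
+            # graph is captured for the calibration batch shape only.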
+            dynamic_shapes = ((None, None, None, None),)
+            exported_model = capture_pre_autograd_graph(
+                model.model, args=(batch["img"],), dynamic_shapes=dynamic_shapes
+            )
 
             quantized_model = quantize_impl(deepcopy(exported_model), data_loader, validator)
 
-        fp32_compiled_model = torch.compile(exported_model, backend="openvino")
-        fp32_stats, total_images, total_objects = validate_fx(fp32_compiled_model, tqdm(data_loader), validator)
-        # fp32_stats, total_images, total_objects = validate_fx(model.model, tqdm(data_loader), validator)
-        print("FP32 model validation results:")
-        print_statistics(fp32_stats, total_images, total_objects)
+        # result = validate_fx_ult_method(quantized_model)
+        # print("FX INT8 model .val validation")
+        # print_statistics_short(result)
 
         int8_stats, total_images, total_objects = validate_fx(quantized_model, tqdm(data_loader), validator)
-        print("INT8 model validation results:")
+        print("INT8 FX model validation results:")
         print_statistics(int8_stats, total_images, total_objects)
 
-        print("Start fp32 model benchmarking...")
+        print("Start FX fp32 model benchmarking...")
         fp32_latency = measure_time(fp32_compiled_model, (batch["img"],))
-        print(f"fp32 latency: {fp32_latency}")
+        print(f"fp32 FX latency: {fp32_latency}")
 
-        print("Start int8 model benchmarking...")
+        print("Start FX int8 model benchmarking...")
         int8_latency = measure_time(quantized_model, (batch["img"],))
-        print(f"int8 latency: {int8_latency}")
+        print(f"FX int8 latency: {int8_latency}")
         print(f"Speed up: {fp32_latency / int8_latency}")
         return
@@ -289,13 +348,15 @@ def main():
     quantized_model_path = Path(f"{ROOT}/{MODEL_NAME}_openvino_model/{MODEL_NAME}_quantized.xml")
     ov.save_model(quantized_model, str(quantized_model_path), compress_to_fp16=False)
 
+    args = get_cfg(cfg=DEFAULT_CFG)
+    args.data = "coco128.yaml"
     # Validate FP32 model
-    fp_stats, total_images, total_objects = validate(ov_model, tqdm(data_loader), validator)
+    fp_stats, total_images, total_objects = validate_ov(ov_model, tqdm(data_loader), validator)
     print("Floating-point model validation results:")
     print_statistics(fp_stats, total_images, total_objects)
 
     # Validate quantized model
-    q_stats, total_images, total_objects = validate(quantized_model, tqdm(data_loader), validator)
+    q_stats, total_images, total_objects = validate_ov(quantized_model, tqdm(data_loader), validator)
     print("Quantized model validation results:")
     print_statistics(q_stats, total_images, total_objects)
diff --git a/examples/post_training_quantization/torch/ssd300_vgg16/main.py b/examples/post_training_quantization/torch/ssd300_vgg16/main.py
index 1b586f4a995..567e12d44f3 100644
--- a/examples/post_training_quantization/torch/ssd300_vgg16/main.py
+++ b/examples/post_training_quantization/torch/ssd300_vgg16/main.py
@@ -19,6 +19,7 @@
 import nncf
 from nncf.torch import disable_tracing
+from torch._export import capture_pre_autograd_graph
 import openvino as ov
 import torch
 import torchvision
@@ -27,7 +28,9 @@
 from torchmetrics.detection.mean_ap import MeanAveragePrecision
 from torchvision.models.detection.ssd import SSD
 from torchvision.models.detection.ssd import GeneralizedRCNNTransform
+from torchvision.transforms.functional import pil_to_tensor
 from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
+from torch.export import Dim
 from nncf.common.logging.track_progress import track
 from functools import partial
@@ -118,6 +121,7 @@ def validate(model: torch.nn.Module, dataset: COCO128Dataset, device: torch.devi
     metric = MeanAveragePrecision()
     with torch.no_grad():
         for img, target in track(dataset, description="Validating"):
+            print(img.shape)
             prediction = model(img.to(device)[None])[0]
             for k in prediction:
                 prediction[k] = prediction[k].to(torch.device("cpu"))
@@ -135,16 +139,38 @@ def transform_fn(data_item: Tuple[torch.Tensor, Dict], device: torch.device) ->
 def main():
     # Download and prepare the COCO128 dataset
     dataset_path = download_dataset()
+    # weights = torchvision.models.detection.SSDLite320_MobileNet_V3_Large_Weights.DEFAULT
+    # transform = weights.transforms()
     weights_name = "SSD300_VGG16_Weights.DEFAULT"
     transform = torchvision.models.get_weight(weights_name).transforms()
     dataset = COCO128Dataset(dataset_path, lambda img, target: (transform(img), target))
 
     # Get the pretrained ssd300_vgg16 model from torchvision.models
     model = torchvision.models.get_model("ssd300_vgg16", weights=weights_name)
+    # model = torchvision.models.detection.ssdlite320_mobilenet_v3_large(weights=weights)
     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
     model.to(device)
     model.eval()
 
+    calibration_dataset = nncf.Dataset(dataset, partial(transform_fn, device=device))
+
+    inp = next(iter(calibration_dataset.get_inference_data()))
+    # dynamic_shapes = ((None, None, Dim("H"), Dim("W")),)
+    dynamic_shapes = ((None, None, None, None),)
+    # dynamic_shapes = ((Dim("batch"), None, None, None),)
+    _ = model(inp)
+    # r = validate(model, dataset, device)
+    # print(r)
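+    # capture_pre_autograd_graph traces the model ahead of time into a torch.fx.GraphModule;
+    # despite the variable name below, the result is an exported graph rather than a
+    # torch.compile() artifact (that variant is kept commented out).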
+    compiled_model = capture_pre_autograd_graph(model, args=(inp,), dynamic_shapes=dynamic_shapes)
+    # compiled_model = torch.compile(model)
+    print("torch model")
+    r = validate(model, dataset, device)
+    print(f"mAP @ 0.5: {r:.3f}")
+
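+            # pil_to_tensor yields a single channel for grayscale images; replicate it
+            # below to get the 3-channel input DETR expects.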
+    print("compiled model")
+    r = validate(compiled_model, dataset, device)
+    print(f"mAP @ 0.5: {r:.3f}")
+    return
+
     # Disable NNCF tracing for some methods in order for the model to be properly traced by NNCF
     disable_tracing(GeneralizedRCNNTransform.normalize)
     disable_tracing(SSD.postprocess_detections)
@@ -198,5 +224,103 @@ def main():
     return fp32_map, int8_map, fp32_fps, int8_fps, fp32_model_size, int8_model_size
 
 
+def validate_detr(model: torch.nn.Module, dataset: COCO128Dataset, device: torch.device, processor):
+    model.to(device)
+    metric = MeanAveragePrecision()
+    min_h = 1000000
+    max_h = 0
+    min_w = 1000000
+    max_w = 0
+    with torch.no_grad():
+        for img, target in track(dataset, description="Validating"):
+
+            inputs = pil_to_tensor(img)
+            if inputs.shape[0] == 1:
+                inputs = torch.cat([inputs] * 3)
+            inputs = inputs[None]
+
+            inputs = processor(images=inputs, return_tensors="pt")
+            min_h = min(min_h, inputs["pixel_values"].shape[2])
+            max_h = max(max_h, inputs["pixel_values"].shape[2])
+            min_w = min(min_w, inputs["pixel_values"].shape[3])
+            max_w = max(max_w, inputs["pixel_values"].shape[3])
+
+            output = model(**inputs)
+            target_sizes = torch.tensor([img.size[::-1]])
+            prediction = processor.post_process_object_detection(output, target_sizes=target_sizes, threshold=0.9)[0]
+            for k in prediction:
+                prediction[k] = prediction[k].to(torch.device("cpu"))
+            metric.update([prediction], [target])
+    computed_metrics = metric.compute()
+    print(min_h, max_h, min_w, max_w)
+    return computed_metrics["map_50"]
+
+
+def get_detr_inputs(processor, dataset):
+    img = next(iter(dataset))[0]
+    inputs = pil_to_tensor(img)
+    inputs = inputs[None]
+    return processor(images=inputs, return_tensors="pt")
+
+
+def get_image():
+    from PIL import Image
+    import requests
+
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image = Image.open(requests.get(url, stream=True).raw)
+
+    return image
+
+
+def main_detr():
+    from transformers import DetrImageProcessor, DetrForObjectDetection
+    import torch
+
+    device = torch.device("cpu")
+    # you can specify the revision tag if you don't want the timm dependency
+    processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
+    model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50", revision="no_timm")
+    model.eval()
+
+    dataset_path = download_dataset()
+    dataset = COCO128Dataset(dataset_path, lambda img, target: (img, target))
+
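+    # DetrImageProcessor resizes so the shortest edge is 800 and the longest at most
+    # 1333, so the Dim bounds below describe preprocessed sizes; the exact min values
+    # appear empirical (cf. the min/max sizes printed by validate_detr). Note that the
+    # dict form of dynamic_shapes is immediately overridden by the tuple form.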
+    h, w = Dim("H", min=454, max=1333), Dim("W", min=748, max=1333)
+    dynamic_shapes = {"pixel_values": {2: h, 3: w}, "pixel_mask": {2: h, 3: w}}
+    dynamic_shapes = ((None, None, h, w), (None, h, w))
+    ex_inputs = get_detr_inputs(processor, dataset)
+    # captured_model = capture_pre_autograd_graph(model, args=(), kwargs=ex_inputs, dynamic_shapes=dynamic_shapes)
+    # captured_model = capture_pre_autograd_graph(model, args=(tuple(ex_inputs.values()),),
+    #                                             dynamic_shapes=dynamic_shapes)
+    # captured_model = capture_pre_autograd_graph(model, args=tuple(ex_inputs.values()))
+    captured_model = capture_pre_autograd_graph(model, args=tuple(ex_inputs.values()), dynamic_shapes=dynamic_shapes)
+    # captured_model = capture_pre_autograd_graph(model, args=(), kwargs=ex_inputs)
+
+    # compiled_model = torch.compile(model, dynamic=True)
+    # r = validate_detr(compiled_model, dataset, device, processor)
+    r = validate_detr(captured_model, dataset, device, processor)
+    print(f"mAP @ 0.5: {r:.3f}")
+    r = validate_detr(model, dataset, device, processor)
+    print(f"mAP @ 0.5: {r:.3f}")
+
+    outputs = model(**ex_inputs)
+
+    # convert outputs (bounding boxes and class logits) to COCO API
+    # let's only keep detections with score > 0.9
+    image = get_image()
+    processor(images=image, return_tensors="pt")
+    target_sizes = torch.tensor([image.size[::-1]])
+    results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
+
+    for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        box = [round(i, 2) for i in box.tolist()]
+        print(
+            f"Detected {model.config.id2label[label.item()]} with confidence "
+            f"{round(score.item(), 3)} at location {box}"
+        )
+
+
 if __name__ == "__main__":
-    main()
+    # main()
+    main_detr()
diff --git a/yolo_fx_bad_metrics_repro.py b/yolo_fx_bad_metrics_repro.py
new file mode 100644
index 00000000000..b5c05d6bbcb
--- /dev/null
+++ b/yolo_fx_bad_metrics_repro.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2024 Intel Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#      http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Tuple
+
+import numpy as np
+import torch
+from tqdm import tqdm
+from ultralytics.data.utils import check_det_dataset
+from ultralytics.engine.validator import BaseValidator as Validator
+from ultralytics.models.yolo import YOLO
+from ultralytics.utils.torch_utils import de_parallel
+
+
+def print_statistics(stats: np.ndarray, total_images: int, total_objects: int) -> None:
+    mp, mr, map50, mean_ap = (
+        stats["metrics/precision(B)"],
+        stats["metrics/recall(B)"],
+        stats["metrics/mAP50(B)"],
+        stats["metrics/mAP50-95(B)"],
+    )
+    s = ("%20s" + "%12s" * 6) % ("Class", "Images", "Labels", "Precision", "Recall", "mAP@.5", "mAP@.5:.95")
+    print(s)
+    pf = "%20s" + "%12i" * 2 + "%12.3g" * 4  # print format
+    print(pf % ("all", total_images, total_objects, mp, mr, map50, mean_ap))
+
+
+def prepare_validation(model: YOLO, data: str) -> Tuple[Validator, torch.utils.data.DataLoader]:
+    # custom = {"rect": True, "batch": 1}  # method defaults
+    # rect=False forces all input pictures to be resized to one size
+    custom = {"rect": False, "batch": 1}  # method defaults
+    args = {**model.overrides, **custom, "mode": "val"}  # highest priority args on the right
+
+    validator = model._smart_load("validator")(args=args, _callbacks=model.callbacks)
+    stride = 32  # default stride
+    validator.stride = stride  # used in get_dataloader() for padding
+    validator.data = check_det_dataset(data)
+    validator.init_metrics(de_parallel(model))
+
+    data_loader = validator.get_dataloader(validator.data.get(validator.args.split), validator.args.batch)
+    return validator, data_loader
+
+
+def validate(model, data_loader: torch.utils.data.DataLoader, validator: Validator) -> Tuple[Dict, int, int]:
+    with torch.no_grad():
+        for batch in data_loader:
+            batch = validator.preprocess(batch)
+            preds = model(batch["img"])
+            preds = validator.postprocess(preds)
+            validator.update_metrics(preds, batch)
+    stats = validator.get_stats()
+    return stats, validator.seen, validator.nt_per_class.sum()
+
+
+def main(torch_fx):
+    # ultralytics @ git+https://github.com/THU-MIG/yolov10.git@2c36ab0f108efdd17c7e290564bb845ccb6844d8
+    # pip install git+https://github.com/THU-MIG/yolov10.git
+    # pip install huggingface-hub
+    # yolo_model = YOLO("yolov10n.pt")
+
+    yolo_model = YOLO("yolov8n")
+
+    model_type = "torch"
+    model = yolo_model.model
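+    # The same validation loop runs below for both the eager model and the
+    # torch.compile'd one; the metrics gap between the two runs is what this
+    # script reproduces.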
+    if torch_fx:
+        model = torch.compile(model)
+        model_type = "FX"
+    print(f"FP32 {model_type} model validation results:")
+    validator, data_loader = prepare_validation(yolo_model, "coco128.yaml")
+    stats, total_images, total_objects = validate(model, tqdm(data_loader), validator)
+    print_statistics(stats, total_images, total_objects)
+
+
+if __name__ == "__main__":
+    print("Torch model:")
+    main(torch_fx=False)
+    print("Torch FX model:")
+    main(torch_fx=True)