From 57744b29f9e21532b4a9c7525daae3b0d9671378 Mon Sep 17 00:00:00 2001 From: felix Date: Tue, 3 Sep 2024 16:38:25 +0200 Subject: [PATCH 1/7] small change Resize --- doctr/transforms/modules/pytorch.py | 11 ++++++----- doctr/transforms/modules/tensorflow.py | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index f893afc2f7..91e79bdc09 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -74,16 +74,19 @@ def forward( if self.symmetric_pad: half_pad = (math.ceil(_pad[1] / 2), math.ceil(_pad[3] / 2)) _pad = (half_pad[0], _pad[1] - half_pad[0], half_pad[1], _pad[3] - half_pad[1]) + # Pad image img = pad(img, _pad) + if self.symmetric_pad: + offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: + target = np.clip(target, 0, 1) if self.preserve_aspect_ratio: # Get absolute coords if target.shape[1:] == (4,): if isinstance(self.size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] target[:, [0, 2]] = offset[0] + target[:, [0, 2]] * raw_shape[-1] / img.shape[-1] target[:, [1, 3]] = offset[1] + target[:, [1, 3]] * raw_shape[-2] / img.shape[-2] else: @@ -91,15 +94,13 @@ def forward( target[:, [1, 3]] *= raw_shape[-2] / img.shape[-2] elif target.shape[1:] == (4, 2): if isinstance(self.size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] target[..., 0] = offset[0] + target[..., 0] * raw_shape[-1] / img.shape[-1] target[..., 1] = offset[1] + target[..., 1] * raw_shape[-2] / img.shape[-2] else: target[..., 0] *= raw_shape[-1] / img.shape[-1] target[..., 1] *= raw_shape[-2] / img.shape[-2] else: - raise AssertionError + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") return img, target return img diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index b3f7bcfd8a..3bd08ca247 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -120,16 +120,19 @@ def __call__( offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) else: offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) + # Pad image img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) + if self.symmetric_pad: + offset = offset[0] / img.shape[0], offset[1] / img.shape[1] + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: + target = np.clip(target, 0, 1) if self.preserve_aspect_ratio: # Get absolute coords if target.shape[1:] == (4,): if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] target[:, [0, 2]] = offset[1] + target[:, [0, 2]] * raw_shape[1] / img.shape[1] target[:, [1, 3]] = offset[0] + target[:, [1, 3]] * raw_shape[0] / img.shape[0] else: @@ -137,15 +140,13 @@ def __call__( target[:, [1, 3]] *= raw_shape[0] / img.shape[0] elif target.shape[1:] == (4, 2): if isinstance(self.output_size, (tuple, list)) and self.symmetric_pad: - if np.max(target) <= 1: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] target[..., 0] = offset[1] + target[..., 0] * raw_shape[1] / 
img.shape[1] target[..., 1] = offset[0] + target[..., 1] * raw_shape[0] / img.shape[0] else: target[..., 0] *= raw_shape[1] / img.shape[1] target[..., 1] *= raw_shape[0] / img.shape[0] else: - raise AssertionError + raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") return tf.cast(img, dtype=input_dtype), target return tf.cast(img, dtype=input_dtype) From b2d5df0a0d4a865f8d7148dd2845052d739204d4 Mon Sep 17 00:00:00 2001 From: felix Date: Tue, 3 Sep 2024 16:40:55 +0200 Subject: [PATCH 2/7] update --- doctr/transforms/modules/pytorch.py | 7 ++++--- doctr/transforms/modules/tensorflow.py | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index 91e79bdc09..667101f3d3 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -77,12 +77,13 @@ def forward( # Pad image img = pad(img, _pad) - if self.symmetric_pad: - offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: target = np.clip(target, 0, 1) + + if self.symmetric_pad: + offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] + if self.preserve_aspect_ratio: # Get absolute coords if target.shape[1:] == (4,): diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index 3bd08ca247..cf55381598 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -123,12 +123,13 @@ def __call__( # Pad image img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - if self.symmetric_pad: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: target = np.clip(target, 0, 1) + + if self.symmetric_pad: + offset = offset[0] / img.shape[0], offset[1] / img.shape[1] + if self.preserve_aspect_ratio: # Get absolute coords if target.shape[1:] == (4,): From 845e7e1d2dde9c5a35560bab6812eb47adac8e1a Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 4 Sep 2024 09:31:13 +0200 Subject: [PATCH 3/7] fix eval scripts and clip issue --- api/app/vision.py | 2 +- doctr/transforms/modules/pytorch.py | 31 +++++++++---- doctr/transforms/modules/tensorflow.py | 43 ++++++++++++------ .../classification/latency_tensorflow.py | 2 +- .../train_tensorflow_character.py | 2 +- .../train_tensorflow_orientation.py | 2 +- references/detection/evaluate_pytorch.py | 10 ++++- references/detection/evaluate_tensorflow.py | 12 +++-- references/detection/latency_tensorflow.py | 2 +- references/detection/train_tensorflow.py | 2 +- references/recognition/evaluate_tensorflow.py | 2 +- references/recognition/latency_tensorflow.py | 2 +- references/recognition/train_tensorflow.py | 2 +- scripts/analyze.py | 2 +- scripts/detect_text.py | 2 +- scripts/evaluate.py | 30 +++++++++++-- scripts/evaluate_kie.py | 30 +++++++++++-- tests/pytorch/test_transforms_pt.py | 44 +++++++++++++++++++ tests/tensorflow/test_transforms_tf.py | 44 +++++++++++++++++++ 19 files changed, 220 insertions(+), 46 deletions(-) diff --git a/api/app/vision.py b/api/app/vision.py index 005c8d1548..144b5e4c3b 100644 --- a/api/app/vision.py +++ b/api/app/vision.py @@ -6,7 +6,7 @@ import tensorflow as tf -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = 
tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index 667101f3d3..266a5aea3f 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -4,7 +4,7 @@ # See LICENSE or go to for full license details. import math -from typing import Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch @@ -38,8 +38,8 @@ def __init__( def forward( self, img: torch.Tensor, - target: Optional[np.ndarray] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, np.ndarray]]: + target: Optional[Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]]: if isinstance(self.size, int): target_ratio = img.shape[-2] / img.shape[-1] else: @@ -77,10 +77,7 @@ def forward( # Pad image img = pad(img, _pad) - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) - if target is not None: - target = np.clip(target, 0, 1) - + def _prepare_targets(target: np.ndarray) -> np.ndarray: if self.symmetric_pad: offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] @@ -102,7 +99,25 @@ def forward( target[..., 1] *= raw_shape[-2] / img.shape[-2] else: raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") - return img, target + return np.clip(target, 0, 1) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + # Possible formats: + # KIE: Dict[str, np.ndarray] + # Built-in datasets: Dict[str, Union[np.ndarray, List[str]]] + # Custom datasets: np.ndarray + + if isinstance(target, dict): + # Built-in datasets + if "boxes" and "labels" in target.keys(): + target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] + return img, target + # KIE + else: + return img, {k: _prepare_targets(v) for k, v in target.items()} # type: ignore[arg-type] + # Custom datasets + return img, _prepare_targets(target) return img diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index cf55381598..781937df30 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -4,7 +4,7 @@ # See LICENSE or go to for full license details. 
import random -from typing import Any, Callable, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -104,31 +104,28 @@ def extra_repr(self) -> str: def __call__( self, img: tf.Tensor, - target: Optional[np.ndarray] = None, - ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + target: Optional[Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]]: input_dtype = img.dtype img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio raw_shape = img.shape[:2] + if self.symmetric_pad: + half_pad = (int((self.output_size[0] - img.shape[0]) / 2), 0) if self.preserve_aspect_ratio: if isinstance(self.output_size, (tuple, list)): # In that case we need to pad because we want to enforce both width and height if not self.symmetric_pad: - offset = (0, 0) + half_pad = (0, 0) elif self.output_size[0] == img.shape[0]: - offset = (0, int((self.output_size[1] - img.shape[1]) / 2)) - else: - offset = (int((self.output_size[0] - img.shape[0]) / 2), 0) + half_pad = (0, int((self.output_size[1] - img.shape[1]) / 2)) # Pad image - img = tf.image.pad_to_bounding_box(img, *offset, *self.output_size) - - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) - if target is not None: - target = np.clip(target, 0, 1) + img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) + def _prepare_targets(target: np.ndarray) -> np.ndarray: if self.symmetric_pad: - offset = offset[0] / img.shape[0], offset[1] / img.shape[1] + offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] if self.preserve_aspect_ratio: # Get absolute coords @@ -148,7 +145,25 @@ def __call__( target[..., 1] *= raw_shape[0] / img.shape[0] else: raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") - return tf.cast(img, dtype=input_dtype), target + return np.clip(target, 0, 1) + + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: + # Possible formats: + # KIE: Dict[str, np.ndarray] + # Built-in datasets: Dict[str, Union[np.ndarray, List[str]]] + # Custom datasets: np.ndarray + + if isinstance(target, dict): + # Built-in datasets + if "boxes" and "labels" in target.keys(): + target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] + return tf.cast(img, dtype=input_dtype), target + # KIE + else: + return tf.cast(img, dtype=input_dtype), {k: _prepare_targets(v) for k, v in target.items()} # type: ignore[arg-type] + # Custom datasets + return tf.cast(img, dtype=input_dtype), _prepare_targets(target) return tf.cast(img, dtype=input_dtype) diff --git a/references/classification/latency_tensorflow.py b/references/classification/latency_tensorflow.py index fc010df91a..6ccdefac18 100644 --- a/references/classification/latency_tensorflow.py +++ b/references/classification/latency_tensorflow.py @@ -20,7 +20,7 @@ def main(args): if args.gpu: - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) else: diff --git 
a/references/classification/train_tensorflow_character.py b/references/classification/train_tensorflow_character.py index 580cf6fb1b..c787c1957e 100644 --- a/references/classification/train_tensorflow_character.py +++ b/references/classification/train_tensorflow_character.py @@ -18,7 +18,7 @@ from doctr.models import login_to_hub, push_to_hf_hub -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index ad25713df7..d6e8c4c0a4 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -18,7 +18,7 @@ from doctr.models import login_to_hub, push_to_hf_hub -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/references/detection/evaluate_pytorch.py b/references/detection/evaluate_pytorch.py index 15f60df664..f5d3925092 100644 --- a/references/detection/evaluate_pytorch.py +++ b/references/detection/evaluate_pytorch.py @@ -82,7 +82,9 @@ def main(args): train=True, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize(input_shape), + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) # Monkeypatch subfolder = ds.root.split("/")[-2:] @@ -92,7 +94,9 @@ def main(args): train=False, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize(input_shape), + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) subfolder = _ds.root.split("/")[-2:] ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _ds.data]) @@ -155,6 +159,8 @@ def parse_args(): parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for evaluation") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument("--size", type=int, default=None, help="model input size, H = W") + parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") + parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading") parser.add_argument("--rotation", dest="rotation", action="store_true", help="inference with rotated bbox") parser.add_argument("--resume", type=str, default=None, help="Checkpoint to resume") diff --git a/references/detection/evaluate_tensorflow.py b/references/detection/evaluate_tensorflow.py index 139932f2c4..0eb70cf245 100644 --- a/references/detection/evaluate_tensorflow.py +++ b/references/detection/evaluate_tensorflow.py @@ -17,7 +17,7 @@ from tensorflow.keras import mixed_precision from tqdm import tqdm -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) @@ -81,7 +81,9 @@ def main(args): train=True, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize(input_shape[:2]), + sample_transforms=T.Resize( + input_shape[:2], 
preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) # Monkeypatch subfolder = ds.root.split("/")[-2:] @@ -91,7 +93,9 @@ def main(args): train=False, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize(input_shape[:2]), + sample_transforms=T.Resize( + input_shape[:2], preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) subfolder = _ds.root.split("/")[-2:] ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _ds.data]) @@ -129,6 +133,8 @@ def parse_args(): parser.add_argument("--dataset", type=str, default="FUNSD", help="Dataset to evaluate on") parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for evaluation") parser.add_argument("--size", type=int, default=None, help="model input size, H = W") + parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") + parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("--rotation", dest="rotation", action="store_true", help="inference with rotated bbox") parser.add_argument("--resume", type=str, default=None, help="Checkpoint to resume") parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true") diff --git a/references/detection/latency_tensorflow.py b/references/detection/latency_tensorflow.py index e3e0d1d8af..39c0cd6e36 100644 --- a/references/detection/latency_tensorflow.py +++ b/references/detection/latency_tensorflow.py @@ -20,7 +20,7 @@ def main(args): if args.gpu: - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) else: diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index 1312a6ea13..b2a6fd667e 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -19,7 +19,7 @@ from doctr.models import login_to_hub, push_to_hf_hub -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/references/recognition/evaluate_tensorflow.py b/references/recognition/evaluate_tensorflow.py index 62651245c4..b6acdabbb6 100644 --- a/references/recognition/evaluate_tensorflow.py +++ b/references/recognition/evaluate_tensorflow.py @@ -14,7 +14,7 @@ from tensorflow.keras import mixed_precision from tqdm import tqdm -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/references/recognition/latency_tensorflow.py b/references/recognition/latency_tensorflow.py index 405cf56892..318ff03fcb 100644 --- a/references/recognition/latency_tensorflow.py +++ b/references/recognition/latency_tensorflow.py @@ -20,7 +20,7 @@ def main(args): if args.gpu: - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) else: diff --git a/references/recognition/train_tensorflow.py b/references/recognition/train_tensorflow.py index 7f55142859..371c308aa2 100644 --- a/references/recognition/train_tensorflow.py 
+++ b/references/recognition/train_tensorflow.py @@ -20,7 +20,7 @@ from doctr.models import login_to_hub, push_to_hf_hub -gpu_devices = tf.config.experimental.list_physical_devices("GPU") +gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/scripts/analyze.py b/scripts/analyze.py index 94415267a2..fdffa30e48 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -16,7 +16,7 @@ if is_tf_available(): import tensorflow as tf - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/scripts/detect_text.py b/scripts/detect_text.py index f65b6685df..e3ca08c7b0 100644 --- a/scripts/detect_text.py +++ b/scripts/detect_text.py @@ -20,7 +20,7 @@ if is_tf_available(): import tensorflow as tf - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) diff --git a/scripts/evaluate.py b/scripts/evaluate.py index bc9459b727..b8568ef7e4 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -11,6 +11,7 @@ from tqdm import tqdm from doctr import datasets +from doctr import transforms as T from doctr.file_utils import is_tf_available from doctr.models import ocr_predictor from doctr.utils.geometry import extract_crops, extract_rcrops @@ -20,7 +21,7 @@ if is_tf_available(): import tensorflow as tf - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) else: @@ -35,12 +36,15 @@ def main(args): if not args.rotation: args.eval_straight = True + input_shape = (args.size, args.size) + predictor = ocr_predictor( args.detection, args.recognition, pretrained=True, reco_bs=args.batch_size, - preserve_aspect_ratio=False, + preserve_aspect_ratio=False, # we handle the transformation directly in the dataset so this is set to False + symmetric_pad=False, # we handle the transformation directly in the dataset so this is set to False assume_straight_pages=not args.rotation, ) @@ -48,11 +52,26 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, + sample_transforms=T.Resize(input_shape, preserve_aspect_ratio=True, symmetric_pad=True), ) sets = [testset] else: - train_set = datasets.__dict__[args.dataset](train=True, download=True, use_polygons=not args.eval_straight) - val_set = datasets.__dict__[args.dataset](train=False, download=True, use_polygons=not args.eval_straight) + train_set = datasets.__dict__[args.dataset]( + train=True, + download=True, + use_polygons=not args.eval_straight, + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), + ) + val_set = datasets.__dict__[args.dataset]( + train=False, + download=True, + use_polygons=not args.eval_straight, + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), + ) sets = [train_set, val_set] reco_metric = TextMatch() @@ -190,6 +209,9 @@ def parse_args(): parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels") parser.add_argument("--rotation", dest="rotation", 
action="store_true", help="run rotated OCR + postprocessing") parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition") + parser.add_argument("--size", type=int, default=1024, help="model input size, H = W") + parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") + parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples") parser.add_argument( "--eval-straight", diff --git a/scripts/evaluate_kie.py b/scripts/evaluate_kie.py index b3d75d9beb..6afe8584bb 100644 --- a/scripts/evaluate_kie.py +++ b/scripts/evaluate_kie.py @@ -13,6 +13,7 @@ from tqdm import tqdm from doctr import datasets +from doctr import transforms as T from doctr.file_utils import is_tf_available from doctr.models import kie_predictor from doctr.utils.geometry import extract_crops, extract_rcrops @@ -22,7 +23,7 @@ if is_tf_available(): import tensorflow as tf - gpu_devices = tf.config.experimental.list_physical_devices("GPU") + gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): tf.config.experimental.set_memory_growth(gpu_devices[0], True) else: @@ -37,12 +38,15 @@ def main(args): if not args.rotation: args.eval_straight = True + input_shape = (args.size, args.size) + predictor = kie_predictor( args.detection, args.recognition, pretrained=True, reco_bs=args.batch_size, - preserve_aspect_ratio=False, + preserve_aspect_ratio=False, # we handle the transformation directly in the dataset so this is set to False + symmetric_pad=False, # we handle the transformation directly in the dataset so this is set to False assume_straight_pages=not args.rotation, ) @@ -50,11 +54,26 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, + sample_transforms=T.Resize(input_shape, preserve_aspect_ratio=True, symmetric_pad=True), ) sets = [testset] else: - train_set = datasets.__dict__[args.dataset](train=True, download=True, use_polygons=not args.eval_straight) - val_set = datasets.__dict__[args.dataset](train=False, download=True, use_polygons=not args.eval_straight) + train_set = datasets.__dict__[args.dataset]( + train=True, + download=True, + use_polygons=not args.eval_straight, + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), + ) + val_set = datasets.__dict__[args.dataset]( + train=False, + download=True, + use_polygons=not args.eval_straight, + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), + ) sets = [train_set, val_set] reco_metric = TextMatch() @@ -187,6 +206,9 @@ def parse_args(): parser.add_argument("--label_file", type=str, default=None, help="Only for local sets, path to labels") parser.add_argument("--rotation", dest="rotation", action="store_true", help="run rotated OCR + postprocessing") parser.add_argument("-b", "--batch_size", type=int, default=32, help="batch size for recognition") + parser.add_argument("--size", type=int, default=1024, help="model input size, H = W") + parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") + parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("--samples", type=int, default=None, help="evaluate only on the N first samples") 
parser.add_argument( "--eval-straight", diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py index 2567dd8486..0a4d05b234 100644 --- a/tests/pytorch/test_transforms_pt.py +++ b/tests/pytorch/test_transforms_pt.py @@ -66,6 +66,50 @@ def test_resize(): out = transfo(input_t) assert out.dtype == torch.float16 + # --- Test with target (bounding boxes) --- + + # 1. Custom dataset (n_boxes, 4) format bounding boxes + target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]]) + output_size = (64, 64) + + transfo = Resize(output_size, preserve_aspect_ratio=True) + input_t = torch.ones((3, 32, 64), dtype=torch.float32) + out, new_target = transfo(input_t, target_boxes) + + assert out.shape[-2:] == output_size + assert new_target.shape == target_boxes.shape + assert np.all(new_target >= 0) and np.all(new_target <= 1) + + # 2. Built-in dataset: Dict with "boxes" and "labels" + target = {"boxes": np.array([[0.1, 0.1, 0.9, 0.9]]), "labels": ["text"]} + + transfo = Resize(output_size, preserve_aspect_ratio=True) + out, new_target = transfo(input_t, target) + + assert out.shape[-2:] == output_size + assert "boxes" in new_target + assert "labels" in new_target + assert new_target["boxes"].shape == target["boxes"].shape + assert new_target["labels"] == target["labels"] + assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) + + # 3. KIE dataset: Dict[str, np.ndarray] (key-value np.ndarray pairs) + target_kie = {"class_1": np.array([[0.1, 0.1, 0.9, 0.9]]), "class_2": np.array([[0.2, 0.2, 0.8, 0.8]])} + + transfo = Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True) + out, new_target_kie = transfo(input_t, target_kie) + + assert out.shape[-2:] == output_size + assert "class_1" in new_target_kie + assert "class_2" in new_target_kie + assert new_target_kie["class_1"].shape == target_kie["class_1"].shape + assert new_target_kie["class_2"].shape == target_kie["class_2"].shape + assert np.all(new_target_kie["class_1"] >= 0) and np.all(new_target_kie["class_1"] <= 1) + + # 4. No target (to ensure backward compatibility) + out = transfo(input_t) + assert out.shape[-2:] == output_size + @pytest.mark.parametrize( "rgb_min", diff --git a/tests/tensorflow/test_transforms_tf.py b/tests/tensorflow/test_transforms_tf.py index e53945f2e3..2101ca6208 100644 --- a/tests/tensorflow/test_transforms_tf.py +++ b/tests/tensorflow/test_transforms_tf.py @@ -48,6 +48,50 @@ def test_resize(): out = transfo(input_t) assert out.dtype == tf.float16 + # --- Test with target (bounding boxes) --- + + # 1. Custom dataset (n_boxes, 4) format bounding boxes + target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]]) + output_size = (64, 64) + + transfo = T.Resize(output_size, preserve_aspect_ratio=True) + input_t = tf.cast(tf.fill([64, 32, 3], 1), dtype=tf.float32) + out, new_target = transfo(input_t, target_boxes) + + assert out.shape[:2] == output_size + assert new_target.shape == target_boxes.shape + assert np.all(new_target >= 0) and np.all(new_target <= 1) + + # 2. 
Built-in dataset: Dict with "boxes" and "labels" + target = {"boxes": np.array([[0.1, 0.1, 0.9, 0.9]]), "labels": ["text"]} + + transfo = T.Resize(output_size, preserve_aspect_ratio=True) + out, new_target = transfo(input_t, target) + + assert out.shape[:2] == output_size + assert "boxes" in new_target + assert "labels" in new_target + assert new_target["boxes"].shape == target["boxes"].shape + assert new_target["labels"] == target["labels"] + assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) + + # 3. KIE dataset: Dict[str, np.ndarray] (key-value np.ndarray pairs) + target_kie = {"class_1": np.array([[0.1, 0.1, 0.9, 0.9]]), "class_2": np.array([[0.2, 0.2, 0.8, 0.8]])} + + transfo = T.Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True) + out, new_target_kie = transfo(input_t, target_kie) + + assert out.shape[:2] == output_size + assert "class_1" in new_target_kie + assert "class_2" in new_target_kie + assert new_target_kie["class_1"].shape == target_kie["class_1"].shape + assert new_target_kie["class_2"].shape == target_kie["class_2"].shape + assert np.all(new_target_kie["class_1"] >= 0) and np.all(new_target_kie["class_1"] <= 1) + + # 4. No target (to ensure backward compatibility) + out = transfo(input_t) + assert out.shape[:2] == output_size + def test_compose(): output_size = (16, 16) From f713ed23c59fd2b640ddd551de9b3f3fafbbe2c5 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 4 Sep 2024 12:14:52 +0200 Subject: [PATCH 4/7] mypy --- doctr/transforms/modules/pytorch.py | 6 +++++- doctr/transforms/modules/tensorflow.py | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index 266a5aea3f..f99f422874 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -264,7 +264,11 @@ def __init__( self.p = p self._resize = Resize - def forward(self, img: torch.Tensor, target: np.ndarray) -> Tuple[torch.Tensor, np.ndarray]: + def forward( + self, + img: torch.Tensor, + target: Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]], + ) -> Tuple[torch.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]: if torch.rand(1) < self.p: scale_h = np.random.uniform(*self.scale_range) scale_w = np.random.uniform(*self.scale_range) diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index 781937df30..23f035716f 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -107,6 +107,9 @@ def __call__( target: Optional[Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]] = None, ) -> Union[tf.Tensor, Tuple[tf.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]]: input_dtype = img.dtype + self.output_size = ( + (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size + ) img = tf.image.resize(img, self.wanted_size, self.method, self.preserve_aspect_ratio, self.antialias) # It will produce an un-padded resized image, with a side shorter than wanted if we preserve aspect ratio @@ -567,7 +570,11 @@ def __init__( self.p = p self._resize = Resize - def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: + def __call__( + self, + img: tf.Tensor, + target: Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]], + ) -> Tuple[tf.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]: if np.random.rand(1) <= self.p: scale_h = 
random.uniform(*self.scale_range) scale_w = random.uniform(*self.scale_range) From 61cd9c6366082a6efa6d94efa05a84f7cba8433e Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 4 Sep 2024 13:24:54 +0200 Subject: [PATCH 5/7] update --- doctr/transforms/modules/pytorch.py | 18 +++++------------- doctr/transforms/modules/tensorflow.py | 19 ++++++------------- scripts/evaluate.py | 4 +++- scripts/evaluate_kie.py | 4 +++- tests/pytorch/test_transforms_pt.py | 15 +-------------- tests/tensorflow/test_transforms_tf.py | 15 +-------------- 6 files changed, 19 insertions(+), 56 deletions(-) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index f99f422874..32e3adc71c 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -103,21 +103,13 @@ def _prepare_targets(target: np.ndarray) -> np.ndarray: # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: - # Possible formats: - # KIE: Dict[str, np.ndarray] - # Built-in datasets: Dict[str, Union[np.ndarray, List[str]]] - # Custom datasets: np.ndarray - - if isinstance(target, dict): + if isinstance(target, dict) and "boxes" in target.keys(): # Built-in datasets - if "boxes" and "labels" in target.keys(): - target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] - return img, target - # KIE - else: - return img, {k: _prepare_targets(v) for k, v in target.items()} # type: ignore[arg-type] + # NOTE: This is required for end-to-end evaluation + target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] + return img, target # Custom datasets - return img, _prepare_targets(target) + return img, _prepare_targets(target) # type: ignore[arg-type] return img diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index 23f035716f..001c03c40e 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -152,21 +152,14 @@ def _prepare_targets(target: np.ndarray) -> np.ndarray: # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) if target is not None: - # Possible formats: - # KIE: Dict[str, np.ndarray] - # Built-in datasets: Dict[str, Union[np.ndarray, List[str]]] - # Custom datasets: np.ndarray - - if isinstance(target, dict): + if isinstance(target, dict) and "boxes" in target.keys(): # Built-in datasets - if "boxes" and "labels" in target.keys(): - target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] - return tf.cast(img, dtype=input_dtype), target - # KIE - else: - return tf.cast(img, dtype=input_dtype), {k: _prepare_targets(v) for k, v in target.items()} # type: ignore[arg-type] + # NOTE: This is required for end-to-end evaluation + target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] + return tf.cast(img, dtype=input_dtype), target + # Custom datasets - return tf.cast(img, dtype=input_dtype), _prepare_targets(target) + return tf.cast(img, dtype=input_dtype), _prepare_targets(target) # type: ignore[arg-type] return tf.cast(img, dtype=input_dtype) diff --git a/scripts/evaluate.py b/scripts/evaluate.py index b8568ef7e4..512d740cbd 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -52,7 +52,9 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, - sample_transforms=T.Resize(input_shape, preserve_aspect_ratio=True, symmetric_pad=True), + 
sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) sets = [testset] else: diff --git a/scripts/evaluate_kie.py b/scripts/evaluate_kie.py index 6afe8584bb..edb75fe6ce 100644 --- a/scripts/evaluate_kie.py +++ b/scripts/evaluate_kie.py @@ -54,7 +54,9 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, - sample_transforms=T.Resize(input_shape, preserve_aspect_ratio=True, symmetric_pad=True), + sample_transforms=T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + ), ) sets = [testset] else: diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py index 0a4d05b234..34a13b961c 100644 --- a/tests/pytorch/test_transforms_pt.py +++ b/tests/pytorch/test_transforms_pt.py @@ -93,20 +93,7 @@ def test_resize(): assert new_target["labels"] == target["labels"] assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) - # 3. KIE dataset: Dict[str, np.ndarray] (key-value np.ndarray pairs) - target_kie = {"class_1": np.array([[0.1, 0.1, 0.9, 0.9]]), "class_2": np.array([[0.2, 0.2, 0.8, 0.8]])} - - transfo = Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True) - out, new_target_kie = transfo(input_t, target_kie) - - assert out.shape[-2:] == output_size - assert "class_1" in new_target_kie - assert "class_2" in new_target_kie - assert new_target_kie["class_1"].shape == target_kie["class_1"].shape - assert new_target_kie["class_2"].shape == target_kie["class_2"].shape - assert np.all(new_target_kie["class_1"] >= 0) and np.all(new_target_kie["class_1"] <= 1) - - # 4. No target (to ensure backward compatibility) + # 3. No target (to ensure backward compatibility) out = transfo(input_t) assert out.shape[-2:] == output_size diff --git a/tests/tensorflow/test_transforms_tf.py b/tests/tensorflow/test_transforms_tf.py index 2101ca6208..0c818ba9c4 100644 --- a/tests/tensorflow/test_transforms_tf.py +++ b/tests/tensorflow/test_transforms_tf.py @@ -75,20 +75,7 @@ def test_resize(): assert new_target["labels"] == target["labels"] assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) - # 3. KIE dataset: Dict[str, np.ndarray] (key-value np.ndarray pairs) - target_kie = {"class_1": np.array([[0.1, 0.1, 0.9, 0.9]]), "class_2": np.array([[0.2, 0.2, 0.8, 0.8]])} - - transfo = T.Resize(output_size, preserve_aspect_ratio=True, symmetric_pad=True) - out, new_target_kie = transfo(input_t, target_kie) - - assert out.shape[:2] == output_size - assert "class_1" in new_target_kie - assert "class_2" in new_target_kie - assert new_target_kie["class_1"].shape == target_kie["class_1"].shape - assert new_target_kie["class_2"].shape == target_kie["class_2"].shape - assert np.all(new_target_kie["class_1"] >= 0) and np.all(new_target_kie["class_1"] <= 1) - - # 4. No target (to ensure backward compatibility) + # 3. 
No target (to ensure backward compatibility) out = transfo(input_t) assert out.shape[:2] == output_size From a8b394fd8883786f02714d2fc92d73e5220c5fda Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 5 Sep 2024 10:45:26 +0200 Subject: [PATCH 6/7] revert to transformation instead of Resize modifications --- doctr/transforms/modules/pytorch.py | 26 ++++++--------------- doctr/transforms/modules/tensorflow.py | 25 +++++++------------- references/detection/evaluate_pytorch.py | 10 ++------ references/detection/evaluate_tensorflow.py | 10 ++------ scripts/evaluate.py | 21 ++++++++++------- scripts/evaluate_kie.py | 21 ++++++++++------- tests/pytorch/test_transforms_pt.py | 15 ------------ tests/tensorflow/test_transforms_tf.py | 15 ------------ 8 files changed, 43 insertions(+), 100 deletions(-) diff --git a/doctr/transforms/modules/pytorch.py b/doctr/transforms/modules/pytorch.py index 32e3adc71c..639b27e2cf 100644 --- a/doctr/transforms/modules/pytorch.py +++ b/doctr/transforms/modules/pytorch.py @@ -4,7 +4,7 @@ # See LICENSE or go to for full license details. import math -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Tuple, Union import numpy as np import torch @@ -38,8 +38,8 @@ def __init__( def forward( self, img: torch.Tensor, - target: Optional[Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]]: + target: Optional[np.ndarray] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, np.ndarray]]: if isinstance(self.size, int): target_ratio = img.shape[-2] / img.shape[-1] else: @@ -77,7 +77,8 @@ def forward( # Pad image img = pad(img, _pad) - def _prepare_targets(target: np.ndarray) -> np.ndarray: + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: if self.symmetric_pad: offset = half_pad[0] / img.shape[-1], half_pad[1] / img.shape[-2] @@ -99,17 +100,8 @@ def _prepare_targets(target: np.ndarray) -> np.ndarray: target[..., 1] *= raw_shape[-2] / img.shape[-2] else: raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") - return np.clip(target, 0, 1) - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) - if target is not None: - if isinstance(target, dict) and "boxes" in target.keys(): - # Built-in datasets - # NOTE: This is required for end-to-end evaluation - target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] - return img, target - # Custom datasets - return img, _prepare_targets(target) # type: ignore[arg-type] + return img, np.clip(target, 0, 1) return img @@ -256,11 +248,7 @@ def __init__( self.p = p self._resize = Resize - def forward( - self, - img: torch.Tensor, - target: Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]], - ) -> Tuple[torch.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]: + def forward(self, img: torch.Tensor, target: np.ndarray) -> Tuple[torch.Tensor, np.ndarray]: if torch.rand(1) < self.p: scale_h = np.random.uniform(*self.scale_range) scale_w = np.random.uniform(*self.scale_range) diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index 001c03c40e..1496abb31c 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -4,7 +4,7 @@ # See LICENSE or go to for full license details. 
import random -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Union import numpy as np import tensorflow as tf @@ -104,8 +104,8 @@ def extra_repr(self) -> str: def __call__( self, img: tf.Tensor, - target: Optional[Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]] = None, - ) -> Union[tf.Tensor, Tuple[tf.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]]: + target: Optional[np.ndarray] = None, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: input_dtype = img.dtype self.output_size = ( (self.output_size, self.output_size) if isinstance(self.output_size, int) else self.output_size @@ -126,7 +126,8 @@ def __call__( # Pad image img = tf.image.pad_to_bounding_box(img, *half_pad, *self.output_size) - def _prepare_targets(target: np.ndarray) -> np.ndarray: + # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) + if target is not None: if self.symmetric_pad: offset = half_pad[0] / img.shape[0], half_pad[1] / img.shape[1] @@ -148,18 +149,8 @@ def _prepare_targets(target: np.ndarray) -> np.ndarray: target[..., 1] *= raw_shape[0] / img.shape[0] else: raise AssertionError("Boxes should be in the format (n_boxes, 4, 2) or (n_boxes, 4)") - return np.clip(target, 0, 1) - - # In case boxes are provided, resize boxes if needed (for detection task if preserve aspect ratio) - if target is not None: - if isinstance(target, dict) and "boxes" in target.keys(): - # Built-in datasets - # NOTE: This is required for end-to-end evaluation - target["boxes"] = _prepare_targets(target["boxes"]) # type: ignore[arg-type] - return tf.cast(img, dtype=input_dtype), target - # Custom datasets - return tf.cast(img, dtype=input_dtype), _prepare_targets(target) # type: ignore[arg-type] + return tf.cast(img, dtype=input_dtype), np.clip(target, 0, 1) return tf.cast(img, dtype=input_dtype) @@ -566,8 +557,8 @@ def __init__( def __call__( self, img: tf.Tensor, - target: Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]], - ) -> Tuple[tf.Tensor, Union[np.ndarray, Dict[str, Union[np.ndarray, List[str]]]]]: + target: Optional[np.ndarray], + ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: if np.random.rand(1) <= self.p: scale_h = random.uniform(*self.scale_range) scale_w = random.uniform(*self.scale_range) diff --git a/references/detection/evaluate_pytorch.py b/references/detection/evaluate_pytorch.py index f5d3925092..15f60df664 100644 --- a/references/detection/evaluate_pytorch.py +++ b/references/detection/evaluate_pytorch.py @@ -82,9 +82,7 @@ def main(args): train=True, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=T.Resize(input_shape), ) # Monkeypatch subfolder = ds.root.split("/")[-2:] @@ -94,9 +92,7 @@ def main(args): train=False, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=T.Resize(input_shape), ) subfolder = _ds.root.split("/")[-2:] ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _ds.data]) @@ -159,8 +155,6 @@ def parse_args(): parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for evaluation") parser.add_argument("--device", default=None, type=int, help="device") parser.add_argument("--size", type=int, 
default=None, help="model input size, H = W") - parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") - parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("-j", "--workers", type=int, default=None, help="number of workers used for dataloading") parser.add_argument("--rotation", dest="rotation", action="store_true", help="inference with rotated bbox") parser.add_argument("--resume", type=str, default=None, help="Checkpoint to resume") diff --git a/references/detection/evaluate_tensorflow.py b/references/detection/evaluate_tensorflow.py index 0eb70cf245..5496db655f 100644 --- a/references/detection/evaluate_tensorflow.py +++ b/references/detection/evaluate_tensorflow.py @@ -81,9 +81,7 @@ def main(args): train=True, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize( - input_shape[:2], preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=T.Resize(input_shape[:2]), ) # Monkeypatch subfolder = ds.root.split("/")[-2:] @@ -93,9 +91,7 @@ def main(args): train=False, download=True, use_polygons=args.rotation, - sample_transforms=T.Resize( - input_shape[:2], preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=T.Resize(input_shape[:2]), ) subfolder = _ds.root.split("/")[-2:] ds.data.extend([(os.path.join(*subfolder, name), target) for name, target in _ds.data]) @@ -133,8 +129,6 @@ def parse_args(): parser.add_argument("--dataset", type=str, default="FUNSD", help="Dataset to evaluate on") parser.add_argument("-b", "--batch_size", type=int, default=2, help="batch size for evaluation") parser.add_argument("--size", type=int, default=None, help="model input size, H = W") - parser.add_argument("--keep_ratio", action="store_true", help="keep the aspect ratio of the input image") - parser.add_argument("--symmetric_pad", action="store_true", help="pad the image symmetrically") parser.add_argument("--rotation", dest="rotation", action="store_true", help="inference with rotated bbox") parser.add_argument("--resume", type=str, default=None, help="Checkpoint to resume") parser.add_argument("--amp", dest="amp", help="Use Automatic Mixed Precision", action="store_true") diff --git a/scripts/evaluate.py b/scripts/evaluate.py index 512d740cbd..86dbc0e561 100644 --- a/scripts/evaluate.py +++ b/scripts/evaluate.py @@ -38,6 +38,15 @@ def main(args): input_shape = (args.size, args.size) + # We define a transformation function which does transform the annotation + # to the required format for the Resize transformation + def _transform(img, target): + boxes = target["boxes"] + transformed_img, transformed_boxes = T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + )(img, boxes) + return transformed_img, {"boxes": transformed_boxes, "labels": target["labels"]} + predictor = ocr_predictor( args.detection, args.recognition, @@ -52,9 +61,7 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=_transform, ) sets = [testset] else: @@ -62,17 +69,13 @@ def main(args): train=True, download=True, use_polygons=not args.eval_straight, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + 
sample_transforms=_transform, ) val_set = datasets.__dict__[args.dataset]( train=False, download=True, use_polygons=not args.eval_straight, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=_transform, ) sets = [train_set, val_set] diff --git a/scripts/evaluate_kie.py b/scripts/evaluate_kie.py index edb75fe6ce..ca17332e2c 100644 --- a/scripts/evaluate_kie.py +++ b/scripts/evaluate_kie.py @@ -40,6 +40,15 @@ def main(args): input_shape = (args.size, args.size) + # We define a transformation function which does transform the annotation + # to the required format for the Resize transformation + def _transform(img, target): + boxes = target["boxes"] + transformed_img, transformed_boxes = T.Resize( + input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad + )(img, boxes) + return transformed_img, {"boxes": transformed_boxes, "labels": target["labels"]} + predictor = kie_predictor( args.detection, args.recognition, @@ -54,9 +63,7 @@ def main(args): testset = datasets.OCRDataset( img_folder=args.img_folder, label_file=args.label_file, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=_transform, ) sets = [testset] else: @@ -64,17 +71,13 @@ def main(args): train=True, download=True, use_polygons=not args.eval_straight, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=_transform, ) val_set = datasets.__dict__[args.dataset]( train=False, download=True, use_polygons=not args.eval_straight, - sample_transforms=T.Resize( - input_shape, preserve_aspect_ratio=args.keep_ratio, symmetric_pad=args.symmetric_pad - ), + sample_transforms=_transform, ) sets = [train_set, val_set] diff --git a/tests/pytorch/test_transforms_pt.py b/tests/pytorch/test_transforms_pt.py index 34a13b961c..3c11412556 100644 --- a/tests/pytorch/test_transforms_pt.py +++ b/tests/pytorch/test_transforms_pt.py @@ -68,7 +68,6 @@ def test_resize(): # --- Test with target (bounding boxes) --- - # 1. Custom dataset (n_boxes, 4) format bounding boxes target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]]) output_size = (64, 64) @@ -80,20 +79,6 @@ def test_resize(): assert new_target.shape == target_boxes.shape assert np.all(new_target >= 0) and np.all(new_target <= 1) - # 2. Built-in dataset: Dict with "boxes" and "labels" - target = {"boxes": np.array([[0.1, 0.1, 0.9, 0.9]]), "labels": ["text"]} - - transfo = Resize(output_size, preserve_aspect_ratio=True) - out, new_target = transfo(input_t, target) - - assert out.shape[-2:] == output_size - assert "boxes" in new_target - assert "labels" in new_target - assert new_target["boxes"].shape == target["boxes"].shape - assert new_target["labels"] == target["labels"] - assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) - - # 3. No target (to ensure backward compatibility) out = transfo(input_t) assert out.shape[-2:] == output_size diff --git a/tests/tensorflow/test_transforms_tf.py b/tests/tensorflow/test_transforms_tf.py index 0c818ba9c4..5fa87eab8a 100644 --- a/tests/tensorflow/test_transforms_tf.py +++ b/tests/tensorflow/test_transforms_tf.py @@ -50,7 +50,6 @@ def test_resize(): # --- Test with target (bounding boxes) --- - # 1. 
Custom dataset (n_boxes, 4) format bounding boxes target_boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]]) output_size = (64, 64) @@ -62,20 +61,6 @@ def test_resize(): assert new_target.shape == target_boxes.shape assert np.all(new_target >= 0) and np.all(new_target <= 1) - # 2. Built-in dataset: Dict with "boxes" and "labels" - target = {"boxes": np.array([[0.1, 0.1, 0.9, 0.9]]), "labels": ["text"]} - - transfo = T.Resize(output_size, preserve_aspect_ratio=True) - out, new_target = transfo(input_t, target) - - assert out.shape[:2] == output_size - assert "boxes" in new_target - assert "labels" in new_target - assert new_target["boxes"].shape == target["boxes"].shape - assert new_target["labels"] == target["labels"] - assert np.all(new_target["boxes"] >= 0) and np.all(new_target["boxes"] <= 1) - - # 3. No target (to ensure backward compatibility) out = transfo(input_t) assert out.shape[:2] == output_size From 99222b4a18dd93ae0d0d6399a6640edae34f3197 Mon Sep 17 00:00:00 2001 From: felix Date: Thu, 5 Sep 2024 10:48:24 +0200 Subject: [PATCH 7/7] typings --- doctr/transforms/modules/tensorflow.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/doctr/transforms/modules/tensorflow.py b/doctr/transforms/modules/tensorflow.py index 1496abb31c..4b00a9359f 100644 --- a/doctr/transforms/modules/tensorflow.py +++ b/doctr/transforms/modules/tensorflow.py @@ -554,11 +554,7 @@ def __init__( self.p = p self._resize = Resize - def __call__( - self, - img: tf.Tensor, - target: Optional[np.ndarray], - ) -> Union[tf.Tensor, Tuple[tf.Tensor, np.ndarray]]: + def __call__(self, img: tf.Tensor, target: np.ndarray) -> Tuple[tf.Tensor, np.ndarray]: if np.random.rand(1) <= self.p: scale_h = random.uniform(*self.scale_range) scale_w = random.uniform(*self.scale_range)
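
Taken together, the series leaves Resize accepting an optional relative-box target that it rescales to the padded canvas and clips to [0, 1]. A minimal sketch of the resulting behaviour on the PyTorch side, mirroring the new unit test (the 32x64 input and 64x64 output size are illustrative only):

    import numpy as np
    import torch

    from doctr.transforms import Resize

    # Resize a 32x64 image into a 64x64 canvas, keeping the aspect ratio and
    # padding symmetrically: the content ends up in rows 16..48, so a relative
    # y of 0.0 maps to 16/64 = 0.25 and 1.0 maps to 0.75, i.e.
    # new_y = offset + y * raw_h / new_h, with offset = half_pad / new_h.
    transfo = Resize((64, 64), preserve_aspect_ratio=True, symmetric_pad=True)

    input_t = torch.ones((3, 32, 64), dtype=torch.float32)
    boxes = np.array([[0.1, 0.1, 0.9, 0.9], [0.2, 0.2, 0.8, 0.8]])  # relative xyxy

    out, new_boxes = transfo(input_t, boxes)
    assert out.shape[-2:] == (64, 64)
    assert np.all(new_boxes >= 0) and np.all(new_boxes <= 1)  # boxes are clipped

    # Without a target the call is unchanged (backward compatible)
    out = transfo(input_t)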
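
Since patch 6 reverts Resize to plain np.ndarray targets, datasets that yield {"boxes", "labels"} dict targets are bridged by a small wrapper, as done in scripts/evaluate.py and scripts/evaluate_kie.py. The same pattern can be reused with any built-in dataset; FUNSD below is only an example:

    from doctr import datasets
    from doctr import transforms as T

    input_shape = (1024, 1024)

    def _transform(img, target):
        # Resize image and boxes together, then rebuild the dict target
        boxes = target["boxes"]
        transformed_img, transformed_boxes = T.Resize(
            input_shape, preserve_aspect_ratio=True, symmetric_pad=True
        )(img, boxes)
        return transformed_img, {"boxes": transformed_boxes, "labels": target["labels"]}

    val_set = datasets.FUNSD(train=False, download=True, sample_transforms=_transform)

Keeping the box handling in a dataset-side wrapper, rather than teaching Resize about dict targets, is what lets the transform keep its simple array-in, array-out signature after the revert.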
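
With the new --size, --keep_ratio and --symmetric_pad flags wired through, an end-to-end evaluation whose preprocessing matches training might be launched as follows (the model names are illustrative, and the positional detection/recognition arguments are assumed from the script's existing interface):

    python scripts/evaluate.py db_resnet50 crnn_vgg16_bn --dataset FUNSD --size 1024 --keep_ratio --symmetric_pad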