From d41ba359a3456c560088eb8658f3844805cb1499 Mon Sep 17 00:00:00 2001 From: Felix Dittrich Date: Thu, 23 Jan 2025 14:56:49 +0100 Subject: [PATCH] [references] Update Logging (#1847) --- .../classification/train_pytorch_character.py | 124 ++++++++++++------ .../train_pytorch_orientation.py | 124 ++++++++++++------ .../train_tensorflow_character.py | 119 ++++++++++++----- .../train_tensorflow_orientation.py | 119 ++++++++++++----- references/detection/evaluate_pytorch.py | 14 +- references/detection/evaluate_tensorflow.py | 11 +- references/detection/train_pytorch.py | 122 +++++++++++------ references/detection/train_pytorch_ddp.py | 68 +++++----- references/detection/train_tensorflow.py | 120 ++++++++++++----- references/recognition/evaluate_pytorch.py | 24 ++-- references/recognition/evaluate_tensorflow.py | 20 ++- references/recognition/train_pytorch.py | 122 +++++++++++------ references/recognition/train_pytorch_ddp.py | 62 +++++---- references/recognition/train_tensorflow.py | 120 ++++++++++++----- 14 files changed, 802 insertions(+), 367 deletions(-) diff --git a/references/classification/train_pytorch_character.py b/references/classification/train_pytorch_character.py index de4b19731..715ebe737 100644 --- a/references/classification/train_pytorch_character.py +++ b/references/classification/train_pytorch_character.py @@ -110,18 +110,14 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, log=None): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0.0, 0.0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -143,24 +139,28 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a train_loss = cross_entropy(out, targets) train_loss.backward() optimizer.step() + scheduler.step() + last_lr = scheduler.get_last_lr()[0] + + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + log(train_loss=train_loss.item(), lr=last_lr) + + epoch_train_loss += train_loss.item() + batch_cnt += 1 - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() -def evaluate(model, val_loader, batch_transforms, amp=False): +def evaluate(model, val_loader, batch_transforms, amp=False, log=None): # Model in eval mode model.eval() # Validation loop val_loss, correct, samples, batch_cnt = 0, 0, 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) if torch.cuda.is_available(): @@ -177,6 +177,9 @@ def evaluate(model, val_loader, batch_transforms, amp=False): # Compute metric correct += (out.argmax(dim=1) == targets).sum().item() + pbar.set_description(f"Validation loss: {loss.item():.6}") + log(val_loss=loss.item()) + val_loss += loss.item() batch_cnt += 1 samples += images.shape[0] @@ -187,7 +190,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -222,7 +226,7 @@ def main(args): sampler=SequentialSampler(val_set), pin_memory=torch.cuda.is_available(), ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -231,7 +235,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -251,9 +255,9 @@ def main(args): model = model.cuda() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -286,7 +290,7 @@ def main(args): sampler=RandomSampler(train_set), pin_memory=torch.cuda.is_available(), ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -343,24 +347,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb - run = wandb.init( - name=exp_name, - project="character-classification", - config=config, - ) + run = wandb.init(name=exp_name, project="character-classification", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -368,21 +409,25 @@ def main(args): if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, log=log_at_step ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, acc = evaluate(model, val_loader, batch_transforms) + val_loss, acc = evaluate(model, val_loader, batch_transforms, log=log_at_step) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "acc": acc, }) @@ -391,12 +436,15 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() @@ -404,11 +452,11 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") dummy_batch = next(iter(val_loader)) dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0] model_path = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_pytorch_orientation.py b/references/classification/train_pytorch_orientation.py index dbec4886b..13df5f843 100644 --- a/references/classification/train_pytorch_orientation.py +++ b/references/classification/train_pytorch_orientation.py @@ -121,18 +121,14 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, log=None): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0.0, 0.0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -154,24 +150,28 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a train_loss = cross_entropy(out, targets) train_loss.backward() optimizer.step() + scheduler.step() + last_lr = scheduler.get_last_lr()[0] + + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + log(train_loss=train_loss.item(), lr=last_lr) + + epoch_train_loss += train_loss.item() + batch_cnt += 1 - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() -def evaluate(model, val_loader, batch_transforms, amp=False): +def evaluate(model, val_loader, batch_transforms, amp=False, log=None): # Model in eval mode model.eval() # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) if torch.cuda.is_available(): @@ -188,6 +188,9 @@ def evaluate(model, val_loader, batch_transforms, amp=False): # Compute metric correct += (out.argmax(dim=1) == targets).sum().item() + pbar.set_description(f"Validation loss: {loss.item():.6}") + log(val_loss=loss.item()) + val_loss += loss.item() batch_cnt += 1 samples += images.shape[0] @@ -198,7 +201,8 @@ def evaluate(model, val_loader, batch_transforms, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -230,7 +234,7 @@ def main(args): sampler=SequentialSampler(val_set), pin_memory=torch.cuda.is_available(), ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -239,7 +243,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -259,9 +263,9 @@ def main(args): model = model.cuda() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -292,7 +296,7 @@ def main(args): sampler=RandomSampler(train_set), pin_memory=torch.cuda.is_available(), ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -349,24 +353,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb - run = wandb.init( - name=exp_name, - project="orientation-classification", - config=config, - ) + run = wandb.init(name=exp_name, project="orientation-classification", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -374,21 +415,25 @@ def main(args): if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, log=log_at_step ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, acc = evaluate(model, val_loader, batch_transforms) + val_loss, acc = evaluate(model, val_loader, batch_transforms, log=log_at_step) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "acc": acc, }) @@ -397,12 +442,15 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() @@ -410,11 +458,11 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") dummy_batch = next(iter(val_loader)) dummy_input = dummy_batch[0].cuda() if torch.cuda.is_available() else dummy_batch[0] model_path = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_tensorflow_character.py b/references/classification/train_tensorflow_character.py index 5d7dfe728..a46fa24f6 100644 --- a/references/classification/train_tensorflow_character.py +++ b/references/classification/train_tensorflow_character.py @@ -96,14 +96,11 @@ def apply_grads(optimizer, grads, model): optimizer.apply_gradients(zip(grads, model.trainable_weights)) -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False): - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, log=None): + train_iter = iter(train_loader) # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -115,26 +112,34 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, c grads = optimizer.get_unscaled_gradients(grads) apply_grads(optimizer, grads, model) - pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration - ) - iteration += 1 + last_lr = optimizer.learning_rate.numpy().item() + train_loss = train_loss.numpy().mean() + + pbar.set_description(f"Training loss: {train_loss:.6} | LR: {last_lr:.6}") + log(train_loss=train_loss, lr=last_lr) + + epoch_train_loss += train_loss + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr -def evaluate(model, val_loader, batch_transforms): +def evaluate(model, val_loader, batch_transforms, log=None): # Validation loop val_loss, correct, samples, batch_cnt = 0, 0, 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) # Compute metric correct += int((out.numpy().argmax(1) == targets.numpy()).sum()) + pbar.set_description(f"Validation loss: {loss.numpy().mean():.6}") + log(val_loss=loss.numpy().mean()) + val_loss += loss.numpy().mean() batch_cnt += 1 samples += images.shape[0] @@ -152,7 +157,8 @@ def collate_fn(samples): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -185,7 +191,7 @@ def main(args): drop_last=False, collate_fn=collate_fn, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -207,9 +213,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -240,7 +246,7 @@ def main(args): drop_last=True, collate_fn=collate_fn, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -315,19 +321,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb run = wandb.init(name=exp_name, project="character-classification", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) + # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/character-classification", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -336,19 +384,25 @@ def main(args): if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml) + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, args.amp, log=log_at_step + ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, acc = evaluate(model, val_loader, batch_transforms) + val_loss, acc = evaluate(model, val_loader, batch_transforms, log=log_at_step) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "acc": acc, }) @@ -357,12 +411,15 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() @@ -370,7 +427,7 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") if args.arch == "vit_b": # fixed batch size for vit dummy_input = [tf.TensorSpec([1, args.input_size, args.input_size, 3], tf.float32, name="input")] @@ -378,7 +435,7 @@ def main(args): # dynamic batch size dummy_input = [tf.TensorSpec([None, args.input_size, args.input_size, 3], tf.float32, name="input")] model_path, _ = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/classification/train_tensorflow_orientation.py b/references/classification/train_tensorflow_orientation.py index b74a4be77..87c46b62d 100644 --- a/references/classification/train_tensorflow_orientation.py +++ b/references/classification/train_tensorflow_orientation.py @@ -110,14 +110,11 @@ def apply_grads(optimizer, grads, model): optimizer.apply_gradients(zip(grads, model.trainable_weights)) -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False): - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, log=None): + train_iter = iter(train_loader) # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -129,26 +126,34 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, c grads = optimizer.get_unscaled_gradients(grads) apply_grads(optimizer, grads, model) - pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration - ) - iteration += 1 + last_lr = optimizer.learning_rate.numpy().item() + train_loss = train_loss.numpy().mean() + + pbar.set_description(f"Training loss: {train_loss:.6} | LR: {last_lr:.6}") + log(train_loss=train_loss, lr=last_lr) + + epoch_train_loss += train_loss + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr -def evaluate(model, val_loader, batch_transforms): +def evaluate(model, val_loader, batch_transforms, log=None): # Validation loop val_loss, correct, samples, batch_cnt = 0.0, 0.0, 0.0, 0.0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, training=False) loss = tf.nn.sparse_softmax_cross_entropy_with_logits(targets, out) # Compute metric correct += int((out.numpy().argmax(1) == targets.numpy()).sum()) + pbar.set_description(f"Validation loss: {loss.numpy().mean():.6}") + log(val_loss=loss.numpy().mean()) + val_loss += loss.numpy().mean() batch_cnt += 1 samples += images.shape[0] @@ -166,7 +171,8 @@ def collate_fn(samples): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -196,7 +202,7 @@ def main(args): drop_last=False, collate_fn=collate_fn, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -218,9 +224,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, acc = evaluate(model, val_loader, batch_transforms) - print(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Acc: {acc:.2%})") return st = time.time() @@ -250,7 +256,7 @@ def main(args): drop_last=True, collate_fn=collate_fn, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -324,19 +330,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb run = wandb.init(name=exp_name, project="orientation-classification", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) + # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/orientation-classification", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -345,19 +393,25 @@ def main(args): if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml) + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, args.amp, log=log_at_step + ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, acc = evaluate(model, val_loader, batch_transforms) + val_loss, acc = evaluate(model, val_loader, batch_transforms, log=log_at_step) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} (Acc: {acc:.2%})") + # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "acc": acc, }) @@ -366,12 +420,15 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Accuracy", series="acc", value=acc, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() @@ -379,7 +436,7 @@ def main(args): push_to_hf_hub(model, exp_name, task="classification", run_config=args) if args.export_onnx: - print("Exporting model to ONNX...") + pbar.write("Exporting model to ONNX...") if args.arch in ["vit_s", "vit_b"]: # fixed batch size for vit dummy_input = [tf.TensorSpec([1, *(input_size), 3], tf.float32, name="input")] @@ -387,7 +444,7 @@ def main(args): # dynamic batch size dummy_input = [tf.TensorSpec([None, *(input_size), 3], tf.float32, name="input")] model_path, _ = export_model_to_onnx(model, exp_name, dummy_input) - print(f"Exported model saved in {model_path}") + pbar.write(f"Exported model saved in {model_path}") def parse_args(): diff --git a/references/detection/evaluate_pytorch.py b/references/detection/evaluate_pytorch.py index 1a0a9b32a..23476e5b6 100644 --- a/references/detection/evaluate_pytorch.py +++ b/references/detection/evaluate_pytorch.py @@ -9,7 +9,6 @@ os.environ["USE_TORCH"] = "1" -import logging import multiprocessing as mp import time from pathlib import Path @@ -63,7 +62,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if not isinstance(args.workers, int): args.workers = min(16, mp.cpu_count()) @@ -116,13 +116,13 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=ds.collate_fn, ) - print(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") + pbar.write(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") batch_transforms = Normalize(mean=mean, std=std) # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -136,7 +136,7 @@ def main(args): elif torch.cuda.is_available(): args.device = 0 else: - logging.warning("No accessible GPU, targe device set to CPU.") + pbar.write("No accessible GPU, target device set to CPU.") if torch.cuda.is_available(): torch.cuda.set_device(args.device) model = model.cuda() @@ -144,9 +144,9 @@ def main(args): # Metrics metric = LocalizationConfusion(use_polygons=args.rotation) - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, test_loader, batch_transforms, metric, amp=args.amp) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" ) diff --git a/references/detection/evaluate_tensorflow.py b/references/detection/evaluate_tensorflow.py index b7f031db6..407a2b255 100644 --- a/references/detection/evaluate_tensorflow.py +++ b/references/detection/evaluate_tensorflow.py @@ -61,7 +61,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) # AMP if args.amp: @@ -78,7 +79,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") model.load_weights(args.resume).expect_partial() input_shape = model.cfg["input_shape"] if input_shape is None else input_shape @@ -116,16 +117,16 @@ def main(args): drop_last=False, shuffle=False, ) - print(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") + pbar.write(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") batch_transforms = T.Normalize(mean=mean, std=std) # Metrics metric = LocalizationConfusion(use_polygons=args.rotation) - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, test_loader, batch_transforms, metric) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" ) diff --git a/references/detection/train_pytorch.py b/references/detection/train_pytorch.py index 6b0e9b692..2dc0593b0 100644 --- a/references/detection/train_pytorch.py +++ b/references/detection/train_pytorch.py @@ -103,18 +103,14 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, log=None): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -138,25 +134,28 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a optimizer.step() scheduler.step() + last_lr = scheduler.get_last_lr()[0] - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + log(train_loss=train_loss.item(), lr=last_lr) + + epoch_train_loss += train_loss.item() + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() -def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): +def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=None): # Model in eval mode model.eval() # Reset val metric val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -174,6 +173,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + log(val_loss=out["loss"].item()) + val_loss += out["loss"].item() batch_cnt += 1 @@ -183,7 +185,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -224,7 +227,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -239,7 +242,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -262,9 +265,9 @@ def main(args): val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -331,7 +334,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -397,24 +400,61 @@ def main(args): "amp": args.amp, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb - run = wandb.init( - name=exp_name, - project="text-detection", - config=config, - ) + run = wandb.init(name=exp_name, project="text-detection", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -423,28 +463,33 @@ def main(args): # Training loop for epoch in range(args.epochs): - fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, log=log_at_step ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) + val_loss, recall, precision, mean_iou = evaluate( + model, val_loader, batch_transforms, val_metric, amp=args.amp, log=log_at_step + ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss if args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}_epoch{epoch + 1}.pt") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "recall": recall, "precision": precision, "mean_iou": mean_iou, @@ -455,14 +500,17 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Recall", series="recall", value=recall, iteration=epoch) logger.report_scalar(title="Precision", series="precision", value=precision, iteration=epoch) logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() diff --git a/references/detection/train_pytorch_ddp.py b/references/detection/train_pytorch_ddp.py index a96a6dc3b..85607c2a7 100644 --- a/references/detection/train_pytorch_ddp.py +++ b/references/detection/train_pytorch_ddp.py @@ -109,18 +109,14 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -144,14 +140,14 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a optimizer.step() scheduler.step() + last_lr = scheduler.get_last_lr()[0] - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + epoch_train_loss += train_loss.item() + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() @@ -162,7 +158,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, args, amp=False): val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -180,6 +177,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, args, amp=False): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -195,8 +194,8 @@ def main(rank: int, world_size: int, args): world_size (int): number of processes participating in the job args: other arguments passed through the CLI """ - - print(args) + pbar = tqdm(disable=True) + pbar.write(args) if rank == 0 and args.push_to_hub: login_to_hub() @@ -239,7 +238,9 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: val_hash = hashlib.sha256(f.read()).hexdigest() @@ -258,7 +259,7 @@ def main(rank: int, world_size: int, args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -275,11 +276,11 @@ def main(rank: int, world_size: int, args): val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if rank == 0 and args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate( model, val_loader, batch_transforms, val_metric, args, amp=args.amp ) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -346,7 +347,7 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") with open(os.path.join(args.train_path, "labels.json"), "rb") as f: train_hash = hashlib.sha256(f.read()).hexdigest() @@ -429,8 +430,6 @@ def main(rank: int, world_size: int, args): task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 # Create loss queue min_loss = np.inf @@ -439,9 +438,8 @@ def main(rank: int, world_size: int, args): # Training loop for epoch in range(args.epochs): - fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml - ) + train_loss, actual_lr = fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") if rank == 0: # Validation loop at the end of each epoch @@ -449,22 +447,24 @@ def main(rank: int, world_size: int, args): model, val_loader, batch_transforms, val_metric, args, amp=args.amp ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss if args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}_epoch{epoch + 1}.pt") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "recall": recall, "precision": precision, "mean_iou": mean_iou, @@ -475,13 +475,15 @@ def main(rank: int, world_size: int, args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) - logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch) - logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) + logger.report_scalar(title="Recall", series="recall", value=recall, iteration=epoch) + logger.report_scalar(title="Precision", series="precision", value=precision, iteration=epoch) logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if rank == 0: diff --git a/references/detection/train_tensorflow.py b/references/detection/train_tensorflow.py index fb7d78b48..dde8d5eb5 100644 --- a/references/detection/train_tensorflow.py +++ b/references/detection/train_tensorflow.py @@ -96,15 +96,11 @@ def apply_grads(optimizer, grads, model): optimizer.apply_gradients(zip(grads, model.trainable_weights)) -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, log=None): train_iter = iter(train_loader) # Iterate over the batches of the dataset - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - - pbar = tqdm(train_iter, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -115,22 +111,27 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, c grads = optimizer.get_unscaled_gradients(grads) apply_grads(optimizer, grads, model) - pbar.set_description(f"Training loss: {train_loss.numpy():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.numpy(), iteration=iteration - ) - iteration += 1 + last_lr = optimizer.learning_rate.numpy().item() + train_loss = train_loss.numpy().item() + + pbar.set_description(f"Training loss: {train_loss:.6} | LR: {last_lr:.6}") + log(train_loss=train_loss, lr=last_lr) + + epoch_train_loss += train_loss + batch_cnt += 1 + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr -def evaluate(model, val_loader, batch_transforms, val_metric): + +def evaluate(model, val_loader, batch_transforms, val_metric, log=None): # Reset val metric val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, target=targets, training=False, return_preds=True) # Compute metric @@ -142,6 +143,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric): boxes_pred = np.concatenate((boxes_pred[:, :4].min(axis=1), boxes_pred[:, :4].max(axis=1)), axis=-1) val_metric.update(gts=boxes_gt, preds=boxes_pred[:, :4]) + pbar.set_description(f"Validation loss: {out['loss'].numpy():.6}") + log(val_loss=out["loss"].numpy()) + val_loss += out["loss"].numpy() batch_cnt += 1 @@ -151,7 +155,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -188,7 +193,7 @@ def main(args): shuffle=False, drop_last=False, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) with open(os.path.join(args.val_path, "labels.json"), "rb") as f: @@ -214,9 +219,9 @@ def main(args): val_metric = LocalizationConfusion(use_polygons=args.rotation and not args.eval_straight) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) - print( + pbar.write( f"Validation loss: {val_loss:.6} (Recall: {recall:.2%} | Precision: {precision:.2%} | " f"Mean IoU: {mean_iou:.2%})" ) @@ -283,7 +288,7 @@ def main(args): shuffle=True, drop_last=True, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) with open(os.path.join(args.train_path, "labels.json"), "rb") as f: @@ -362,20 +367,61 @@ def main(args): "rotation": args.rotation, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb run = wandb.init(name=exp_name, project="text-detection", config=config) + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) + # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/text-detection", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter if args.freeze_backbone: for layer in model.feat_extractor.layers: @@ -387,26 +433,34 @@ def main(args): # Training loop for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml) + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, args.amp, log=log_at_step + ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") + # Validation loop at the end of each epoch - val_loss, recall, precision, mean_iou = evaluate(model, val_loader, batch_transforms, val_metric) + val_loss, recall, precision, mean_iou = evaluate( + model, val_loader, batch_transforms, val_metric, log=log_at_step + ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss if args.save_interval_epoch: - print(f"Saving state at epoch: {epoch + 1}") + pbar.write(f"Saving state at epoch: {epoch + 1}") model.save_weights(Path(args.output_dir) / f"{exp_name}_{epoch + 1}.weights.h5") log_msg = f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " if any(val is None for val in (recall, precision, mean_iou)): log_msg += "(Undefined metric value, caused by empty GTs or predictions)" else: log_msg += f"(Recall: {recall:.2%} | Precision: {precision:.2%} | Mean IoU: {mean_iou:.2%})" - print(log_msg) + pbar.write(log_msg) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "recall": recall, "precision": precision, "mean_iou": mean_iou, @@ -417,13 +471,15 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) - logger.report_scalar(title="Precision Recall", series="recall", value=recall, iteration=epoch) - logger.report_scalar(title="Precision Recall", series="precision", value=precision, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) + logger.report_scalar(title="Recall", series="recall", value=recall, iteration=epoch) + logger.report_scalar(title="Precision", series="precision", value=precision, iteration=epoch) logger.report_scalar(title="Mean IoU", series="mean_iou", value=mean_iou, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break if args.wb: run.finish() diff --git a/references/recognition/evaluate_pytorch.py b/references/recognition/evaluate_pytorch.py index 7025c5448..99baaefe8 100644 --- a/references/recognition/evaluate_pytorch.py +++ b/references/recognition/evaluate_pytorch.py @@ -13,7 +13,11 @@ import torch from torch.utils.data import DataLoader, SequentialSampler from torchvision.transforms import Normalize -from tqdm import tqdm + +if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"): + from tqdm.contrib.slack import tqdm +else: + from tqdm.auto import tqdm from doctr import datasets from doctr import transforms as T @@ -30,7 +34,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader) + for images, targets in pbar: try: if torch.cuda.is_available(): images = images.cuda() @@ -50,7 +55,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): val_loss += out["loss"].item() batch_cnt += 1 except ValueError: - print(f"unexpected symbol/s in targets:\n{targets} \n--> skip batch") + pbar.write(f"unexpected symbol/s in targets:\n{targets} \n--> skip batch") continue val_loss /= batch_cnt @@ -59,7 +64,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) torch.backends.cudnn.benchmark = True @@ -75,7 +81,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -106,7 +112,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=ds.collate_fn, ) - print(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") + pbar.write(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") mean, std = model.cfg["mean"], model.cfg["std"] batch_transforms = Normalize(mean=mean, std=std) @@ -124,14 +130,14 @@ def main(args): elif torch.cuda.is_available(): args.device = 0 else: - print("No accessible GPU, targe device set to CPU.") + pbar.write("No accessible GPU, target device set to CPU.") if torch.cuda.is_available(): torch.cuda.set_device(args.device) model = model.cuda() - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, test_loader, batch_transforms, val_metric, amp=args.amp) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") def parse_args(): diff --git a/references/recognition/evaluate_tensorflow.py b/references/recognition/evaluate_tensorflow.py index cea96ace1..f49b83aa5 100644 --- a/references/recognition/evaluate_tensorflow.py +++ b/references/recognition/evaluate_tensorflow.py @@ -16,7 +16,11 @@ import tensorflow as tf from tensorflow.keras import mixed_precision -from tqdm import tqdm + +if os.getenv("TQDM_SLACK_TOKEN") and os.getenv("TQDM_SLACK_CHANNEL"): + from tqdm.contrib.slack import tqdm +else: + from tqdm.auto import tqdm gpu_devices = tf.config.list_physical_devices("GPU") if any(gpu_devices): @@ -35,7 +39,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): # Validation loop val_loss, batch_cnt = 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter) + for images, targets in pbar: try: images = batch_transforms(images) out = model(images, target=targets, return_preds=True, training=False) @@ -49,7 +54,7 @@ def evaluate(model, val_loader, batch_transforms, val_metric): val_loss += out["loss"].numpy().mean() batch_cnt += 1 except ValueError: - print(f"unexpected symbol/s in targets:\n{targets} \n--> skip batch") + pbar.write(f"unexpected symbol/s in targets:\n{targets} \n--> skip batch") continue val_loss /= batch_cnt @@ -58,7 +63,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) # AMP if args.amp: @@ -99,7 +105,7 @@ def main(args): drop_last=False, shuffle=False, ) - print(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") + pbar.write(f"Test set loaded in {time.time() - st:.4}s ({len(ds)} samples in {len(test_loader)} batches)") mean, std = model.cfg["mean"], model.cfg["std"] batch_transforms = T.Normalize(mean=mean, std=std) @@ -107,9 +113,9 @@ def main(args): # Metrics val_metric = TextMatch() - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, test_loader, batch_transforms, val_metric) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") def parse_args(): diff --git a/references/recognition/train_pytorch.py b/references/recognition/train_pytorch.py index 70a841dce..f362d7295 100644 --- a/references/recognition/train_pytorch.py +++ b/references/recognition/train_pytorch.py @@ -109,18 +109,14 @@ def record_lr( return lr_recorder[: len(loss_recorder)], loss_recorder -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, amp=False, log=None): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() @@ -146,25 +142,28 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, scheduler, a optimizer.step() scheduler.step() + last_lr = scheduler.get_last_lr()[0] + + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + log(train_loss=train_loss.item(), lr=last_lr) - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + epoch_train_loss += train_loss.item() + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() -def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): +def evaluate(model, val_loader, batch_transforms, val_metric, amp=False, log=None): # Model in eval mode model.eval() # Reset val metric val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: if torch.cuda.is_available(): images = images.cuda() images = batch_transforms(images) @@ -180,6 +179,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + log(val_loss=out["loss"].item()) + val_loss += out["loss"].item() batch_cnt += 1 @@ -189,7 +191,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric, amp=False): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -238,7 +241,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -247,7 +250,7 @@ def main(args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -270,9 +273,9 @@ def main(args): val_metric = TextMatch() if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -339,7 +342,7 @@ def main(args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if args.show_samples: x, target = next(iter(train_loader)) @@ -403,24 +406,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb - run = wandb.init( - name=exp_name, - project="text-recognition", - config=config, - ) + run = wandb.init(name=exp_name, project="text-recognition", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Create loss queue min_loss = np.inf @@ -428,24 +468,29 @@ def main(args): if args.early_stop: early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) for epoch in range(args.epochs): - fit_one_epoch( - model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, log=log_at_step ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric, amp=args.amp) + val_loss, exact_match, partial_match = evaluate( + model, val_loader, batch_transforms, val_metric, amp=args.amp, log=log_at_step + ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "exact_match": exact_match, "partial_match": partial_match, }) @@ -455,13 +500,16 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) - logger.report_scalar(title="Partial Match", series="partial_match", value=exact_match, iteration=epoch) + logger.report_scalar(title="Partial Match", series="partial_match", value=partial_match, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish() diff --git a/references/recognition/train_pytorch_ddp.py b/references/recognition/train_pytorch_ddp.py index 3d1b95b56..df3be466d 100644 --- a/references/recognition/train_pytorch_ddp.py +++ b/references/recognition/train_pytorch_ddp.py @@ -42,18 +42,14 @@ from utils import EarlyStopper, plot_samples -def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, scheduler, amp=False, clearml_log=False): +def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, scheduler, amp=False): if amp: scaler = torch.cuda.amp.GradScaler() model.train() - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_loader, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_loader, dynamic_ncols=True) for images, targets in pbar: images = images.to(device) images = batch_transforms(images) @@ -78,14 +74,14 @@ def fit_one_epoch(model, device, train_loader, batch_transforms, optimizer, sche optimizer.step() scheduler.step() + last_lr = scheduler.get_last_lr()[0] - pbar.set_description(f"Training loss: {train_loss.item():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.item(), iteration=iteration - ) - iteration += 1 + pbar.set_description(f"Training loss: {train_loss.item():.6} | LR: {last_lr:.6}") + epoch_train_loss += train_loss.item() + batch_cnt += 1 + + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr @torch.no_grad() @@ -96,7 +92,8 @@ def evaluate(model, device, val_loader, batch_transforms, val_metric, amp=False) val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 - for images, targets in tqdm(val_loader): + pbar = tqdm(val_loader, dynamic_ncols=True) + for images, targets in pbar: images = images.to(device) images = batch_transforms(images) if amp: @@ -111,6 +108,8 @@ def evaluate(model, device, val_loader, batch_transforms, val_metric, amp=False) words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].item():.6}") + val_loss += out["loss"].item() batch_cnt += 1 @@ -126,7 +125,8 @@ def main(rank: int, world_size: int, args): world_size (int): number of processes participating in the job args: other arguments passed through the CLI """ - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if rank == 0 and args.push_to_hub: login_to_hub() @@ -176,7 +176,9 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=val_set.collate_fn, ) - print(f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)") + pbar.write( + f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {len(val_loader)} batches)" + ) batch_transforms = Normalize(mean=(0.694, 0.695, 0.693), std=(0.299, 0.296, 0.301)) @@ -185,7 +187,7 @@ def main(rank: int, world_size: int, args): # Resume weights if isinstance(args.resume, str): - print(f"Resuming {args.resume}") + pbar.write(f"Resuming {args.resume}") checkpoint = torch.load(args.resume, map_location="cpu") model.load_state_dict(checkpoint) @@ -207,11 +209,11 @@ def main(rank: int, world_size: int, args): val_metric = TextMatch() if rank == 0 and args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate( model, device, val_loader, batch_transforms, val_metric, amp=args.amp ) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -278,7 +280,7 @@ def main(rank: int, world_size: int, args): pin_memory=torch.cuda.is_available(), collate_fn=train_set.collate_fn, ) - print(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") + pbar.write(f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {len(train_loader)} batches)") if rank == 0 and args.show_samples: x, target = next(iter(train_loader)) @@ -347,8 +349,6 @@ def main(rank: int, world_size: int, args): task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 # Create loss queue min_loss = np.inf @@ -356,9 +356,10 @@ def main(rank: int, world_size: int, args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) # Training loop for epoch in range(args.epochs): - fit_one_epoch( - model, device, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp, clearml_log=args.clearml + train_loss, actual_lr = fit_one_epoch( + model, device, train_loader, batch_transforms, optimizer, scheduler, amp=args.amp ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") if rank == 0: # Validation loop at the end of each epoch @@ -369,17 +370,19 @@ def main(rank: int, world_size: int, args): # All processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes. # Therefore, saving it in one process is sufficient. - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") torch.save(model.module.state_dict(), Path(args.output_dir) / f"{exp_name}.pt") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "exact_match": exact_match, "partial_match": partial_match, }) @@ -389,15 +392,18 @@ def main(rank: int, world_size: int, args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) logger.report_scalar( title="Partial Match", series="partial_match", value=partial_match, iteration=epoch ) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if rank == 0: if args.wb: run.finish() diff --git a/references/recognition/train_tensorflow.py b/references/recognition/train_tensorflow.py index d9748a74f..610ea4ed6 100644 --- a/references/recognition/train_tensorflow.py +++ b/references/recognition/train_tensorflow.py @@ -96,15 +96,11 @@ def apply_grads(optimizer, grads, model): optimizer.apply_gradients(zip(grads, model.trainable_weights)) -def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, clearml_log=False): +def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, log=None): train_iter = iter(train_loader) - if clearml_log: - from clearml import Logger - - logger = Logger.current_logger() - # Iterate over the batches of the dataset - pbar = tqdm(train_iter, position=1) + epoch_train_loss, batch_cnt = 0, 0 + pbar = tqdm(train_iter, dynamic_ncols=True) for images, targets in pbar: images = batch_transforms(images) @@ -115,22 +111,27 @@ def fit_one_epoch(model, train_loader, batch_transforms, optimizer, amp=False, c grads = optimizer.get_unscaled_gradients(grads) apply_grads(optimizer, grads, model) - pbar.set_description(f"Training loss: {train_loss.numpy().mean():.6}") - if clearml_log: - global iteration - logger.report_scalar( - title="Training Loss", series="train_loss", value=train_loss.numpy().mean(), iteration=iteration - ) - iteration += 1 + last_lr = optimizer.learning_rate.numpy().item() + train_loss = train_loss.numpy().mean() + pbar.set_description(f"Training loss: {train_loss:.6} | LR: {last_lr:.6}") + log(train_loss=train_loss, lr=last_lr) + + epoch_train_loss += train_loss + batch_cnt += 1 -def evaluate(model, val_loader, batch_transforms, val_metric): + epoch_train_loss /= batch_cnt + return epoch_train_loss, last_lr + + +def evaluate(model, val_loader, batch_transforms, val_metric, log=None): # Reset val metric val_metric.reset() # Validation loop val_loss, batch_cnt = 0, 0 val_iter = iter(val_loader) - for images, targets in tqdm(val_iter): + pbar = tqdm(val_iter, dynamic_ncols=True) + for images, targets in pbar: images = batch_transforms(images) out = model(images, target=targets, return_preds=True, training=False) # Compute metric @@ -140,6 +141,9 @@ def evaluate(model, val_loader, batch_transforms, val_metric): words = [] val_metric.update(targets, words) + pbar.set_description(f"Validation loss: {out['loss'].numpy().mean():.6}") + log(val_loss=out["loss"].numpy().mean()) + val_loss += out["loss"].numpy().mean() batch_cnt += 1 @@ -149,7 +153,8 @@ def evaluate(model, val_loader, batch_transforms, val_metric): def main(args): - print(args) + pbar = tqdm(disable=True) + pbar.write(str(args)) if args.push_to_hub: login_to_hub() @@ -195,7 +200,7 @@ def main(args): shuffle=False, drop_last=False, ) - print( + pbar.write( f"Validation set loaded in {time.time() - st:.4}s ({len(val_set)} samples in {val_loader.num_batches} batches)" ) @@ -217,9 +222,9 @@ def main(args): ]) if args.test_only: - print("Running evaluation") + pbar.write("Running evaluation") val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric) - print(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") + pbar.write(f"Validation loss: {val_loss:.6} (Exact: {exact_match:.2%} | Partial: {partial_match:.2%})") return st = time.time() @@ -287,7 +292,7 @@ def main(args): shuffle=True, drop_last=True, ) - print( + pbar.write( f"Train set loaded in {time.time() - st:.4}s ({len(train_set)} samples in {train_loader.num_batches} batches)" ) @@ -364,24 +369,61 @@ def main(args): "pretrained": args.pretrained, } + global global_step + global_step = 0 # Shared global step counter + # W&B if args.wb: import wandb - run = wandb.init( - name=exp_name, - project="text-recognition", - config=config, - ) + run = wandb.init(name=exp_name, project="text-recognition", config=config) + + def wandb_log_at_step(train_loss=None, val_loss=None, lr=None): + wandb.log({ + **({"train_loss_step": train_loss} if train_loss is not None else {}), + **({"val_loss_step": val_loss} if val_loss is not None else {}), + **({"step_lr": lr} if lr is not None else {}), + }) # ClearML if args.clearml: - from clearml import Task + from clearml import Logger, Task task = Task.init(project_name="docTR/text-recognition", task_name=exp_name, reuse_last_task_id=False) task.upload_artifact("config", config) - global iteration - iteration = 0 + + def clearml_log_at_step(train_loss=None, val_loss=None, lr=None): + logger = Logger.current_logger() + if train_loss is not None: + logger.report_scalar( + title="Training Step Loss", + series="train_loss_step", + iteration=global_step, + value=train_loss, + ) + if val_loss is not None: + logger.report_scalar( + title="Validation Step Loss", + series="val_loss_step", + iteration=global_step, + value=val_loss, + ) + if lr is not None: + logger.report_scalar( + title="Step Learning Rate", + series="step_lr", + iteration=global_step, + value=lr, + ) + + # Unified logger + def log_at_step(train_loss=None, val_loss=None, lr=None): + global global_step + if args.wb: + wandb_log_at_step(train_loss, val_loss, lr) + if args.clearml: + clearml_log_at_step(train_loss, val_loss, lr) + global_step += 1 # Increment the shared global step counter # Backbone freezing if args.freeze_backbone: @@ -393,22 +435,29 @@ def main(args): early_stopper = EarlyStopper(patience=args.early_stop_epochs, min_delta=args.early_stop_delta) # Training loop for epoch in range(args.epochs): - fit_one_epoch(model, train_loader, batch_transforms, optimizer, args.amp, args.clearml) + train_loss, actual_lr = fit_one_epoch( + model, train_loader, batch_transforms, optimizer, args.amp, log=log_at_step + ) + pbar.write(f"Epoch {epoch + 1}/{args.epochs} - Training loss: {train_loss:.6} | LR: {actual_lr:.6}") # Validation loop at the end of each epoch - val_loss, exact_match, partial_match = evaluate(model, val_loader, batch_transforms, val_metric) + val_loss, exact_match, partial_match = evaluate( + model, val_loader, batch_transforms, val_metric, log=log_at_step + ) if val_loss < min_loss: - print(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") + pbar.write(f"Validation loss decreased {min_loss:.6} --> {val_loss:.6}: saving state...") model.save_weights(Path(args.output_dir) / f"{exp_name}.weights.h5") min_loss = val_loss - print( + pbar.write( f"Epoch {epoch + 1}/{args.epochs} - Validation loss: {val_loss:.6} " f"(Exact: {exact_match:.2%} | Partial: {partial_match:.2%})" ) # W&B if args.wb: wandb.log({ + "train_loss": train_loss, "val_loss": val_loss, + "learning_rate": actual_lr, "exact_match": exact_match, "partial_match": partial_match, }) @@ -418,13 +467,16 @@ def main(args): from clearml import Logger logger = Logger.current_logger() + logger.report_scalar(title="Training Loss", series="train_loss", value=train_loss, iteration=epoch) logger.report_scalar(title="Validation Loss", series="val_loss", value=val_loss, iteration=epoch) + logger.report_scalar(title="Learning Rate", series="lr", value=actual_lr, iteration=epoch) logger.report_scalar(title="Exact Match", series="exact_match", value=exact_match, iteration=epoch) logger.report_scalar(title="Partial Match", series="partial_match", value=partial_match, iteration=epoch) if args.early_stop and early_stopper.early_stop(val_loss): - print("Training halted early due to reaching patience limit.") + pbar.write("Training halted early due to reaching patience limit.") break + if args.wb: run.finish()