From fa5e0ecec4d596966d1074afca70f00281eb27e5 Mon Sep 17 00:00:00 2001 From: Chris Olszewski Date: Thu, 19 Oct 2023 15:06:53 -0700 Subject: [PATCH] fix: bias towards graph walk cancel over continue (#6210) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description This PR does two things: - Shuts down process manager in all the various failure modes - Changes walker code to try to favor canceling the graph walk over continuing the graph walk. These changes come down to usage of [`biased;`](https://docs.rs/tokio/latest/tokio/macro.select.html#fairness). There's some future work that needs to be done as I also discovered we have an issue where we can accidentally spawn a child process if the `manager.spawn()` call ends up getting delayed enough that it happens [after this line in `manager.close`](https://github.com/vercel/turbo/blob/main/crates/turborepo-lib/src/process/mod.rs#L122). ### Testing Instructions Sadly, I haven't been able to get a red/green unit test so I need to rely on manual testing. #### Setup - `npx create-turbo@canary testing pnpm` - `pnpm rm turbo` - Add failing `build` script to `ui`: ``` [1 olszewski@chriss-mbp] /tmp/testing $ git diff diff --git a/packages/ui/package.json b/packages/ui/package.json index 689c3ee..00ebf8c 100644 --- a/packages/ui/package.json +++ b/packages/ui/package.json @@ -5,6 +5,7 @@ "types": "./index.tsx", "license": "MIT", "scripts": { + "build": "echo 'failing'; exit 1", "lint": "eslint .", "generate:component": "turbo gen react-component" }, ``` #### Testing On main, build a binary (`cargo build -p turbo`) and then use it to run the failing tasks. You can run it a few times and you should see some non-deterministic behavior which some of the time is incorrect where tasks are `web#build` or `docs#build` are run even after the dependency `ui#build` failed: ``` [0 olszewski@chriss-mbp] /tmp/testing $ turbo_dev build --experimental-rust-codepath ui:build: cache miss, executing ef675823628c01fe ui:build: ui:build: > ui@0.0.0 build /private/tmp/testing/packages/ui ui:build: > echo 'failing'; exit 1 ui:build: ui:build: failing ui:build:  ELIFECYCLE  Command failed with exit code 1. ui:build: ERROR: command finished with error: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) ui#build: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) [1 olszewski@chriss-mbp] /tmp/testing $ turbo_dev build --experimental-rust-codepath ui:build: cache miss, executing ef675823628c01fe ui:build: ui:build: > ui@0.0.0 build /private/tmp/testing/packages/ui ui:build: > echo 'failing'; exit 1 ui:build: ui:build: failing ui:build:  ELIFECYCLE  Command failed with exit code 1. ui:build: ERROR: command finished with error: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) docs:build: cache hit (outputs already on disk), replaying logs 804a78e82dc8e3c9 docs:build: docs:build: > docs@1.0.0 build /private/tmp/testing/apps/docs docs:build: > next build docs:build: docs:build: - info Creating an optimized production build... docs:build: - info Compiled successfully docs:build: - info Linting and checking validity of types... docs:build: - info Collecting page data... docs:build: - info Generating static pages (0/4) docs:build: - info Generating static pages (1/4) docs:build: - info Generating static pages (2/4) docs:build: - info Generating static pages (3/4) docs:build: - info Generating static pages (4/4) docs:build: - info Finalizing page optimization... docs:build: docs:build: Route (app) Size First Load JS docs:build: ┌ ○ / 5.52 kB 84 kB docs:build: └ ○ /favicon.ico 0 B 0 B docs:build: + First Load JS shared by all 78.5 kB docs:build: ├ chunks/934-196dcc5a61008b80.js 26.1 kB docs:build: ├ chunks/c260e7fb-1dc88cd74c938f5d.js 50.5 kB docs:build: ├ chunks/main-app-dc37cd09df02200e.js 219 B docs:build: └ chunks/webpack-46498be4babc7638.js 1.68 kB docs:build: docs:build: Route (pages) Size First Load JS docs:build: ─ ○ /404 182 B 76.5 kB docs:build: + First Load JS shared by all 76.3 kB docs:build: ├ chunks/framework-eb124dc7acb3bb04.js 45.1 kB docs:build: ├ chunks/main-15364e85b6f1124e.js 29.4 kB docs:build: ├ chunks/pages/_app-82ff52170628f1f6.js 191 B docs:build: └ chunks/webpack-46498be4babc7638.js 1.68 kB docs:build: docs:build: ○ (Static) automatically rendered as static HTML (uses no initial props) docs:build: ui#build: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) ``` Now checkout this PR and build a `turbo` binary. Running `turbo build` in the repo we set up should now be deterministic and only `ui#build` will be run. ``` [1 olszewski@chriss-mbp] /tmp/testing $ turbo_dev build --experimental-rust-codepath ui:build: cache miss, executing ef675823628c01fe ui:build: ui:build: > ui@0.0.0 build /private/tmp/testing/packages/ui ui:build: > echo 'failing'; exit 1 ui:build: ui:build: failing ui:build:  ELIFECYCLE  Command failed with exit code 1. ui:build: ERROR: command finished with error: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) ui#build: command (/private/tmp/testing/packages/ui) pnpm run build exited (1) ``` Closes TURBO-1489 --------- Co-authored-by: Chris Olszewski --- crates/turborepo-lib/src/graph/walker.rs | 15 +++++++++------ crates/turborepo-lib/src/task_graph/visitor.rs | 6 +++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/crates/turborepo-lib/src/graph/walker.rs b/crates/turborepo-lib/src/graph/walker.rs index 5181d734140cc..9821d4989378b 100644 --- a/crates/turborepo-lib/src/graph/walker.rs +++ b/crates/turborepo-lib/src/graph/walker.rs @@ -62,6 +62,15 @@ impl Walker { let deps_fut = join_all(deps_rx.iter_mut().map(|rx| rx.recv())); tokio::select! { + // If both the cancel and dependencies are ready, we want to + // execute the cancel instead of sending an additional node. + biased; + _ = cancel_rx.changed() => { + // If this future resolves this means that either: + // - cancel was updated, this can only happen through + // the cancel method which only sets it to true + // - the cancel sender was dropped which is also a sign that we should exit + } results = deps_fut => { for res in results { match res { @@ -102,12 +111,6 @@ impl Walker { // happens when this node has no dependents tx.send(()).ok(); } - _ = cancel_rx.changed() => { - // If this future resolves this means that either: - // - cancel was updated, this can only happen through - // the cancel method which only sets it to true - // - the cancel sender was dropped which is also a sign that we should exit - } } })); } diff --git a/crates/turborepo-lib/src/task_graph/visitor.rs b/crates/turborepo-lib/src/task_graph/visitor.rs index 2b1bc3769a11e..268a1d305ac7a 100644 --- a/crates/turborepo-lib/src/task_graph/visitor.rs +++ b/crates/turborepo-lib/src/task_graph/visitor.rs @@ -276,6 +276,7 @@ impl<'a> Visitor<'a> { Ok(w) => w, Err(e) => { error!("failed to capture outputs for \"{task_id}\": {e}"); + manager.stop().await; callback.send(Err(StopExecution)).ok(); return; } @@ -295,6 +296,7 @@ impl<'a> Visitor<'a> { .send(if continue_on_error { Ok(()) } else { + manager.stop().await; Err(StopExecution) }) .ok(); @@ -323,6 +325,7 @@ impl<'a> Visitor<'a> { // exit status with Some and it is only initialized with // None. Is it still running? error!("unable to determine why child exited"); + manager.stop().await; callback.send(Err(StopExecution)).ok(); return; } @@ -343,8 +346,8 @@ impl<'a> Visitor<'a> { callback.send(Ok(())).ok(); } else { prefixed_ui.error(format!("command finished with error: {error}")); - callback.send(Err(StopExecution)).ok(); manager.stop().await; + callback.send(Err(StopExecution)).ok(); } errors.lock().expect("lock poisoned").push(TaskError { task_id: task_id_for_display.clone(), @@ -357,6 +360,7 @@ impl<'a> Visitor<'a> { | ChildExit::Killed | ChildExit::KilledExternal | ChildExit::Failed => { + manager.stop().await; callback.send(Err(StopExecution)).ok(); return; }